diff --git a/CMakeLists.txt b/CMakeLists.txt
index b576242d8..d4f734933 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,8 +125,13 @@ set(CCL_INSTALL_KERNELS "${CMAKE_INSTALL_PREFIX}/lib/ccl/kernels")
 # setup dependency directories
 set(DEPS_DIR "${PROJECT_SOURCE_DIR}/deps")
 
-set(MPI_INCLUDE_DIR "${DEPS_DIR}/mpi/include/")
-set(MPI_LIB_DIR "${DEPS_DIR}/mpi/lib/")
+if ("${MPI_DIR}" STREQUAL "")
+    set(MPI_INCLUDE_DIR "${DEPS_DIR}/mpi/include/")
+    set(MPI_LIB_DIR "${DEPS_DIR}/mpi/lib/")
+else()
+    set(MPI_INCLUDE_DIR "${MPI_DIR}/include/")
+    set(MPI_LIB_DIR "${MPI_DIR}/lib/")
+endif()
 message(STATUS "MPI_INCLUDE_DIR: ${MPI_INCLUDE_DIR}")
 message(STATUS "MPI_LIB_DIR: ${MPI_LIB_DIR}")
 
@@ -302,8 +307,8 @@ file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/kernels.spv")
 endif()
 
 set(CCL_MAJOR_VERSION     "2021")
-set(CCL_MINOR_VERSION     "11")
-set(CCL_UPDATE_VERSION    "2")
+set(CCL_MINOR_VERSION     "12")
+set(CCL_UPDATE_VERSION    "0")
 set(CCL_PRODUCT_STATUS    "Gold")
 string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
 get_vcs_properties("git")
@@ -335,13 +340,12 @@ if (ENABLE_MPI_TESTS)
         add_subdirectory(examples/benchmark)
         add_subdirectory(examples/common)
         add_subdirectory(examples/cpu)
-        if (BUILD_CONFIG)
+        add_subdirectory(examples/pt2pt)
+        if (BUILD_CONFIG)
             add_subdirectory(examples/external_launcher)
         endif()
         if (CCL_ENABLE_SYCL)
             add_subdirectory(examples/sycl)
-            #TODO: add cpu support
-            add_subdirectory(examples/pt2pt)
         endif()
     endif()
     if (BUILD_FT)
diff --git a/README.md b/README.md
index 21091bf47..f4186f3f6 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ oneCCL is part of [oneAPI](https://oneapi.io).
 - [Installation](#installation)
 - [Usage](#usage)
   - [Launching Example Application](#launching-example-application)
+    - [Using external MPI](#using-external-mpi)
   - [Setting workers affinity](#setting-workers-affinity)
     - [Automatic setup](#automatic-setup)
     - [Explicit setup](#explicit-setup)
@@ -73,6 +74,19 @@ Use the command:
 $ source <install_dir>/env/setvars.sh
 $ mpirun -n 2 <install_dir>/examples/benchmark/benchmark
 ```
+
+#### Using external MPI
+
+The `--ccl-bundled-mpi` flag in `vars.sh` can take the values "yes" or "no" to control whether the bundled Intel MPI should be used. The current default is "yes", which means that oneCCL temporarily overrides the MPI implementation in use.
+
+To override this behavior and use a user-supplied or system-default MPI, run the following command *instead* of sourcing `setvars.sh`:
+
+```bash
+$ source <install_dir>/env/vars.sh --ccl-bundled-mpi=no
+```
+
+The MPI implementation will not be overridden. Note that, in this case, the user needs to ensure that the system can find all required MPI-related binaries.
+
 ### Setting workers affinity
 
 There are two ways to set worker threads (workers) affinity: [automatically](#setting-affinity-automatically) and [explicitly](#setting-affinity-explicitly).
diff --git a/cmake/setvars.sh.in b/cmake/setvars.sh.in
index 4892b9952..5840905af 100644
--- a/cmake/setvars.sh.in
+++ b/cmake/setvars.sh.in
@@ -94,7 +94,8 @@ fi
 
 WORK_DIR=$(get_script_path "${vars_script_name:-}")
 CCL_ROOT="$(cd "${WORK_DIR}"/../; pwd -P)"; export CCL_ROOT
-export I_MPI_ROOT="${CCL_ROOT}"
+export I_MPI_ROOT="${CCL_ROOT}/opt/mpi"
+export SETVARS_CALL=1
 
 source ${CCL_ROOT}/env/vars.sh ${1:-}
 
@@ -104,5 +105,12 @@ then
 else
     PATH="${CCL_ROOT}/bin:${PATH}"; export PATH
 fi
-LD_LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/opt/mpi/libfabric/lib" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH
-FI_PROVIDER_PATH="${I_MPI_ROOT}/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric"; export FI_PROVIDER_PATH
+LD_LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/libfabric/lib" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH
+FI_PROVIDER_PATH="${I_MPI_ROOT}/libfabric/lib/prov:/usr/lib64/libfabric"; export FI_PROVIDER_PATH
+
+CPATH=$(prepend_path "${I_MPI_ROOT}/include" "${CPATH:-}"); export CPATH
+LD_LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/lib" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH
+LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/lib" "${LIBRARY_PATH:-}"); export LIBRARY_PATH
+PATH="${I_MPI_ROOT}/bin:${PATH}"; export PATH
+
+unset -v SETVARS_CALL
diff --git a/cmake/vars.sh.in b/cmake/vars.sh.in
index bb761c68b..559852d44 100644
--- a/cmake/vars.sh.in
+++ b/cmake/vars.sh.in
@@ -16,6 +16,8 @@
 #
 # shellcheck shell=sh
 
+DEFAULT_BUNDLED_MPI="yes"
+
 get_script_path() (
     script="$1"
     while [ -L "$script" ] ; do
@@ -112,3 +114,36 @@ CPATH=$(prepend_path "${CCL_ROOT}/include" "${CPATH:-}"); export CPATH
 CMAKE_PREFIX_PATH=$(prepend_path "${CCL_ROOT}/lib/cmake/oneCCL" "${CMAKE_PREFIX_PATH:-}"); export CMAKE_PREFIX_PATH
 LIBRARY_PATH=$(prepend_path "${CCL_ROOT}/lib" "${LIBRARY_PATH:-}"); export LIBRARY_PATH
 LD_LIBRARY_PATH=$(prepend_path "${CCL_ROOT}/lib" "${LD_LIBRARY_PATH:-}"); export LD_LIBRARY_PATH
+
+args=$*
+for arg in $args
+do
+    case "$arg" in
+        --ccl-bundled-mpi=*)
+            ccl_bundled_mpi="${arg#*=}"
+            ;;
+    esac
+done
+
+if [ -z "${SETVARS_CALL:-}" ] ; then
+    if [ -z "${ccl_bundled_mpi:-}" ]; then
+        ccl_bundled_mpi="${DEFAULT_BUNDLED_MPI}"
+    elif [ "$ccl_bundled_mpi" != "yes" ] && [ "$ccl_bundled_mpi" != "no" ]; then
+        echo ":: WARNING: ccl_bundled_mpi=${ccl_bundled_mpi} is unrecognized."
+        echo ":: ccl_bundled_mpi will be set to ${DEFAULT_BUNDLED_MPI}"
+        ccl_bundled_mpi="${DEFAULT_BUNDLED_MPI}"
+    fi
+
+    if [ "$ccl_bundled_mpi" = "yes" ] ; then
+        export I_MPI_ROOT="${CCL_ROOT}/opt/mpi"
+        CPATH=$(prepend_path "${I_MPI_ROOT}/include" "${CPATH:-}"); export CPATH
+        LD_LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/lib" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH
+        LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/lib" "${LIBRARY_PATH:-}"); export LIBRARY_PATH
+        PATH="${I_MPI_ROOT}/bin:${PATH}"; export PATH
+    fi
+else
+    if [ ! -z "${ccl_bundled_mpi:-}" ]; then
+        echo ":: WARNING: ccl_bundled_mpi was specified for setvars.sh script"
+        echo ":: ccl_bundled_mpi is only supported by direct call vars.sh, ignoring"
+    fi
+fi
diff --git a/deps/hwloc/include/hwloc.h b/deps/hwloc/include/hwloc.h
index f58adec5e..1b759bc09 100644
--- a/deps/hwloc/include/hwloc.h
+++ b/deps/hwloc/include/hwloc.h
@@ -1,6 +1,6 @@
 /*
  * Copyright © 2009 CNRS
- * Copyright © 2009-2022 Inria.  All rights reserved.
+ * Copyright © 2009-2023 Inria.  All rights reserved.
  * Copyright © 2009-2012 Université Bordeaux
  * Copyright © 2009-2020 Cisco Systems, Inc.  All rights reserved.
  * See COPYING in top-level directory.
@@ -263,6 +263,11 @@ typedef enum {
 			  * This is the smallest object representing Memory resources,
 			  * it cannot have any child except Misc objects.
 			  * However it may have Memory-side cache parents.
+                          *
+                          * NUMA nodes may correspond to different kinds of memory
+                          * (DRAM, HBM, CXL-DRAM, etc.). When hwloc is able to guess
+                          * that kind, it is specified in the subtype field of the object.
+                          * See also \ref attributes_normal in the main documentation.
 			  *
 			  * There is always at least one such object in the topology
 			  * even if the machine is not NUMA.
@@ -338,6 +343,12 @@ typedef enum {
 
   HWLOC_OBJ_DIE,	/**< \brief Die within a physical package.
 			 * A subpart of the physical package, that contains multiple cores.
+			 *
+			 * Some operating systems (e.g. Linux) may expose a single die per package
+			 * even if the hardware does not support dies at all. To avoid showing
+			 * such non-existing dies, the corresponding hwloc backend may filter them out.
+			 * This is functionally equivalent to ::HWLOC_TYPE_FILTER_KEEP_STRUCTURE
+			 * being enforced.
 			 */
 
   HWLOC_OBJ_TYPE_MAX    /**< \private Sentinel value */
@@ -656,33 +667,48 @@ union hwloc_obj_attr_u {
   /** \brief PCI Device specific Object Attributes */
   struct hwloc_pcidev_attr_s {
 #ifndef HWLOC_HAVE_32BITS_PCI_DOMAIN
-    unsigned short domain; /* Only 16bits PCI domains are supported by default */
+    unsigned short domain; /**< \brief Domain number (xxxx in the PCI BDF notation xxxx:yy:zz.t).
+                            *   Only 16bits PCI domains are supported by default. */
 #else
-    unsigned int domain; /* 32bits PCI domain support break the library ABI, hence it's disabled by default */
+    unsigned int domain; /**< \brief Domain number   (xxxx in the PCI BDF notation xxxx:yy:zz.t).
+                          *   32bits PCI domain support break the library ABI, hence it's disabled by default. */
 #endif
-    unsigned char bus, dev, func;
-    unsigned short class_id;
-    unsigned short vendor_id, device_id, subvendor_id, subdevice_id;
-    unsigned char revision;
-    float linkspeed; /* in GB/s */
+    unsigned char bus;   /**< \brief Bus number      (yy   in the PCI BDF notation xxxx:yy:zz.t). */
+    unsigned char dev;   /**< \brief Device number   (zz   in the PCI BDF notation xxxx:yy:zz.t). */
+    unsigned char func;  /**< \brief Function number (t    in the PCI BDF notation xxxx:yy:zz.t). */
+    unsigned short class_id;  /**< \brief The class number (first two bytes, without the prog_if). */
+    unsigned short vendor_id;    /**< \brief Vendor ID (xxxx in [xxxx:yyyy]). */
+    unsigned short device_id;    /**< \brief Device ID (yyyy in [xxxx:yyyy]). */
+    unsigned short subvendor_id; /**< \brief Sub-Vendor ID. */
+    unsigned short subdevice_id; /**< \brief Sub-Device ID. */
+    unsigned char revision;   /**< \brief Revision number. */
+    float linkspeed; /**< \brief Link speed in GB/s.
+                      *   This datarate is the currently configured speed of the entire PCI link
+                      *   (sum of the bandwidth of all PCI lanes in that link).
+                      *   It may change during execution since some devices are able to
+                      *   slow their PCI links down when idle.
+                      */
   } pcidev;
   /** \brief Bridge specific Object Attributes */
   struct hwloc_bridge_attr_s {
     union {
-      struct hwloc_pcidev_attr_s pci;
+      struct hwloc_pcidev_attr_s pci; /**< \brief PCI attribute of the upstream part as a PCI device. */
     } upstream;
-    hwloc_obj_bridge_type_t upstream_type;
+    hwloc_obj_bridge_type_t upstream_type; /**< \brief Upstream Bridge type. */
     union {
       struct {
 #ifndef HWLOC_HAVE_32BITS_PCI_DOMAIN
-	unsigned short domain; /* Only 16bits PCI domains are supported by default */
+        unsigned short domain; /**< \brief Domain number the downstream PCI buses.
+                                *   Only 16bits PCI domains are supported by default. */
 #else
-	unsigned int domain; /* 32bits PCI domain support break the library ABI, hence it's disabled by default */
+        unsigned int domain;   /**< \brief Domain number the downstream PCI buses.
+	                        *   32bits PCI domain support break the library ABI, hence it's disabled by default */
 #endif
-	unsigned char secondary_bus, subordinate_bus;
+        unsigned char secondary_bus;   /**< \brief First PCI bus number below the bridge. */
+        unsigned char subordinate_bus; /**< \brief Highest PCI bus number below the bridge. */
       } pci;
     } downstream;
-    hwloc_obj_bridge_type_t downstream_type;
+    hwloc_obj_bridge_type_t downstream_type; /**< \brief Downstream Bridge type. */
     unsigned depth;
   } bridge;
   /** \brief OS Device specific Object Attributes */
@@ -1872,6 +1898,10 @@ HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len)
  * \note -1 is returned and errno is set to \c ENOSYS on platforms that do not
  * support this feature.
  *
+ * \note The PID will not actually be used until hwloc_topology_load().
+ * If the corresponding process exits in the meantime, hwloc will ignore the PID.
+ * If another process reuses the PID, the view of that process will be used.
+ *
  * \return 0 on success, -1 on error.
  */
 HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topology, hwloc_pid_t pid);
@@ -1935,15 +1965,20 @@ HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restric
  * \note On success, the XML component replaces the previously enabled
  * component (if any), but the topology is not actually modified until
  * hwloc_topology_load().
+ *
+ * \note If an invalid XML input file is given, the error may be reported
+ * either here or later by hwloc_topology_load() depending on the XML library
+ * used by hwloc.
  */
 HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict xmlpath);
 
 /** \brief Enable XML based topology using a memory buffer (instead of
  * a file, as with hwloc_topology_set_xml()).
  *
- * Gather topology information from the XML memory buffer given at \p
- * buffer and of length \p size.  This buffer may have been filled
- * earlier with hwloc_topology_export_xmlbuffer() in hwloc/export.h.
+ * Gather topology information from the XML memory buffer given at
+ * \p buffer and of length \p size (including an ending \0).
+ * This buffer may have been filled earlier with
+ * hwloc_topology_export_xmlbuffer() in hwloc/export.h.
  *
  * Note that this function does not actually load topology
  * information; it just tells hwloc where to load it from.  You'll
@@ -1964,6 +1999,10 @@ HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topo
  * \note On success, the XML component replaces the previously enabled
  * component (if any), but the topology is not actually modified until
  * hwloc_topology_load().
+ *
+ * \note If an invalid XML input file is given, the error may be reported
+ * either here or later by hwloc_topology_load() depending on the XML library
+ * used by hwloc.
  */
 HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict buffer, int size);
 
@@ -2171,9 +2210,10 @@ enum hwloc_topology_flags_e {
    */
   HWLOC_TOPOLOGY_FLAG_NO_DISTANCES = (1UL<<7),
 
-  /** \brief Ignore memory attributes.
+  /** \brief Ignore memory attributes and tiers.
    *
-   * Ignore memory attribues from the operating systems (and from XML).
+   * Ignore memory attribues from the operating systems (and from XML)
+   * Hence also do not try to build memory tiers.
    */
   HWLOC_TOPOLOGY_FLAG_NO_MEMATTRS = (1UL<<8),
 
@@ -2362,8 +2402,8 @@ HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(h
 /** \brief Type filtering flags.
  *
  * By default, most objects are kept (::HWLOC_TYPE_FILTER_KEEP_ALL).
- * Instruction caches, I/O and Misc objects are ignored by default (::HWLOC_TYPE_FILTER_KEEP_NONE).
- * Die and Group levels are ignored unless they bring structure (::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
+ * Instruction caches, memory-side caches, I/O and Misc objects are ignored by default (::HWLOC_TYPE_FILTER_KEEP_NONE).
+ * Group levels are ignored unless they bring structure (::HWLOC_TYPE_FILTER_KEEP_STRUCTURE).
  *
  * Note that group objects are also ignored individually (without the entire level)
  * when they do not bring structure.
@@ -2627,13 +2667,33 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t to
  * This function returns a new Group object.
  *
  * The caller should (at least) initialize its sets before inserting
- * the object in the topology. See hwloc_topology_insert_group_object().
+ * the object in the topology, see hwloc_topology_insert_group_object().
+ * Or it may decide not to insert and just free the group object
+ * by calling hwloc_topology_free_group_object().
  *
  * \return The allocated object on success.
  * \return \c NULL on error.
- */
+ *
+ * \note If successfully inserted by hwloc_topology_insert_group_object(),
+ * the object will be freed when the entire topology is freed.
+ * If insertion failed (e.g. \c NULL or empty CPU and node-sets),
+ * it is freed before returning the error.
+  */
 HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t topology);
 
+/** \brief Free a group object allocated with hwloc_topology_alloc_group_object().
+ *
+ * This function is only useful if the group object was not given
+ * to hwloc_topology_insert_group_object() as planned.
+ *
+ * \note \p topology must be the same as the one previously passed
+ * to hwloc_topology_alloc_group_object().
+ *
+ * \return \c 0 on success.
+ * \return \c -1 on error, for instance if an invalid topology is given.
+ */
+HWLOC_DECLSPEC int hwloc_topology_free_group_object(hwloc_topology_t topology, hwloc_obj_t group);
+
 /** \brief Add more structure to the topology by adding an intermediate Group
  *
  * The caller should first allocate a new Group object with hwloc_topology_alloc_group_object().
@@ -2671,6 +2731,14 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t to
  * hence the existing objects may get reordered (including PUs and NUMA nodes),
  * and their logical indexes may change.
  *
+ * \note If the insertion fails, the input group object is freed.
+ *
+ * \note If the group object should be discarded instead of inserted,
+ * it may be passed to hwloc_topology_free_group_object() instead.
+ *
+ * \note \p topology must be the same as the one previously passed
+ * to hwloc_topology_alloc_group_object().
+ *
  * \return The inserted object if it was properly inserted.
  *
  * \return An existing object if the Group was merged or discarded
diff --git a/deps/hwloc/include/hwloc/autogen/config.h b/deps/hwloc/include/hwloc/autogen/config.h
index 3d4238a87..c8c1b4572 100644
--- a/deps/hwloc/include/hwloc/autogen/config.h
+++ b/deps/hwloc/include/hwloc/autogen/config.h
@@ -12,11 +12,11 @@
 #ifndef HWLOC_CONFIG_H
 #define HWLOC_CONFIG_H
 
-#define HWLOC_VERSION "2.9.3rc2-git"
+#define HWLOC_VERSION "2.10.0rc3-git"
 #define HWLOC_VERSION_MAJOR 2
-#define HWLOC_VERSION_MINOR 9
-#define HWLOC_VERSION_RELEASE 3
-#define HWLOC_VERSION_GREEK "rc2"
+#define HWLOC_VERSION_MINOR 10
+#define HWLOC_VERSION_RELEASE 0
+#define HWLOC_VERSION_GREEK "rc3"
 
 /* #undef HWLOC_PCI_COMPONENT_BUILTIN */
 /* #undef HWLOC_OPENCL_COMPONENT_BUILTIN */
diff --git a/deps/hwloc/include/hwloc/diff.h b/deps/hwloc/include/hwloc/diff.h
index f7e6fb1e7..4d822434d 100644
--- a/deps/hwloc/include/hwloc/diff.h
+++ b/deps/hwloc/include/hwloc/diff.h
@@ -256,6 +256,11 @@ HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(const char *xmlpath, hwloc_topol
 HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
 
 /** \brief Load a list of topology differences from a XML buffer.
+ *
+ * Build a list of differences from the XML memory buffer given
+ * at \p xmlbuffer and of length \p buflen (including an ending \0).
+ * This buffer may have been filled earlier with
+ * hwloc_topology_diff_export_xmlbuffer().
  *
  * If not \c NULL, \p refname will be filled with the identifier
  * string of the reference topology for the difference file,
diff --git a/deps/hwloc/include/hwloc/helper.h b/deps/hwloc/include/hwloc/helper.h
index acd9ef782..01619c5fe 100644
--- a/deps/hwloc/include/hwloc/helper.h
+++ b/deps/hwloc/include/hwloc/helper.h
@@ -26,6 +26,86 @@ extern "C" {
 #endif
 
 
+/** \defgroup hwlocality_helper_types Kinds of object Type
+ * @{
+ *
+ * Each object type is
+ * either Normal (i.e. hwloc_obj_type_is_normal() returns 1),
+ * or Memory (i.e. hwloc_obj_type_is_memory() returns 1)
+ * or I/O (i.e. hwloc_obj_type_is_io() returns 1)
+ * or Misc (i.e. equal to ::HWLOC_OBJ_MISC).
+ * It cannot be of more than one of these kinds.
+ *
+ * See also Object Kind in \ref termsanddefs.
+ */
+
+/** \brief Check whether an object type is Normal.
+ *
+ * Normal objects are objects of the main CPU hierarchy
+ * (Machine, Package, Core, PU, CPU caches, etc.),
+ * but they are not NUMA nodes, I/O devices or Misc objects.
+ *
+ * They are attached to parent as Normal children,
+ * not as Memory, I/O or Misc children.
+ *
+ * \return 1 if an object of type \p type is a Normal object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_normal(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is I/O.
+ *
+ * I/O objects are objects attached to their parents
+ * in the I/O children list.
+ * This current includes Bridges, PCI and OS devices.
+ *
+ * \return 1 if an object of type \p type is a I/O object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_io(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is Memory.
+ *
+ * Memory objects are objects attached to their parents
+ * in the Memory children list.
+ * This current includes NUMA nodes and Memory-side caches.
+ *
+ * \return 1 if an object of type \p type is a Memory object, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_memory(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Cache (Data, Unified or Instruction).
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_cache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Data or Unified Cache.
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a CPU Data or Unified Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_dcache(hwloc_obj_type_t type);
+
+/** \brief Check whether an object type is a CPU Instruction Cache,
+ *
+ * Memory-side caches are not CPU caches.
+ *
+ * \return 1 if an object of type \p type is a CPU Instruction Cache, 0 otherwise.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_type_is_icache(hwloc_obj_type_t type);
+
+/** @} */
+
+
+
 /** \defgroup hwlocality_helper_find_inside Finding Objects inside a CPU set
  * @{
  */
@@ -504,9 +584,9 @@ hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_
   if (prev) {
     if (prev->type == HWLOC_OBJ_MISC)
       state = 3;
-    else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE)
+    else if (hwloc_obj_type_is_io(prev->type))
       state = 2;
-    else if (prev->type == HWLOC_OBJ_NUMANODE || prev->type == HWLOC_OBJ_MEMCACHE)
+    else if (hwloc_obj_type_is_memory(prev->type))
       state = 1;
     obj = prev->next_sibling;
   } else {
@@ -531,84 +611,6 @@ hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_
 
 
 
-/** \defgroup hwlocality_helper_types Kinds of object Type
- * @{
- *
- * Each object type is
- * either Normal (i.e. hwloc_obj_type_is_normal() returns 1),
- * or Memory (i.e. hwloc_obj_type_is_memory() returns 1)
- * or I/O (i.e. hwloc_obj_type_is_io() returns 1)
- * or Misc (i.e. equal to ::HWLOC_OBJ_MISC).
- * It cannot be of more than one of these kinds.
- */
-
-/** \brief Check whether an object type is Normal.
- *
- * Normal objects are objects of the main CPU hierarchy
- * (Machine, Package, Core, PU, CPU caches, etc.),
- * but they are not NUMA nodes, I/O devices or Misc objects.
- *
- * They are attached to parent as Normal children,
- * not as Memory, I/O or Misc children.
- *
- * \return 1 if an object of type \p type is a Normal object, 0 otherwise.
- */
-HWLOC_DECLSPEC int
-hwloc_obj_type_is_normal(hwloc_obj_type_t type);
-
-/** \brief Check whether an object type is I/O.
- *
- * I/O objects are objects attached to their parents
- * in the I/O children list.
- * This current includes Bridges, PCI and OS devices.
- *
- * \return 1 if an object of type \p type is a I/O object, 0 otherwise.
- */
-HWLOC_DECLSPEC int
-hwloc_obj_type_is_io(hwloc_obj_type_t type);
-
-/** \brief Check whether an object type is Memory.
- *
- * Memory objects are objects attached to their parents
- * in the Memory children list.
- * This current includes NUMA nodes and Memory-side caches.
- *
- * \return 1 if an object of type \p type is a Memory object, 0 otherwise.
- */
-HWLOC_DECLSPEC int
-hwloc_obj_type_is_memory(hwloc_obj_type_t type);
-
-/** \brief Check whether an object type is a CPU Cache (Data, Unified or Instruction).
- *
- * Memory-side caches are not CPU caches.
- *
- * \return 1 if an object of type \p type is a Cache, 0 otherwise.
- */
-HWLOC_DECLSPEC int
-hwloc_obj_type_is_cache(hwloc_obj_type_t type);
-
-/** \brief Check whether an object type is a CPU Data or Unified Cache.
- *
- * Memory-side caches are not CPU caches.
- *
- * \return 1 if an object of type \p type is a CPU Data or Unified Cache, 0 otherwise.
- */
-HWLOC_DECLSPEC int
-hwloc_obj_type_is_dcache(hwloc_obj_type_t type);
-
-/** \brief Check whether an object type is a CPU Instruction Cache,
- *
- * Memory-side caches are not CPU caches.
- *
- * \return 1 if an object of type \p type is a CPU Instruction Cache, 0 otherwise.
- */
-HWLOC_DECLSPEC int
-hwloc_obj_type_is_icache(hwloc_obj_type_t type);
-
-/** @} */
-
-
-
 /** \defgroup hwlocality_helper_find_cache Looking at Cache Objects
  * @{
  */
diff --git a/deps/hwloc/include/hwloc/memattrs.h b/deps/hwloc/include/hwloc/memattrs.h
index 6d2cff9be..10332b8e0 100644
--- a/deps/hwloc/include/hwloc/memattrs.h
+++ b/deps/hwloc/include/hwloc/memattrs.h
@@ -54,6 +54,10 @@ extern "C" {
  * Attribute values for these nodes, if any, may then be obtained with
  * hwloc_memattr_get_value() and manually compared with the desired criteria.
  *
+ * Memory attributes are also used internally to build Memory Tiers which provide
+ * an easy way to distinguish NUMA nodes of different kinds, as explained
+ * in \ref heteromem.
+ *
  * \sa An example is available in doc/examples/memory-attributes.c in the source tree.
  *
  * \note The API also supports specific objects as initiator,
diff --git a/deps/hwloc/include/hwloc/rename.h b/deps/hwloc/include/hwloc/rename.h
index 279ecd842..d5687b694 100644
--- a/deps/hwloc/include/hwloc/rename.h
+++ b/deps/hwloc/include/hwloc/rename.h
@@ -176,6 +176,7 @@ extern "C" {
 
 #define hwloc_topology_insert_misc_object HWLOC_NAME(topology_insert_misc_object)
 #define hwloc_topology_alloc_group_object HWLOC_NAME(topology_alloc_group_object)
+#define hwloc_topology_free_group_object HWLOC_NAME(topology_free_group_object)
 #define hwloc_topology_insert_group_object HWLOC_NAME(topology_insert_group_object)
 #define hwloc_obj_add_other_obj_sets HWLOC_NAME(obj_add_other_obj_sets)
 #define hwloc_topology_refresh HWLOC_NAME(topology_refresh)
diff --git a/deps/hwloc/lib/libhwloc.a b/deps/hwloc/lib/libhwloc.a
index b2eb53c6e..d43356ce5 100644
Binary files a/deps/hwloc/lib/libhwloc.a and b/deps/hwloc/lib/libhwloc.a differ
diff --git a/deps/itt/include/ittnotify.h b/deps/itt/include/ittnotify.h
index d3df4b5e3..1dbc5cda6 100755
--- a/deps/itt/include/ittnotify.h
+++ b/deps/itt/include/ittnotify.h
@@ -309,30 +309,57 @@ void ITTAPI __itt_resume(void);
 /** @brief Detach collection */
 void ITTAPI __itt_detach(void);
 
+/**
+ * @enum __itt_collection_scope
+ * @brief Enumerator for collection scopes
+ */
+typedef enum {
+    __itt_collection_scope_host    = 1 << 0,
+    __itt_collection_scope_offload = 1 << 1,
+    __itt_collection_scope_all     = 0x7FFFFFFF
+} __itt_collection_scope;
+
+/** @brief Pause scoped collection */
+void ITTAPI __itt_pause_scoped(__itt_collection_scope);
+/** @brief Resume scoped collection */
+void ITTAPI __itt_resume_scoped(__itt_collection_scope);
+
 /** @cond exclude_from_documentation */
 #ifndef INTEL_NO_MACRO_BODY
 #ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, pause,  (void))
-ITT_STUBV(ITTAPI, void, resume, (void))
-ITT_STUBV(ITTAPI, void, detach, (void))
-#define __itt_pause      ITTNOTIFY_VOID(pause)
-#define __itt_pause_ptr  ITTNOTIFY_NAME(pause)
-#define __itt_resume     ITTNOTIFY_VOID(resume)
-#define __itt_resume_ptr ITTNOTIFY_NAME(resume)
-#define __itt_detach     ITTNOTIFY_VOID(detach)
-#define __itt_detach_ptr ITTNOTIFY_NAME(detach)
+ITT_STUBV(ITTAPI, void, pause,         (void))
+ITT_STUBV(ITTAPI, void, pause_scoped,  (__itt_collection_scope))
+ITT_STUBV(ITTAPI, void, resume,        (void))
+ITT_STUBV(ITTAPI, void, resume_scoped, (__itt_collection_scope))
+ITT_STUBV(ITTAPI, void, detach,        (void))
+#define __itt_pause             ITTNOTIFY_VOID(pause)
+#define __itt_pause_ptr         ITTNOTIFY_NAME(pause)
+#define __itt_pause_scoped      ITTNOTIFY_VOID(pause_scoped)
+#define __itt_pause_scoped_ptr  ITTNOTIFY_NAME(pause_scoped)
+#define __itt_resume            ITTNOTIFY_VOID(resume)
+#define __itt_resume_ptr        ITTNOTIFY_NAME(resume)
+#define __itt_resume_scoped     ITTNOTIFY_VOID(resume_scoped)
+#define __itt_resume_scoped_ptr ITTNOTIFY_NAME(resume_scoped)
+#define __itt_detach            ITTNOTIFY_VOID(detach)
+#define __itt_detach_ptr        ITTNOTIFY_NAME(detach)
 #else  /* INTEL_NO_ITTNOTIFY_API */
 #define __itt_pause()
-#define __itt_pause_ptr  0
+#define __itt_pause_ptr           0
+#define __itt_pause_scoped(scope)
+#define __itt_pause_scoped_ptr    0
 #define __itt_resume()
-#define __itt_resume_ptr 0
+#define __itt_resume_ptr          0
+#define __itt_resume_scoped(scope)
+#define __itt_resume_scoped_ptr   0
 #define __itt_detach()
-#define __itt_detach_ptr 0
+#define __itt_detach_ptr          0
 #endif /* INTEL_NO_ITTNOTIFY_API */
 #else  /* INTEL_NO_MACRO_BODY */
-#define __itt_pause_ptr  0
-#define __itt_resume_ptr 0
-#define __itt_detach_ptr 0
+#define __itt_pause_ptr           0
+#define __itt_pause_scoped_ptr    0
+#define __itt_resume_ptr          0
+#define __itt_resume_scoped_ptr   0
+#define __itt_detach_ptr          0
 #endif /* INTEL_NO_MACRO_BODY */
 /** @endcond */
 /** @} control group */
@@ -576,8 +603,8 @@ ITT_STUBV(ITTAPI, void, suppress_pop, (void))
 /** @endcond */
 
 /**
- * @enum __itt_model_disable
- * @brief Enumerator for the disable methods
+ * @enum __itt_suppress_mode
+ * @brief Enumerator for the suppressing modes
  */
 typedef enum __itt_suppress_mode {
     __itt_unsuppress_range,
diff --git a/deps/itt/lib64/libittnotify.a b/deps/itt/lib64/libittnotify.a
index f6918bd26..b5a9961d4 100755
Binary files a/deps/itt/lib64/libittnotify.a and b/deps/itt/lib64/libittnotify.a differ
diff --git a/deps/level_zero/include/ze_api.h b/deps/level_zero/include/ze_api.h
index eb0938db4..9d0b425c2 100644
--- a/deps/level_zero/include/ze_api.h
+++ b/deps/level_zero/include/ze_api.h
@@ -5,7 +5,7 @@
  * SPDX-License-Identifier: MIT
  *
  * @file ze_api.h
- * @version v1.6-r1.6.3
+ * @version v1.8-r1.8.0
  *
  */
 #ifndef _ZE_API_H
@@ -168,7 +168,7 @@ typedef struct _ze_fabric_edge_handle_t *ze_fabric_edge_handle_t;
 /// @brief IPC handle to a memory allocation
 typedef struct _ze_ipc_mem_handle_t
 {
-    char data[ZE_MAX_IPC_HANDLE_SIZE];              ///< [out] Opaque data representing an IPC handle
+    char data[ZE_MAX_IPC_HANDLE_SIZE];                                      ///< [out] Opaque data representing an IPC handle
 
 } ze_ipc_mem_handle_t;
 
@@ -176,7 +176,7 @@ typedef struct _ze_ipc_mem_handle_t
 /// @brief IPC handle to a event pool allocation
 typedef struct _ze_ipc_event_pool_handle_t
 {
-    char data[ZE_MAX_IPC_HANDLE_SIZE];              ///< [out] Opaque data representing an IPC handle
+    char data[ZE_MAX_IPC_HANDLE_SIZE];                                      ///< [out] Opaque data representing an IPC handle
 
 } ze_ipc_event_pool_handle_t;
 
@@ -190,59 +190,65 @@ typedef struct _ze_ipc_event_pool_handle_t
 /// @brief Defines Return/Error codes
 typedef enum _ze_result_t
 {
-    ZE_RESULT_SUCCESS = 0,                          ///< [Core] success
-    ZE_RESULT_NOT_READY = 1,                        ///< [Core] synchronization primitive not signaled
-    ZE_RESULT_ERROR_DEVICE_LOST = 0x70000001,       ///< [Core] device hung, reset, was removed, or driver update occurred
-    ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY = 0x70000002,///< [Core] insufficient host memory to satisfy call
-    ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY = 0x70000003,  ///< [Core] insufficient device memory to satisfy call
-    ZE_RESULT_ERROR_MODULE_BUILD_FAILURE = 0x70000004,  ///< [Core] error occurred when building module, see build log for details
-    ZE_RESULT_ERROR_MODULE_LINK_FAILURE = 0x70000005,   ///< [Core] error occurred when linking modules, see build log for details
-    ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET = 0x70000006, ///< [Core] device requires a reset
-    ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE = 0x70000007, ///< [Core] device currently in low power state
-    ZE_RESULT_EXP_ERROR_DEVICE_IS_NOT_VERTEX = 0x7ff00001,  ///< [Core, Expoerimental] device is not represented by a fabric vertex
-    ZE_RESULT_EXP_ERROR_VERTEX_IS_NOT_DEVICE = 0x7ff00002,  ///< [Core, Experimental] fabric vertex does not represent a device
-    ZE_RESULT_EXP_ERROR_REMOTE_DEVICE = 0x7ff00003, ///< [Core, Expoerimental] fabric vertex represents a remote device or
-                                                    ///< subdevice
-    ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS = 0x70010000,  ///< [Sysman] access denied due to permission level
-    ZE_RESULT_ERROR_NOT_AVAILABLE = 0x70010001,     ///< [Sysman] resource already in use and simultaneous access not allowed
-                                                    ///< or resource was removed
-    ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE = 0x70020000,///< [Tools] external required dependency is unavailable or missing
-    ZE_RESULT_WARNING_DROPPED_DATA = 0x70020001,    ///< [Tools] data may have been dropped
-    ZE_RESULT_ERROR_UNINITIALIZED = 0x78000001,     ///< [Validation] driver is not initialized
-    ZE_RESULT_ERROR_UNSUPPORTED_VERSION = 0x78000002,   ///< [Validation] generic error code for unsupported versions
-    ZE_RESULT_ERROR_UNSUPPORTED_FEATURE = 0x78000003,   ///< [Validation] generic error code for unsupported features
-    ZE_RESULT_ERROR_INVALID_ARGUMENT = 0x78000004,  ///< [Validation] generic error code for invalid arguments
-    ZE_RESULT_ERROR_INVALID_NULL_HANDLE = 0x78000005,   ///< [Validation] handle argument is not valid
-    ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE = 0x78000006,  ///< [Validation] object pointed to by handle still in-use by device
-    ZE_RESULT_ERROR_INVALID_NULL_POINTER = 0x78000007,  ///< [Validation] pointer argument may not be nullptr
-    ZE_RESULT_ERROR_INVALID_SIZE = 0x78000008,      ///< [Validation] size argument is invalid (e.g., must not be zero)
-    ZE_RESULT_ERROR_UNSUPPORTED_SIZE = 0x78000009,  ///< [Validation] size argument is not supported by the device (e.g., too
-                                                    ///< large)
-    ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT = 0x7800000a, ///< [Validation] alignment argument is not supported by the device (e.g.,
-                                                    ///< too small)
-    ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT = 0x7800000b,///< [Validation] synchronization object in invalid state
-    ZE_RESULT_ERROR_INVALID_ENUMERATION = 0x7800000c,   ///< [Validation] enumerator argument is not valid
-    ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION = 0x7800000d,   ///< [Validation] enumerator argument is not supported by the device
-    ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT = 0x7800000e,  ///< [Validation] image format is not supported by the device
-    ZE_RESULT_ERROR_INVALID_NATIVE_BINARY = 0x7800000f, ///< [Validation] native binary is not supported by the device
-    ZE_RESULT_ERROR_INVALID_GLOBAL_NAME = 0x78000010,   ///< [Validation] global variable is not found in the module
-    ZE_RESULT_ERROR_INVALID_KERNEL_NAME = 0x78000011,   ///< [Validation] kernel name is not found in the module
-    ZE_RESULT_ERROR_INVALID_FUNCTION_NAME = 0x78000012, ///< [Validation] function name is not found in the module
-    ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION = 0x78000013,  ///< [Validation] group size dimension is not valid for the kernel or
-                                                    ///< device
-    ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION = 0x78000014,///< [Validation] global width dimension is not valid for the kernel or
-                                                    ///< device
-    ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX = 0x78000015, ///< [Validation] kernel argument index is not valid for kernel
-    ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE = 0x78000016,  ///< [Validation] kernel argument size does not match kernel
-    ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE = 0x78000017,///< [Validation] value of kernel attribute is not valid for the kernel or
-                                                    ///< device
-    ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED = 0x78000018,   ///< [Validation] module with imports needs to be linked before kernels can
-                                                    ///< be created from it.
-    ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE = 0x78000019, ///< [Validation] command list type does not match command queue type
-    ZE_RESULT_ERROR_OVERLAPPING_REGIONS = 0x7800001a,   ///< [Validation] copy operations do not support overlapping regions of
-                                                    ///< memory
-    ZE_RESULT_WARNING_ACTION_REQUIRED = 0x7800001b, ///< [Sysman] an action is required to complete the desired operation
-    ZE_RESULT_ERROR_UNKNOWN = 0x7ffffffe,           ///< [Core] unknown or internal error
+    ZE_RESULT_SUCCESS = 0,                                                  ///< [Core] success
+    ZE_RESULT_NOT_READY = 1,                                                ///< [Core] synchronization primitive not signaled
+    ZE_RESULT_ERROR_DEVICE_LOST = 0x70000001,                               ///< [Core] device hung, reset, was removed, or driver update occurred
+    ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY = 0x70000002,                        ///< [Core] insufficient host memory to satisfy call
+    ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY = 0x70000003,                      ///< [Core] insufficient device memory to satisfy call
+    ZE_RESULT_ERROR_MODULE_BUILD_FAILURE = 0x70000004,                      ///< [Core] error occurred when building module, see build log for details
+    ZE_RESULT_ERROR_MODULE_LINK_FAILURE = 0x70000005,                       ///< [Core] error occurred when linking modules, see build log for details
+    ZE_RESULT_ERROR_DEVICE_REQUIRES_RESET = 0x70000006,                     ///< [Core] device requires a reset
+    ZE_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE = 0x70000007,                 ///< [Core] device currently in low power state
+    ZE_RESULT_EXP_ERROR_DEVICE_IS_NOT_VERTEX = 0x7ff00001,                  ///< [Core, Experimental] device is not represented by a fabric vertex
+    ZE_RESULT_EXP_ERROR_VERTEX_IS_NOT_DEVICE = 0x7ff00002,                  ///< [Core, Experimental] fabric vertex does not represent a device
+    ZE_RESULT_EXP_ERROR_REMOTE_DEVICE = 0x7ff00003,                         ///< [Core, Experimental] fabric vertex represents a remote device or
+                                                                            ///< subdevice
+    ZE_RESULT_EXP_ERROR_OPERANDS_INCOMPATIBLE = 0x7ff00004,                 ///< [Core, Experimental] operands of comparison are not compatible
+    ZE_RESULT_EXP_RTAS_BUILD_RETRY = 0x7ff00005,                            ///< [Core, Experimental] ray tracing acceleration structure build
+                                                                            ///< operation failed due to insufficient resources, retry with a larger
+                                                                            ///< acceleration structure buffer allocation
+    ZE_RESULT_EXP_RTAS_BUILD_DEFERRED = 0x7ff00006,                         ///< [Core, Experimental] ray tracing acceleration structure build
+                                                                            ///< operation deferred to parallel operation join
+    ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS = 0x70010000,                  ///< [Sysman] access denied due to permission level
+    ZE_RESULT_ERROR_NOT_AVAILABLE = 0x70010001,                             ///< [Sysman] resource already in use and simultaneous access not allowed
+                                                                            ///< or resource was removed
+    ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE = 0x70020000,                    ///< [Common] external required dependency is unavailable or missing
+    ZE_RESULT_WARNING_DROPPED_DATA = 0x70020001,                            ///< [Tools] data may have been dropped
+    ZE_RESULT_ERROR_UNINITIALIZED = 0x78000001,                             ///< [Validation] driver is not initialized
+    ZE_RESULT_ERROR_UNSUPPORTED_VERSION = 0x78000002,                       ///< [Validation] generic error code for unsupported versions
+    ZE_RESULT_ERROR_UNSUPPORTED_FEATURE = 0x78000003,                       ///< [Validation] generic error code for unsupported features
+    ZE_RESULT_ERROR_INVALID_ARGUMENT = 0x78000004,                          ///< [Validation] generic error code for invalid arguments
+    ZE_RESULT_ERROR_INVALID_NULL_HANDLE = 0x78000005,                       ///< [Validation] handle argument is not valid
+    ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE = 0x78000006,                      ///< [Validation] object pointed to by handle still in-use by device
+    ZE_RESULT_ERROR_INVALID_NULL_POINTER = 0x78000007,                      ///< [Validation] pointer argument may not be nullptr
+    ZE_RESULT_ERROR_INVALID_SIZE = 0x78000008,                              ///< [Validation] size argument is invalid (e.g., must not be zero)
+    ZE_RESULT_ERROR_UNSUPPORTED_SIZE = 0x78000009,                          ///< [Validation] size argument is not supported by the device (e.g., too
+                                                                            ///< large)
+    ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT = 0x7800000a,                     ///< [Validation] alignment argument is not supported by the device (e.g.,
+                                                                            ///< too small)
+    ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT = 0x7800000b,            ///< [Validation] synchronization object in invalid state
+    ZE_RESULT_ERROR_INVALID_ENUMERATION = 0x7800000c,                       ///< [Validation] enumerator argument is not valid
+    ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION = 0x7800000d,                   ///< [Validation] enumerator argument is not supported by the device
+    ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT = 0x7800000e,                  ///< [Validation] image format is not supported by the device
+    ZE_RESULT_ERROR_INVALID_NATIVE_BINARY = 0x7800000f,                     ///< [Validation] native binary is not supported by the device
+    ZE_RESULT_ERROR_INVALID_GLOBAL_NAME = 0x78000010,                       ///< [Validation] global variable is not found in the module
+    ZE_RESULT_ERROR_INVALID_KERNEL_NAME = 0x78000011,                       ///< [Validation] kernel name is not found in the module
+    ZE_RESULT_ERROR_INVALID_FUNCTION_NAME = 0x78000012,                     ///< [Validation] function name is not found in the module
+    ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION = 0x78000013,              ///< [Validation] group size dimension is not valid for the kernel or
+                                                                            ///< device
+    ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION = 0x78000014,            ///< [Validation] global width dimension is not valid for the kernel or
+                                                                            ///< device
+    ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX = 0x78000015,             ///< [Validation] kernel argument index is not valid for kernel
+    ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE = 0x78000016,              ///< [Validation] kernel argument size does not match kernel
+    ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE = 0x78000017,            ///< [Validation] value of kernel attribute is not valid for the kernel or
+                                                                            ///< device
+    ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED = 0x78000018,                   ///< [Validation] module with imports needs to be linked before kernels can
+                                                                            ///< be created from it.
+    ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE = 0x78000019,                 ///< [Validation] command list type does not match command queue type
+    ZE_RESULT_ERROR_OVERLAPPING_REGIONS = 0x7800001a,                       ///< [Validation] copy operations do not support overlapping regions of
+                                                                            ///< memory
+    ZE_RESULT_WARNING_ACTION_REQUIRED = 0x7800001b,                         ///< [Sysman] an action is required to complete the desired operation
+    ZE_RESULT_ERROR_UNKNOWN = 0x7ffffffe,                                   ///< [Core] unknown or internal error
     ZE_RESULT_FORCE_UINT32 = 0x7fffffff
 
 } ze_result_t;
@@ -251,72 +257,80 @@ typedef enum _ze_result_t
 /// @brief Defines structure types
 typedef enum _ze_structure_type_t
 {
-    ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES = 0x1,      ///< ::ze_driver_properties_t
-    ZE_STRUCTURE_TYPE_DRIVER_IPC_PROPERTIES = 0x2,  ///< ::ze_driver_ipc_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x3,      ///< ::ze_device_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES = 0x4,  ///< ::ze_device_compute_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES = 0x5,   ///< ::ze_device_module_properties_t
-    ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES = 0x6, ///< ::ze_command_queue_group_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES = 0x7,   ///< ::ze_device_memory_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES = 0x8,///< ::ze_device_memory_access_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES = 0x9,///< ::ze_device_cache_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES = 0xa,///< ::ze_device_image_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_P2P_PROPERTIES = 0xb,  ///< ::ze_device_p2p_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_EXTERNAL_MEMORY_PROPERTIES = 0xc,  ///< ::ze_device_external_memory_properties_t
-    ZE_STRUCTURE_TYPE_CONTEXT_DESC = 0xd,           ///< ::ze_context_desc_t
-    ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC = 0xe,     ///< ::ze_command_queue_desc_t
-    ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC = 0xf,      ///< ::ze_command_list_desc_t
-    ZE_STRUCTURE_TYPE_EVENT_POOL_DESC = 0x10,       ///< ::ze_event_pool_desc_t
-    ZE_STRUCTURE_TYPE_EVENT_DESC = 0x11,            ///< ::ze_event_desc_t
-    ZE_STRUCTURE_TYPE_FENCE_DESC = 0x12,            ///< ::ze_fence_desc_t
-    ZE_STRUCTURE_TYPE_IMAGE_DESC = 0x13,            ///< ::ze_image_desc_t
-    ZE_STRUCTURE_TYPE_IMAGE_PROPERTIES = 0x14,      ///< ::ze_image_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC = 0x15, ///< ::ze_device_mem_alloc_desc_t
-    ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC = 0x16,   ///< ::ze_host_mem_alloc_desc_t
-    ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES = 0x17,  ///< ::ze_memory_allocation_properties_t
-    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_DESC = 0x18,   ///< ::ze_external_memory_export_desc_t
-    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD = 0x19, ///< ::ze_external_memory_import_fd_t
-    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_FD = 0x1a, ///< ::ze_external_memory_export_fd_t
-    ZE_STRUCTURE_TYPE_MODULE_DESC = 0x1b,           ///< ::ze_module_desc_t
-    ZE_STRUCTURE_TYPE_MODULE_PROPERTIES = 0x1c,     ///< ::ze_module_properties_t
-    ZE_STRUCTURE_TYPE_KERNEL_DESC = 0x1d,           ///< ::ze_kernel_desc_t
-    ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES = 0x1e,     ///< ::ze_kernel_properties_t
-    ZE_STRUCTURE_TYPE_SAMPLER_DESC = 0x1f,          ///< ::ze_sampler_desc_t
-    ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC = 0x20,     ///< ::ze_physical_mem_desc_t
-    ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES = 0x21,///< ::ze_kernel_preferred_group_size_properties_t
-    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_WIN32 = 0x22,  ///< ::ze_external_memory_import_win32_handle_t
-    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_WIN32 = 0x23,  ///< ::ze_external_memory_export_win32_handle_t
-    ZE_STRUCTURE_TYPE_DEVICE_RAYTRACING_EXT_PROPERTIES = 0x00010001,///< ::ze_device_raytracing_ext_properties_t
-    ZE_STRUCTURE_TYPE_RAYTRACING_MEM_ALLOC_EXT_DESC = 0x10002,  ///< ::ze_raytracing_mem_alloc_ext_desc_t
-    ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES = 0x10003,///< ::ze_float_atomic_ext_properties_t
-    ZE_STRUCTURE_TYPE_CACHE_RESERVATION_EXT_DESC = 0x10004, ///< ::ze_cache_reservation_ext_desc_t
-    ZE_STRUCTURE_TYPE_EU_COUNT_EXT = 0x10005,       ///< ::ze_eu_count_ext_t
-    ZE_STRUCTURE_TYPE_SRGB_EXT_DESC = 0x10006,      ///< ::ze_srgb_ext_desc_t
-    ZE_STRUCTURE_TYPE_LINKAGE_INSPECTION_EXT_DESC = 0x10007,///< ::ze_linkage_inspection_ext_desc_t
-    ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES = 0x10008, ///< ::ze_pci_ext_properties_t
-    ZE_STRUCTURE_TYPE_DRIVER_MEMORY_FREE_EXT_PROPERTIES = 0x10009,  ///< ::ze_driver_memory_free_ext_properties_t
-    ZE_STRUCTURE_TYPE_MEMORY_FREE_EXT_DESC = 0x1000a,   ///< ::ze_memory_free_ext_desc_t
-    ZE_STRUCTURE_TYPE_MEMORY_COMPRESSION_HINTS_EXT_DESC = 0x1000b,  ///< ::ze_memory_compression_hints_ext_desc_t
-    ZE_STRUCTURE_TYPE_IMAGE_ALLOCATION_EXT_PROPERTIES = 0x1000c,///< ::ze_image_allocation_ext_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_LUID_EXT_PROPERTIES = 0x1000d, ///< ::ze_device_luid_ext_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_MEMORY_EXT_PROPERTIES = 0x1000e,   ///< ::ze_device_memory_ext_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT = 0x1000f,  ///< ::ze_device_ip_version_ext_t
-    ZE_STRUCTURE_TYPE_IMAGE_VIEW_PLANAR_EXT_DESC = 0x10010, ///< ::ze_image_view_planar_ext_desc_t
+    ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES = 0x1,                              ///< ::ze_driver_properties_t
+    ZE_STRUCTURE_TYPE_DRIVER_IPC_PROPERTIES = 0x2,                          ///< ::ze_driver_ipc_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x3,                              ///< ::ze_device_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES = 0x4,                      ///< ::ze_device_compute_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES = 0x5,                       ///< ::ze_device_module_properties_t
+    ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES = 0x6,                 ///< ::ze_command_queue_group_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES = 0x7,                       ///< ::ze_device_memory_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES = 0x8,                ///< ::ze_device_memory_access_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES = 0x9,                        ///< ::ze_device_cache_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES = 0xa,                        ///< ::ze_device_image_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_P2P_PROPERTIES = 0xb,                          ///< ::ze_device_p2p_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_EXTERNAL_MEMORY_PROPERTIES = 0xc,              ///< ::ze_device_external_memory_properties_t
+    ZE_STRUCTURE_TYPE_CONTEXT_DESC = 0xd,                                   ///< ::ze_context_desc_t
+    ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC = 0xe,                             ///< ::ze_command_queue_desc_t
+    ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC = 0xf,                              ///< ::ze_command_list_desc_t
+    ZE_STRUCTURE_TYPE_EVENT_POOL_DESC = 0x10,                               ///< ::ze_event_pool_desc_t
+    ZE_STRUCTURE_TYPE_EVENT_DESC = 0x11,                                    ///< ::ze_event_desc_t
+    ZE_STRUCTURE_TYPE_FENCE_DESC = 0x12,                                    ///< ::ze_fence_desc_t
+    ZE_STRUCTURE_TYPE_IMAGE_DESC = 0x13,                                    ///< ::ze_image_desc_t
+    ZE_STRUCTURE_TYPE_IMAGE_PROPERTIES = 0x14,                              ///< ::ze_image_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC = 0x15,                         ///< ::ze_device_mem_alloc_desc_t
+    ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC = 0x16,                           ///< ::ze_host_mem_alloc_desc_t
+    ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES = 0x17,                  ///< ::ze_memory_allocation_properties_t
+    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_DESC = 0x18,                   ///< ::ze_external_memory_export_desc_t
+    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD = 0x19,                     ///< ::ze_external_memory_import_fd_t
+    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_FD = 0x1a,                     ///< ::ze_external_memory_export_fd_t
+    ZE_STRUCTURE_TYPE_MODULE_DESC = 0x1b,                                   ///< ::ze_module_desc_t
+    ZE_STRUCTURE_TYPE_MODULE_PROPERTIES = 0x1c,                             ///< ::ze_module_properties_t
+    ZE_STRUCTURE_TYPE_KERNEL_DESC = 0x1d,                                   ///< ::ze_kernel_desc_t
+    ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES = 0x1e,                             ///< ::ze_kernel_properties_t
+    ZE_STRUCTURE_TYPE_SAMPLER_DESC = 0x1f,                                  ///< ::ze_sampler_desc_t
+    ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC = 0x20,                             ///< ::ze_physical_mem_desc_t
+    ZE_STRUCTURE_TYPE_KERNEL_PREFERRED_GROUP_SIZE_PROPERTIES = 0x21,        ///< ::ze_kernel_preferred_group_size_properties_t
+    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_WIN32 = 0x22,                  ///< ::ze_external_memory_import_win32_handle_t
+    ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_WIN32 = 0x23,                  ///< ::ze_external_memory_export_win32_handle_t
+    ZE_STRUCTURE_TYPE_DEVICE_RAYTRACING_EXT_PROPERTIES = 0x00010001,        ///< ::ze_device_raytracing_ext_properties_t
+    ZE_STRUCTURE_TYPE_RAYTRACING_MEM_ALLOC_EXT_DESC = 0x10002,              ///< ::ze_raytracing_mem_alloc_ext_desc_t
+    ZE_STRUCTURE_TYPE_FLOAT_ATOMIC_EXT_PROPERTIES = 0x10003,                ///< ::ze_float_atomic_ext_properties_t
+    ZE_STRUCTURE_TYPE_CACHE_RESERVATION_EXT_DESC = 0x10004,                 ///< ::ze_cache_reservation_ext_desc_t
+    ZE_STRUCTURE_TYPE_EU_COUNT_EXT = 0x10005,                               ///< ::ze_eu_count_ext_t
+    ZE_STRUCTURE_TYPE_SRGB_EXT_DESC = 0x10006,                              ///< ::ze_srgb_ext_desc_t
+    ZE_STRUCTURE_TYPE_LINKAGE_INSPECTION_EXT_DESC = 0x10007,                ///< ::ze_linkage_inspection_ext_desc_t
+    ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES = 0x10008,                         ///< ::ze_pci_ext_properties_t
+    ZE_STRUCTURE_TYPE_DRIVER_MEMORY_FREE_EXT_PROPERTIES = 0x10009,          ///< ::ze_driver_memory_free_ext_properties_t
+    ZE_STRUCTURE_TYPE_MEMORY_FREE_EXT_DESC = 0x1000a,                       ///< ::ze_memory_free_ext_desc_t
+    ZE_STRUCTURE_TYPE_MEMORY_COMPRESSION_HINTS_EXT_DESC = 0x1000b,          ///< ::ze_memory_compression_hints_ext_desc_t
+    ZE_STRUCTURE_TYPE_IMAGE_ALLOCATION_EXT_PROPERTIES = 0x1000c,            ///< ::ze_image_allocation_ext_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_LUID_EXT_PROPERTIES = 0x1000d,                 ///< ::ze_device_luid_ext_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_MEMORY_EXT_PROPERTIES = 0x1000e,               ///< ::ze_device_memory_ext_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT = 0x1000f,                      ///< ::ze_device_ip_version_ext_t
+    ZE_STRUCTURE_TYPE_IMAGE_VIEW_PLANAR_EXT_DESC = 0x10010,                 ///< ::ze_image_view_planar_ext_desc_t
     ZE_STRUCTURE_TYPE_EVENT_QUERY_KERNEL_TIMESTAMPS_EXT_PROPERTIES = 0x10011,   ///< ::ze_event_query_kernel_timestamps_ext_properties_t
     ZE_STRUCTURE_TYPE_EVENT_QUERY_KERNEL_TIMESTAMPS_RESULTS_EXT_PROPERTIES = 0x10012,   ///< ::ze_event_query_kernel_timestamps_results_ext_properties_t
-    ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC = 0x00020001,  ///< ::ze_relaxed_allocation_limits_exp_desc_t
-    ZE_STRUCTURE_TYPE_MODULE_PROGRAM_EXP_DESC = 0x00020002, ///< ::ze_module_program_exp_desc_t
-    ZE_STRUCTURE_TYPE_SCHEDULING_HINT_EXP_PROPERTIES = 0x00020003,  ///< ::ze_scheduling_hint_exp_properties_t
-    ZE_STRUCTURE_TYPE_SCHEDULING_HINT_EXP_DESC = 0x00020004,///< ::ze_scheduling_hint_exp_desc_t
-    ZE_STRUCTURE_TYPE_IMAGE_VIEW_PLANAR_EXP_DESC = 0x00020005,  ///< ::ze_image_view_planar_exp_desc_t
-    ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2 = 0x00020006,   ///< ::ze_device_properties_t
-    ZE_STRUCTURE_TYPE_IMAGE_MEMORY_EXP_PROPERTIES = 0x00020007, ///< ::ze_image_memory_properties_exp_t
-    ZE_STRUCTURE_TYPE_POWER_SAVING_HINT_EXP_DESC = 0x00020008,  ///< ::ze_context_power_saving_hint_exp_desc_t
-    ZE_STRUCTURE_TYPE_COPY_BANDWIDTH_EXP_PROPERTIES = 0x00020009,   ///< ::ze_copy_bandwidth_exp_properties_t
-    ZE_STRUCTURE_TYPE_DEVICE_P2P_BANDWIDTH_EXP_PROPERTIES = 0x0002000A, ///< ::ze_device_p2p_bandwidth_exp_properties_t
-    ZE_STRUCTURE_TYPE_FABRIC_VERTEX_EXP_PROPERTIES = 0x0002000B,///< ::ze_fabric_vertex_exp_properties_t
-    ZE_STRUCTURE_TYPE_FABRIC_EDGE_EXP_PROPERTIES = 0x0002000C,  ///< ::ze_fabric_edge_exp_properties_t
+    ZE_STRUCTURE_TYPE_KERNEL_MAX_GROUP_SIZE_EXT_PROPERTIES = 0x10013,       ///< ::ze_kernel_max_group_size_ext_properties_t
+    ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC = 0x00020001,      ///< ::ze_relaxed_allocation_limits_exp_desc_t
+    ZE_STRUCTURE_TYPE_MODULE_PROGRAM_EXP_DESC = 0x00020002,                 ///< ::ze_module_program_exp_desc_t
+    ZE_STRUCTURE_TYPE_SCHEDULING_HINT_EXP_PROPERTIES = 0x00020003,          ///< ::ze_scheduling_hint_exp_properties_t
+    ZE_STRUCTURE_TYPE_SCHEDULING_HINT_EXP_DESC = 0x00020004,                ///< ::ze_scheduling_hint_exp_desc_t
+    ZE_STRUCTURE_TYPE_IMAGE_VIEW_PLANAR_EXP_DESC = 0x00020005,              ///< ::ze_image_view_planar_exp_desc_t
+    ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2 = 0x00020006,                   ///< ::ze_device_properties_t
+    ZE_STRUCTURE_TYPE_IMAGE_MEMORY_EXP_PROPERTIES = 0x00020007,             ///< ::ze_image_memory_properties_exp_t
+    ZE_STRUCTURE_TYPE_POWER_SAVING_HINT_EXP_DESC = 0x00020008,              ///< ::ze_context_power_saving_hint_exp_desc_t
+    ZE_STRUCTURE_TYPE_COPY_BANDWIDTH_EXP_PROPERTIES = 0x00020009,           ///< ::ze_copy_bandwidth_exp_properties_t
+    ZE_STRUCTURE_TYPE_DEVICE_P2P_BANDWIDTH_EXP_PROPERTIES = 0x0002000A,     ///< ::ze_device_p2p_bandwidth_exp_properties_t
+    ZE_STRUCTURE_TYPE_FABRIC_VERTEX_EXP_PROPERTIES = 0x0002000B,            ///< ::ze_fabric_vertex_exp_properties_t
+    ZE_STRUCTURE_TYPE_FABRIC_EDGE_EXP_PROPERTIES = 0x0002000C,              ///< ::ze_fabric_edge_exp_properties_t
     ZE_STRUCTURE_TYPE_MEMORY_SUB_ALLOCATIONS_EXP_PROPERTIES = 0x0002000D,   ///< ::ze_memory_sub_allocations_exp_properties_t
+    ZE_STRUCTURE_TYPE_RTAS_BUILDER_EXP_DESC = 0x0002000E,                   ///< ::ze_rtas_builder_exp_desc_t
+    ZE_STRUCTURE_TYPE_RTAS_BUILDER_BUILD_OP_EXP_DESC = 0x0002000F,          ///< ::ze_rtas_builder_build_op_exp_desc_t
+    ZE_STRUCTURE_TYPE_RTAS_BUILDER_EXP_PROPERTIES = 0x00020010,             ///< ::ze_rtas_builder_exp_properties_t
+    ZE_STRUCTURE_TYPE_RTAS_PARALLEL_OPERATION_EXP_PROPERTIES = 0x00020011,  ///< ::ze_rtas_parallel_operation_exp_properties_t
+    ZE_STRUCTURE_TYPE_RTAS_DEVICE_EXP_PROPERTIES = 0x00020012,              ///< ::ze_rtas_device_exp_properties_t
+    ZE_STRUCTURE_TYPE_RTAS_GEOMETRY_AABBS_EXP_CB_PARAMS = 0x00020013,       ///< ::ze_rtas_geometry_aabbs_exp_cb_params_t
+    ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC = 0x00020014,       ///< ::ze_event_pool_counter_based_exp_desc_t
     ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } ze_structure_type_t;
@@ -326,15 +340,15 @@ typedef enum _ze_structure_type_t
 typedef uint32_t ze_external_memory_type_flags_t;
 typedef enum _ze_external_memory_type_flag_t
 {
-    ZE_EXTERNAL_MEMORY_TYPE_FLAG_OPAQUE_FD = ZE_BIT(0), ///< an opaque POSIX file descriptor handle
-    ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF = ZE_BIT(1),   ///< a file descriptor handle for a Linux dma_buf
-    ZE_EXTERNAL_MEMORY_TYPE_FLAG_OPAQUE_WIN32 = ZE_BIT(2),  ///< an NT handle
-    ZE_EXTERNAL_MEMORY_TYPE_FLAG_OPAQUE_WIN32_KMT = ZE_BIT(3),  ///< a global share (KMT) handle
-    ZE_EXTERNAL_MEMORY_TYPE_FLAG_D3D11_TEXTURE = ZE_BIT(4), ///< an NT handle referring to a Direct3D 10 or 11 texture resource
-    ZE_EXTERNAL_MEMORY_TYPE_FLAG_D3D11_TEXTURE_KMT = ZE_BIT(5), ///< a global share (KMT) handle referring to a Direct3D 10 or 11 texture
-                                                    ///< resource
-    ZE_EXTERNAL_MEMORY_TYPE_FLAG_D3D12_HEAP = ZE_BIT(6),///< an NT handle referring to a Direct3D 12 heap resource
-    ZE_EXTERNAL_MEMORY_TYPE_FLAG_D3D12_RESOURCE = ZE_BIT(7),///< an NT handle referring to a Direct3D 12 committed resource
+    ZE_EXTERNAL_MEMORY_TYPE_FLAG_OPAQUE_FD = ZE_BIT(0),                     ///< an opaque POSIX file descriptor handle
+    ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF = ZE_BIT(1),                       ///< a file descriptor handle for a Linux dma_buf
+    ZE_EXTERNAL_MEMORY_TYPE_FLAG_OPAQUE_WIN32 = ZE_BIT(2),                  ///< an NT handle
+    ZE_EXTERNAL_MEMORY_TYPE_FLAG_OPAQUE_WIN32_KMT = ZE_BIT(3),              ///< a global share (KMT) handle
+    ZE_EXTERNAL_MEMORY_TYPE_FLAG_D3D11_TEXTURE = ZE_BIT(4),                 ///< an NT handle referring to a Direct3D 10 or 11 texture resource
+    ZE_EXTERNAL_MEMORY_TYPE_FLAG_D3D11_TEXTURE_KMT = ZE_BIT(5),             ///< a global share (KMT) handle referring to a Direct3D 10 or 11 texture
+                                                                            ///< resource
+    ZE_EXTERNAL_MEMORY_TYPE_FLAG_D3D12_HEAP = ZE_BIT(6),                    ///< an NT handle referring to a Direct3D 12 heap resource
+    ZE_EXTERNAL_MEMORY_TYPE_FLAG_D3D12_RESOURCE = ZE_BIT(7),                ///< an NT handle referring to a Direct3D 12 committed resource
     ZE_EXTERNAL_MEMORY_TYPE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_external_memory_type_flag_t;
@@ -343,9 +357,9 @@ typedef enum _ze_external_memory_type_flag_t
 /// @brief Bandwidth unit
 typedef enum _ze_bandwidth_unit_t
 {
-    ZE_BANDWIDTH_UNIT_UNKNOWN = 0,                  ///< The unit used for bandwidth is unknown
-    ZE_BANDWIDTH_UNIT_BYTES_PER_NANOSEC = 1,        ///< Bandwidth is provided in bytes/nanosec
-    ZE_BANDWIDTH_UNIT_BYTES_PER_CLOCK = 2,          ///< Bandwidth is provided in bytes/clock
+    ZE_BANDWIDTH_UNIT_UNKNOWN = 0,                                          ///< The unit used for bandwidth is unknown
+    ZE_BANDWIDTH_UNIT_BYTES_PER_NANOSEC = 1,                                ///< Bandwidth is provided in bytes/nanosec
+    ZE_BANDWIDTH_UNIT_BYTES_PER_CLOCK = 2,                                  ///< Bandwidth is provided in bytes/clock
     ZE_BANDWIDTH_UNIT_FORCE_UINT32 = 0x7fffffff
 
 } ze_bandwidth_unit_t;
@@ -354,11 +368,11 @@ typedef enum _ze_bandwidth_unit_t
 /// @brief Latency unit
 typedef enum _ze_latency_unit_t
 {
-    ZE_LATENCY_UNIT_UNKNOWN = 0,                    ///< The unit used for latency is unknown
-    ZE_LATENCY_UNIT_NANOSEC = 1,                    ///< Latency is provided in nanosecs
-    ZE_LATENCY_UNIT_CLOCK = 2,                      ///< Latency is provided in clocks
-    ZE_LATENCY_UNIT_HOP = 3,                        ///< Latency is provided in hops (normalized so that the lowest latency
-                                                    ///< link has a latency of 1 hop)
+    ZE_LATENCY_UNIT_UNKNOWN = 0,                                            ///< The unit used for latency is unknown
+    ZE_LATENCY_UNIT_NANOSEC = 1,                                            ///< Latency is provided in nanosecs
+    ZE_LATENCY_UNIT_CLOCK = 2,                                              ///< Latency is provided in clocks
+    ZE_LATENCY_UNIT_HOP = 3,                                                ///< Latency is provided in hops (normalized so that the lowest latency
+                                                                            ///< link has a latency of 1 hop)
     ZE_LATENCY_UNIT_FORCE_UINT32 = 0x7fffffff
 
 } ze_latency_unit_t;
@@ -373,17 +387,27 @@ typedef enum _ze_latency_unit_t
 /// @brief Universal unique id (UUID)
 typedef struct _ze_uuid_t
 {
-    uint8_t id[ZE_MAX_UUID_SIZE];                   ///< [out] opaque data representing a UUID
+    uint8_t id[ZE_MAX_UUID_SIZE];                                           ///< [out] opaque data representing a UUID
 
 } ze_uuid_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Base for all callback function parameter types
+typedef struct _ze_base_cb_params_t
+{
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+
+} ze_base_cb_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Base for all properties types
 typedef struct _ze_base_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
 
 } ze_base_properties_t;
 
@@ -391,9 +415,9 @@ typedef struct _ze_base_properties_t
 /// @brief Base for all descriptor types
 typedef struct _ze_base_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
 
 } ze_base_desc_t;
 
@@ -407,6 +431,10 @@ typedef struct _ze_base_desc_t
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Forces all shared allocations into device memory
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Defines the device hierarchy model exposed by Level Zero driver
+///        implementation
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Forward-declare ze_ipc_mem_handle_t
 typedef struct _ze_ipc_mem_handle_t ze_ipc_mem_handle_t;
@@ -419,6 +447,10 @@ typedef struct _ze_ipc_event_pool_handle_t ze_ipc_event_pool_handle_t;
 /// @brief Forward-declare ze_uuid_t
 typedef struct _ze_uuid_t ze_uuid_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_base_cb_params_t
+typedef struct _ze_base_cb_params_t ze_base_cb_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Forward-declare ze_base_properties_t
 typedef struct _ze_base_properties_t ze_base_properties_t;
@@ -771,6 +803,82 @@ typedef struct _ze_synchronized_timestamp_result_ext_t ze_synchronized_timestamp
 /// @brief Forward-declare ze_event_query_kernel_timestamps_results_ext_properties_t
 typedef struct _ze_event_query_kernel_timestamps_results_ext_properties_t ze_event_query_kernel_timestamps_results_ext_properties_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_builder_exp_desc_t
+typedef struct _ze_rtas_builder_exp_desc_t ze_rtas_builder_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_builder_exp_properties_t
+typedef struct _ze_rtas_builder_exp_properties_t ze_rtas_builder_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_parallel_operation_exp_properties_t
+typedef struct _ze_rtas_parallel_operation_exp_properties_t ze_rtas_parallel_operation_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_device_exp_properties_t
+typedef struct _ze_rtas_device_exp_properties_t ze_rtas_device_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_float3_exp_t
+typedef struct _ze_rtas_float3_exp_t ze_rtas_float3_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_transform_float3x4_column_major_exp_t
+typedef struct _ze_rtas_transform_float3x4_column_major_exp_t ze_rtas_transform_float3x4_column_major_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_transform_float3x4_aligned_column_major_exp_t
+typedef struct _ze_rtas_transform_float3x4_aligned_column_major_exp_t ze_rtas_transform_float3x4_aligned_column_major_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_transform_float3x4_row_major_exp_t
+typedef struct _ze_rtas_transform_float3x4_row_major_exp_t ze_rtas_transform_float3x4_row_major_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_aabb_exp_t
+typedef struct _ze_rtas_aabb_exp_t ze_rtas_aabb_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_triangle_indices_uint32_exp_t
+typedef struct _ze_rtas_triangle_indices_uint32_exp_t ze_rtas_triangle_indices_uint32_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_quad_indices_uint32_exp_t
+typedef struct _ze_rtas_quad_indices_uint32_exp_t ze_rtas_quad_indices_uint32_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_builder_geometry_info_exp_t
+typedef struct _ze_rtas_builder_geometry_info_exp_t ze_rtas_builder_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_builder_triangles_geometry_info_exp_t
+typedef struct _ze_rtas_builder_triangles_geometry_info_exp_t ze_rtas_builder_triangles_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_builder_quads_geometry_info_exp_t
+typedef struct _ze_rtas_builder_quads_geometry_info_exp_t ze_rtas_builder_quads_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_geometry_aabbs_exp_cb_params_t
+typedef struct _ze_rtas_geometry_aabbs_exp_cb_params_t ze_rtas_geometry_aabbs_exp_cb_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_builder_procedural_geometry_info_exp_t
+typedef struct _ze_rtas_builder_procedural_geometry_info_exp_t ze_rtas_builder_procedural_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_builder_instance_geometry_info_exp_t
+typedef struct _ze_rtas_builder_instance_geometry_info_exp_t ze_rtas_builder_instance_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_rtas_builder_build_op_exp_desc_t
+typedef struct _ze_rtas_builder_build_op_exp_desc_t ze_rtas_builder_build_op_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare ze_event_pool_counter_based_exp_desc_t
+typedef struct _ze_event_pool_counter_based_exp_desc_t ze_event_pool_counter_based_exp_desc_t;
+
 
 #if !defined(__GNUC__)
 #pragma endregion
@@ -784,8 +892,8 @@ typedef struct _ze_event_query_kernel_timestamps_results_ext_properties_t ze_eve
 typedef uint32_t ze_init_flags_t;
 typedef enum _ze_init_flag_t
 {
-    ZE_INIT_FLAG_GPU_ONLY = ZE_BIT(0),              ///< only initialize GPU drivers
-    ZE_INIT_FLAG_VPU_ONLY = ZE_BIT(1),              ///< only initialize VPU drivers
+    ZE_INIT_FLAG_GPU_ONLY = ZE_BIT(0),                                      ///< only initialize GPU drivers
+    ZE_INIT_FLAG_VPU_ONLY = ZE_BIT(1),                                      ///< only initialize VPU drivers
     ZE_INIT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_init_flag_t;
@@ -817,8 +925,8 @@ typedef enum _ze_init_flag_t
 ///         + `0x3 < flags`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeInit(
-    ze_init_flags_t flags                           ///< [in] initialization flags.
-                                                    ///< must be 0 (default) or a combination of ::ze_init_flag_t.
+    ze_init_flags_t flags                                                   ///< [in] initialization flags.
+                                                                            ///< must be 0 (default) or a combination of ::ze_init_flag_t.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -847,14 +955,14 @@ zeInit(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDriverGet(
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of driver instances.
-                                                    ///< if count is zero, then the loader shall update the value with the
-                                                    ///< total number of drivers available.
-                                                    ///< if count is greater than the number of drivers available, then the
-                                                    ///< loader shall update the value with the correct number of drivers available.
-    ze_driver_handle_t* phDrivers                   ///< [in,out][optional][range(0, *pCount)] array of driver instance handles.
-                                                    ///< if count is less than the number of drivers available, then the loader
-                                                    ///< shall only retrieve that number of drivers.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of driver instances.
+                                                                            ///< if count is zero, then the loader shall update the value with the
+                                                                            ///< total number of drivers available.
+                                                                            ///< if count is greater than the number of drivers available, then the
+                                                                            ///< loader shall update the value with the correct number of drivers available.
+    ze_driver_handle_t* phDrivers                                           ///< [in,out][optional][range(0, *pCount)] array of driver instance handles.
+                                                                            ///< if count is less than the number of drivers available, then the loader
+                                                                            ///< shall only retrieve that number of drivers.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -865,14 +973,16 @@ zeDriverGet(
 ///       ::ZE_MAJOR_VERSION and ::ZE_MINOR_VERSION
 typedef enum _ze_api_version_t
 {
-    ZE_API_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),   ///< version 1.0
-    ZE_API_VERSION_1_1 = ZE_MAKE_VERSION( 1, 1 ),   ///< version 1.1
-    ZE_API_VERSION_1_2 = ZE_MAKE_VERSION( 1, 2 ),   ///< version 1.2
-    ZE_API_VERSION_1_3 = ZE_MAKE_VERSION( 1, 3 ),   ///< version 1.3
-    ZE_API_VERSION_1_4 = ZE_MAKE_VERSION( 1, 4 ),   ///< version 1.4
-    ZE_API_VERSION_1_5 = ZE_MAKE_VERSION( 1, 5 ),   ///< version 1.5
-    ZE_API_VERSION_1_6 = ZE_MAKE_VERSION( 1, 6 ),   ///< version 1.6
-    ZE_API_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 6 ),   ///< latest known version
+    ZE_API_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                           ///< version 1.0
+    ZE_API_VERSION_1_1 = ZE_MAKE_VERSION( 1, 1 ),                           ///< version 1.1
+    ZE_API_VERSION_1_2 = ZE_MAKE_VERSION( 1, 2 ),                           ///< version 1.2
+    ZE_API_VERSION_1_3 = ZE_MAKE_VERSION( 1, 3 ),                           ///< version 1.3
+    ZE_API_VERSION_1_4 = ZE_MAKE_VERSION( 1, 4 ),                           ///< version 1.4
+    ZE_API_VERSION_1_5 = ZE_MAKE_VERSION( 1, 5 ),                           ///< version 1.5
+    ZE_API_VERSION_1_6 = ZE_MAKE_VERSION( 1, 6 ),                           ///< version 1.6
+    ZE_API_VERSION_1_7 = ZE_MAKE_VERSION( 1, 7 ),                           ///< version 1.7
+    ZE_API_VERSION_1_8 = ZE_MAKE_VERSION( 1, 8 ),                           ///< version 1.8
+    ZE_API_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 8 ),                       ///< latest known version
     ZE_API_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_api_version_t;
@@ -896,8 +1006,8 @@ typedef enum _ze_api_version_t
 ///         + `nullptr == version`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDriverGetApiVersion(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    ze_api_version_t* version                       ///< [out] api version
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    ze_api_version_t* version                                               ///< [out] api version
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -910,7 +1020,7 @@ zeDriverGetApiVersion(
 /// @brief Driver universal unique id (UUID)
 typedef struct _ze_driver_uuid_t
 {
-    uint8_t id[ZE_MAX_DRIVER_UUID_SIZE];            ///< [out] opaque data representing a driver UUID
+    uint8_t id[ZE_MAX_DRIVER_UUID_SIZE];                                    ///< [out] opaque data representing a driver UUID
 
 } ze_driver_uuid_t;
 
@@ -918,13 +1028,13 @@ typedef struct _ze_driver_uuid_t
 /// @brief Driver properties queried using ::zeDriverGetProperties
 typedef struct _ze_driver_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_driver_uuid_t uuid;                          ///< [out] universal unique identifier.
-    uint32_t driverVersion;                         ///< [out] driver version
-                                                    ///< The driver version is a non-zero, monotonically increasing value where
-                                                    ///< higher values always indicate a more recent version.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_driver_uuid_t uuid;                                                  ///< [out] universal unique identifier.
+    uint32_t driverVersion;                                                 ///< [out] driver version
+                                                                            ///< The driver version is a non-zero, monotonically increasing value where
+                                                                            ///< higher values always indicate a more recent version.
 
 } ze_driver_properties_t;
 
@@ -951,8 +1061,8 @@ typedef struct _ze_driver_properties_t
 ///         + `nullptr == pDriverProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDriverGetProperties(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    ze_driver_properties_t* pDriverProperties       ///< [in,out] query result for driver properties
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    ze_driver_properties_t* pDriverProperties                               ///< [in,out] query result for driver properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -960,10 +1070,10 @@ zeDriverGetProperties(
 typedef uint32_t ze_ipc_property_flags_t;
 typedef enum _ze_ipc_property_flag_t
 {
-    ZE_IPC_PROPERTY_FLAG_MEMORY = ZE_BIT(0),        ///< Supports passing memory allocations between processes. See
-                                                    ///< ::zeMemGetIpcHandle.
-    ZE_IPC_PROPERTY_FLAG_EVENT_POOL = ZE_BIT(1),    ///< Supports passing event pools between processes. See
-                                                    ///< ::zeEventPoolGetIpcHandle.
+    ZE_IPC_PROPERTY_FLAG_MEMORY = ZE_BIT(0),                                ///< Supports passing memory allocations between processes. See
+                                                                            ///< ::zeMemGetIpcHandle.
+    ZE_IPC_PROPERTY_FLAG_EVENT_POOL = ZE_BIT(1),                            ///< Supports passing event pools between processes. See
+                                                                            ///< ::zeEventPoolGetIpcHandle.
     ZE_IPC_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_ipc_property_flag_t;
@@ -972,10 +1082,10 @@ typedef enum _ze_ipc_property_flag_t
 /// @brief IPC properties queried using ::zeDriverGetIpcProperties
 typedef struct _ze_driver_ipc_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_ipc_property_flags_t flags;                  ///< [out] 0 (none) or a valid combination of ::ze_ipc_property_flag_t
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_ipc_property_flags_t flags;                                          ///< [out] 0 (none) or a valid combination of ::ze_ipc_property_flag_t
 
 } ze_driver_ipc_properties_t;
 
@@ -998,8 +1108,8 @@ typedef struct _ze_driver_ipc_properties_t
 ///         + `nullptr == pIpcProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDriverGetIpcProperties(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    ze_driver_ipc_properties_t* pIpcProperties      ///< [in,out] query result for IPC properties
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    ze_driver_ipc_properties_t* pIpcProperties                              ///< [in,out] query result for IPC properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1012,8 +1122,8 @@ zeDriverGetIpcProperties(
 /// @brief Extension properties queried using ::zeDriverGetExtensionProperties
 typedef struct _ze_driver_extension_properties_t
 {
-    char name[ZE_MAX_EXTENSION_NAME];               ///< [out] extension name
-    uint32_t version;                               ///< [out] extension version using ::ZE_MAKE_VERSION
+    char name[ZE_MAX_EXTENSION_NAME];                                       ///< [out] extension name
+    uint32_t version;                                                       ///< [out] extension version using ::ZE_MAKE_VERSION
 
 } ze_driver_extension_properties_t;
 
@@ -1040,17 +1150,17 @@ typedef struct _ze_driver_extension_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDriverGetExtensionProperties(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of extension properties.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of extension properties available.
-                                                    ///< if count is greater than the number of extension properties available,
-                                                    ///< then the driver shall update the value with the correct number of
-                                                    ///< extension properties available.
-    ze_driver_extension_properties_t* pExtensionProperties  ///< [in,out][optional][range(0, *pCount)] array of query results for
-                                                    ///< extension properties.
-                                                    ///< if count is less than the number of extension properties available,
-                                                    ///< then driver shall only retrieve that number of extension properties.
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of extension properties.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of extension properties available.
+                                                                            ///< if count is greater than the number of extension properties available,
+                                                                            ///< then the driver shall update the value with the correct number of
+                                                                            ///< extension properties available.
+    ze_driver_extension_properties_t* pExtensionProperties                  ///< [in,out][optional][range(0, *pCount)] array of query results for
+                                                                            ///< extension properties.
+                                                                            ///< if count is less than the number of extension properties available,
+                                                                            ///< then driver shall only retrieve that number of extension properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1074,9 +1184,9 @@ zeDriverGetExtensionProperties(
 ///         + `nullptr == ppFunctionAddress`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDriverGetExtensionFunctionAddress(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    const char* name,                               ///< [in] extension function name
-    void** ppFunctionAddress                        ///< [out] pointer to function pointer
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    const char* name,                                                       ///< [in] extension function name
+    void** ppFunctionAddress                                                ///< [out] pointer to function pointer
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1104,9 +1214,9 @@ zeDriverGetExtensionFunctionAddress(
 ///         + `nullptr == ppString`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDriverGetLastErrorDescription(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    const char** ppString                           ///< [in,out] pointer to a null-terminated array of characters describing
-                                                    ///< cause of error.
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    const char** ppString                                                   ///< [in,out] pointer to a null-terminated array of characters describing
+                                                                            ///< cause of error.
     );
 
 #if !defined(__GNUC__)
@@ -1140,21 +1250,54 @@ zeDriverGetLastErrorDescription(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGet(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of devices.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of devices available.
-                                                    ///< if count is greater than the number of devices available, then the
-                                                    ///< driver shall update the value with the correct number of devices available.
-    ze_device_handle_t* phDevices                   ///< [in,out][optional][range(0, *pCount)] array of handle of devices.
-                                                    ///< if count is less than the number of devices available, then driver
-                                                    ///< shall only retrieve that number of devices.
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of devices.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of devices available.
+                                                                            ///< if count is greater than the number of devices available, then the
+                                                                            ///< driver shall update the value with the correct number of devices available.
+    ze_device_handle_t* phDevices                                           ///< [in,out][optional][range(0, *pCount)] array of handle of devices.
+                                                                            ///< if count is less than the number of devices available, then driver
+                                                                            ///< shall only retrieve that number of devices.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Retrieves the root-device of a device handle
+/// 
+/// @details
+///     - When the device handle passed does not belong to any root-device,
+///       nullptr is returned.
+///     - Multiple calls to this function will return the same device handle.
+///     - The root-device handle returned by this function does not have access
+///       automatically to the resources
+///       created with the associated sub-device, unless those resources have
+///       been created with a context
+///       explicitly containing both handles.
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDevice`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == phRootDevice`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeDeviceGetRootDevice(
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    ze_device_handle_t* phRootDevice                                        ///< [in,out] parent root device.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Retrieves a sub-device from a device
 /// 
 /// @details
+///     - When the device handle passed does not contain any sub-device, a
+///       pCount of 0 is returned.
 ///     - Multiple calls to this function will return identical device handles,
 ///       in the same order.
 ///     - The number of handles returned from this function is affected by the
@@ -1178,26 +1321,26 @@ zeDeviceGet(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetSubDevices(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of sub-devices.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of sub-devices available.
-                                                    ///< if count is greater than the number of sub-devices available, then the
-                                                    ///< driver shall update the value with the correct number of sub-devices available.
-    ze_device_handle_t* phSubdevices                ///< [in,out][optional][range(0, *pCount)] array of handle of sub-devices.
-                                                    ///< if count is less than the number of sub-devices available, then driver
-                                                    ///< shall only retrieve that number of sub-devices.
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of sub-devices.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of sub-devices available.
+                                                                            ///< if count is greater than the number of sub-devices available, then the
+                                                                            ///< driver shall update the value with the correct number of sub-devices available.
+    ze_device_handle_t* phSubdevices                                        ///< [in,out][optional][range(0, *pCount)] array of handle of sub-devices.
+                                                                            ///< if count is less than the number of sub-devices available, then driver
+                                                                            ///< shall only retrieve that number of sub-devices.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Supported device types
 typedef enum _ze_device_type_t
 {
-    ZE_DEVICE_TYPE_GPU = 1,                         ///< Graphics Processing Unit
-    ZE_DEVICE_TYPE_CPU = 2,                         ///< Central Processing Unit
-    ZE_DEVICE_TYPE_FPGA = 3,                        ///< Field Programmable Gate Array
-    ZE_DEVICE_TYPE_MCA = 4,                         ///< Memory Copy Accelerator
-    ZE_DEVICE_TYPE_VPU = 5,                         ///< Vision Processing Unit
+    ZE_DEVICE_TYPE_GPU = 1,                                                 ///< Graphics Processing Unit
+    ZE_DEVICE_TYPE_CPU = 2,                                                 ///< Central Processing Unit
+    ZE_DEVICE_TYPE_FPGA = 3,                                                ///< Field Programmable Gate Array
+    ZE_DEVICE_TYPE_MCA = 4,                                                 ///< Memory Copy Accelerator
+    ZE_DEVICE_TYPE_VPU = 5,                                                 ///< Vision Processing Unit
     ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_type_t;
@@ -1212,7 +1355,7 @@ typedef enum _ze_device_type_t
 /// @brief Device universal unique id (UUID)
 typedef struct _ze_device_uuid_t
 {
-    uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];            ///< [out] opaque data representing a device UUID
+    uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];                                    ///< [out] opaque data representing a device UUID
 
 } ze_device_uuid_t;
 
@@ -1227,10 +1370,10 @@ typedef struct _ze_device_uuid_t
 typedef uint32_t ze_device_property_flags_t;
 typedef enum _ze_device_property_flag_t
 {
-    ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0), ///< Device is integrated with the Host.
-    ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),  ///< Device handle used for query represents a sub-device.
-    ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),        ///< Device supports error correction memory access.
-    ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3), ///< Device supports on-demand page-faulting.
+    ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),                         ///< Device is integrated with the Host.
+    ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),                          ///< Device handle used for query represents a sub-device.
+    ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),                                ///< Device supports error correction memory access.
+    ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),                     ///< Device supports on-demand page-faulting.
     ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_property_flag_t;
@@ -1239,36 +1382,36 @@ typedef enum _ze_device_property_flag_t
 /// @brief Device properties queried using ::zeDeviceGetProperties
 typedef struct _ze_device_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_type_t type;                          ///< [out] generic device type
-    uint32_t vendorId;                              ///< [out] vendor id from PCI configuration
-    uint32_t deviceId;                              ///< [out] device id from PCI configuration
-                                                    ///< Note, the device id uses little-endian format.
-    ze_device_property_flags_t flags;               ///< [out] 0 (none) or a valid combination of ::ze_device_property_flag_t
-    uint32_t subdeviceId;                           ///< [out] sub-device id. Only valid if ::ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE
-                                                    ///< is set.
-    uint32_t coreClockRate;                         ///< [out] Clock rate for device core.
-    uint64_t maxMemAllocSize;                       ///< [out] Maximum memory allocation size.
-    uint32_t maxHardwareContexts;                   ///< [out] Maximum number of logical hardware contexts.
-    uint32_t maxCommandQueuePriority;               ///< [out] Maximum priority for command queues. Higher value is higher
-                                                    ///< priority.
-    uint32_t numThreadsPerEU;                       ///< [out] Maximum number of threads per EU.
-    uint32_t physicalEUSimdWidth;                   ///< [out] The physical EU simd width.
-    uint32_t numEUsPerSubslice;                     ///< [out] Maximum number of EUs per sub-slice.
-    uint32_t numSubslicesPerSlice;                  ///< [out] Maximum number of sub-slices per slice.
-    uint32_t numSlices;                             ///< [out] Maximum number of slices.
-    uint64_t timerResolution;                       ///< [out] Returns the resolution of device timer used for profiling,
-                                                    ///< timestamps, etc. When stype==::ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES the
-                                                    ///< units are in nanoseconds. When
-                                                    ///< stype==::ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2 units are in
-                                                    ///< cycles/sec
-    uint32_t timestampValidBits;                    ///< [out] Returns the number of valid bits in the timestamp value.
-    uint32_t kernelTimestampValidBits;              ///< [out] Returns the number of valid bits in the kernel timestamp values
-    ze_device_uuid_t uuid;                          ///< [out] universal unique identifier. Note: Subdevices will have their
-                                                    ///< own uuid.
-    char name[ZE_MAX_DEVICE_NAME];                  ///< [out] Device name
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_type_t type;                                                  ///< [out] generic device type
+    uint32_t vendorId;                                                      ///< [out] vendor id from PCI configuration
+    uint32_t deviceId;                                                      ///< [out] device id from PCI configuration
+                                                                            ///< Note, the device id uses little-endian format.
+    ze_device_property_flags_t flags;                                       ///< [out] 0 (none) or a valid combination of ::ze_device_property_flag_t
+    uint32_t subdeviceId;                                                   ///< [out] sub-device id. Only valid if ::ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE
+                                                                            ///< is set.
+    uint32_t coreClockRate;                                                 ///< [out] Clock rate for device core.
+    uint64_t maxMemAllocSize;                                               ///< [out] Maximum memory allocation size.
+    uint32_t maxHardwareContexts;                                           ///< [out] Maximum number of logical hardware contexts.
+    uint32_t maxCommandQueuePriority;                                       ///< [out] Maximum priority for command queues. Higher value is higher
+                                                                            ///< priority.
+    uint32_t numThreadsPerEU;                                               ///< [out] Maximum number of threads per EU.
+    uint32_t physicalEUSimdWidth;                                           ///< [out] The physical EU simd width.
+    uint32_t numEUsPerSubslice;                                             ///< [out] Maximum number of EUs per sub-slice.
+    uint32_t numSubslicesPerSlice;                                          ///< [out] Maximum number of sub-slices per slice.
+    uint32_t numSlices;                                                     ///< [out] Maximum number of slices.
+    uint64_t timerResolution;                                               ///< [out] Returns the resolution of device timer used for profiling,
+                                                                            ///< timestamps, etc. When stype==::ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES the
+                                                                            ///< units are in nanoseconds. When
+                                                                            ///< stype==::ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES_1_2 units are in
+                                                                            ///< cycles/sec
+    uint32_t timestampValidBits;                                            ///< [out] Returns the number of valid bits in the timestamp value.
+    uint32_t kernelTimestampValidBits;                                      ///< [out] Returns the number of valid bits in the kernel timestamp values
+    ze_device_uuid_t uuid;                                                  ///< [out] universal unique identifier. Note: Subdevices will have their
+                                                                            ///< own uuid.
+    char name[ZE_MAX_DEVICE_NAME];                                          ///< [out] Device name
 
 } ze_device_properties_t;
 
@@ -1276,14 +1419,17 @@ typedef struct _ze_device_properties_t
 /// @brief Device thread identifier.
 typedef struct _ze_device_thread_t
 {
-    uint32_t slice;                                 ///< [in,out] the slice number.
-                                                    ///< Must be UINT32_MAX (all) or less than ::ze_device_properties_t.numSlices.
-    uint32_t subslice;                              ///< [in,out] the sub-slice number within its slice.
-                                                    ///< Must be UINT32_MAX (all) or less than ::ze_device_properties_t.numSubslicesPerSlice.
-    uint32_t eu;                                    ///< [in,out] the EU number within its sub-slice.
-                                                    ///< Must be UINT32_MAX (all) or less than ::ze_device_properties_t.numEUsPerSubslice.
-    uint32_t thread;                                ///< [in,out] the thread number within its EU.
-                                                    ///< Must be UINT32_MAX (all) or less than ::ze_device_properties_t.numThreadsPerEU.
+    uint32_t slice;                                                         ///< [in,out] the slice number.
+                                                                            ///< Must be `UINT32_MAX` (all) or less than the `numSlices` member of ::ze_device_properties_t.
+    uint32_t subslice;                                                      ///< [in,out] the sub-slice number within its slice.
+                                                                            ///< Must be `UINT32_MAX` (all) or less than the `numSubslicesPerSlice`
+                                                                            ///< member of ::ze_device_properties_t.
+    uint32_t eu;                                                            ///< [in,out] the EU number within its sub-slice.
+                                                                            ///< Must be `UINT32_MAX` (all) or less than the `numEUsPerSubslice` member
+                                                                            ///< of ::ze_device_properties_t.
+    uint32_t thread;                                                        ///< [in,out] the thread number within its EU.
+                                                                            ///< Must be `UINT32_MAX` (all) or less than the `numThreadsPerEU` member
+                                                                            ///< of ::ze_device_properties_t.
 
 } ze_device_thread_t;
 
@@ -1310,8 +1456,8 @@ typedef struct _ze_device_thread_t
 ///         + `nullptr == pDeviceProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_device_properties_t* pDeviceProperties       ///< [in,out] query result for device properties
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_device_properties_t* pDeviceProperties                               ///< [in,out] query result for device properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1324,21 +1470,21 @@ zeDeviceGetProperties(
 /// @brief Device compute properties queried using ::zeDeviceGetComputeProperties
 typedef struct _ze_device_compute_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t maxTotalGroupSize;                     ///< [out] Maximum items per compute group. (groupSizeX * groupSizeY *
-                                                    ///< groupSizeZ) <= maxTotalGroupSize
-    uint32_t maxGroupSizeX;                         ///< [out] Maximum items for X dimension in group
-    uint32_t maxGroupSizeY;                         ///< [out] Maximum items for Y dimension in group
-    uint32_t maxGroupSizeZ;                         ///< [out] Maximum items for Z dimension in group
-    uint32_t maxGroupCountX;                        ///< [out] Maximum groups that can be launched for x dimension
-    uint32_t maxGroupCountY;                        ///< [out] Maximum groups that can be launched for y dimension
-    uint32_t maxGroupCountZ;                        ///< [out] Maximum groups that can be launched for z dimension
-    uint32_t maxSharedLocalMemory;                  ///< [out] Maximum shared local memory per group.
-    uint32_t numSubGroupSizes;                      ///< [out] Number of subgroup sizes supported. This indicates number of
-                                                    ///< entries in subGroupSizes.
-    uint32_t subGroupSizes[ZE_SUBGROUPSIZE_COUNT];  ///< [out] Size group sizes supported.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t maxTotalGroupSize;                                             ///< [out] Maximum items per compute group. (groupSizeX * groupSizeY *
+                                                                            ///< groupSizeZ) <= maxTotalGroupSize
+    uint32_t maxGroupSizeX;                                                 ///< [out] Maximum items for X dimension in group
+    uint32_t maxGroupSizeY;                                                 ///< [out] Maximum items for Y dimension in group
+    uint32_t maxGroupSizeZ;                                                 ///< [out] Maximum items for Z dimension in group
+    uint32_t maxGroupCountX;                                                ///< [out] Maximum groups that can be launched for x dimension
+    uint32_t maxGroupCountY;                                                ///< [out] Maximum groups that can be launched for y dimension
+    uint32_t maxGroupCountZ;                                                ///< [out] Maximum groups that can be launched for z dimension
+    uint32_t maxSharedLocalMemory;                                          ///< [out] Maximum shared local memory per group.
+    uint32_t numSubGroupSizes;                                              ///< [out] Number of subgroup sizes supported. This indicates number of
+                                                                            ///< entries in subGroupSizes.
+    uint32_t subGroupSizes[ZE_SUBGROUPSIZE_COUNT];                          ///< [out] Size group sizes supported.
 
 } ze_device_compute_properties_t;
 
@@ -1365,8 +1511,8 @@ typedef struct _ze_device_compute_properties_t
 ///         + `nullptr == pComputeProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetComputeProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_device_compute_properties_t* pComputeProperties  ///< [in,out] query result for compute properties
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_device_compute_properties_t* pComputeProperties                      ///< [in,out] query result for compute properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1379,7 +1525,7 @@ zeDeviceGetComputeProperties(
 /// @brief Native kernel universal unique id (UUID)
 typedef struct _ze_native_kernel_uuid_t
 {
-    uint8_t id[ZE_MAX_NATIVE_KERNEL_UUID_SIZE];     ///< [out] opaque data representing a native kernel UUID
+    uint8_t id[ZE_MAX_NATIVE_KERNEL_UUID_SIZE];                             ///< [out] opaque data representing a native kernel UUID
 
 } ze_native_kernel_uuid_t;
 
@@ -1388,10 +1534,10 @@ typedef struct _ze_native_kernel_uuid_t
 typedef uint32_t ze_device_module_flags_t;
 typedef enum _ze_device_module_flag_t
 {
-    ZE_DEVICE_MODULE_FLAG_FP16 = ZE_BIT(0),         ///< Device supports 16-bit floating-point operations
-    ZE_DEVICE_MODULE_FLAG_FP64 = ZE_BIT(1),         ///< Device supports 64-bit floating-point operations
-    ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS = ZE_BIT(2),///< Device supports 64-bit atomic operations
-    ZE_DEVICE_MODULE_FLAG_DP4A = ZE_BIT(3),         ///< Device supports four component dot product and accumulate operations
+    ZE_DEVICE_MODULE_FLAG_FP16 = ZE_BIT(0),                                 ///< Device supports 16-bit floating-point operations
+    ZE_DEVICE_MODULE_FLAG_FP64 = ZE_BIT(1),                                 ///< Device supports 64-bit floating-point operations
+    ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS = ZE_BIT(2),                        ///< Device supports 64-bit atomic operations
+    ZE_DEVICE_MODULE_FLAG_DP4A = ZE_BIT(3),                                 ///< Device supports four component dot product and accumulate operations
     ZE_DEVICE_MODULE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_module_flag_t;
@@ -1401,15 +1547,15 @@ typedef enum _ze_device_module_flag_t
 typedef uint32_t ze_device_fp_flags_t;
 typedef enum _ze_device_fp_flag_t
 {
-    ZE_DEVICE_FP_FLAG_DENORM = ZE_BIT(0),           ///< Supports denorms
-    ZE_DEVICE_FP_FLAG_INF_NAN = ZE_BIT(1),          ///< Supports INF and quiet NaNs
-    ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST = ZE_BIT(2), ///< Supports rounding to nearest even rounding mode
-    ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO = ZE_BIT(3),    ///< Supports rounding to zero.
-    ZE_DEVICE_FP_FLAG_ROUND_TO_INF = ZE_BIT(4),     ///< Supports rounding to both positive and negative INF.
-    ZE_DEVICE_FP_FLAG_FMA = ZE_BIT(5),              ///< Supports IEEE754-2008 fused multiply-add.
-    ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT = ZE_BIT(6),  ///< Supports rounding as defined by IEEE754 for divide and sqrt
-                                                    ///< operations.
-    ZE_DEVICE_FP_FLAG_SOFT_FLOAT = ZE_BIT(7),       ///< Uses software implementation for basic floating-point operations.
+    ZE_DEVICE_FP_FLAG_DENORM = ZE_BIT(0),                                   ///< Supports denorms
+    ZE_DEVICE_FP_FLAG_INF_NAN = ZE_BIT(1),                                  ///< Supports INF and quiet NaNs
+    ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST = ZE_BIT(2),                         ///< Supports rounding to nearest even rounding mode
+    ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO = ZE_BIT(3),                            ///< Supports rounding to zero.
+    ZE_DEVICE_FP_FLAG_ROUND_TO_INF = ZE_BIT(4),                             ///< Supports rounding to both positive and negative INF.
+    ZE_DEVICE_FP_FLAG_FMA = ZE_BIT(5),                                      ///< Supports IEEE754-2008 fused multiply-add.
+    ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT = ZE_BIT(6),                      ///< Supports rounding as defined by IEEE754 for divide and sqrt
+                                                                            ///< operations.
+    ZE_DEVICE_FP_FLAG_SOFT_FLOAT = ZE_BIT(7),                               ///< Uses software implementation for basic floating-point operations.
     ZE_DEVICE_FP_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_fp_flag_t;
@@ -1418,30 +1564,30 @@ typedef enum _ze_device_fp_flag_t
 /// @brief Device module properties queried using ::zeDeviceGetModuleProperties
 typedef struct _ze_device_module_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t spirvVersionSupported;                 ///< [out] Maximum supported SPIR-V version.
-                                                    ///< Returns zero if SPIR-V is not supported.
-                                                    ///< Contains major and minor attributes, use ::ZE_MAJOR_VERSION and ::ZE_MINOR_VERSION.
-    ze_device_module_flags_t flags;                 ///< [out] 0 or a valid combination of ::ze_device_module_flag_t
-    ze_device_fp_flags_t fp16flags;                 ///< [out] Capabilities for half-precision floating-point operations.
-                                                    ///< returns 0 (if ::ZE_DEVICE_MODULE_FLAG_FP16 is not set) or a
-                                                    ///< combination of ::ze_device_fp_flag_t.
-    ze_device_fp_flags_t fp32flags;                 ///< [out] Capabilities for single-precision floating-point operations.
-                                                    ///< returns a combination of ::ze_device_fp_flag_t.
-    ze_device_fp_flags_t fp64flags;                 ///< [out] Capabilities for double-precision floating-point operations.
-                                                    ///< returns 0 (if ::ZE_DEVICE_MODULE_FLAG_FP64 is not set) or a
-                                                    ///< combination of ::ze_device_fp_flag_t.
-    uint32_t maxArgumentsSize;                      ///< [out] Maximum kernel argument size that is supported.
-    uint32_t printfBufferSize;                      ///< [out] Maximum size of internal buffer that holds output of printf
-                                                    ///< calls from kernel.
-    ze_native_kernel_uuid_t nativeKernelSupported;  ///< [out] Compatibility UUID of supported native kernel.
-                                                    ///< UUID may or may not be the same across driver release, devices, or
-                                                    ///< operating systems.
-                                                    ///< Application is responsible for ensuring UUID matches before creating
-                                                    ///< module using
-                                                    ///< previously created native kernel.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t spirvVersionSupported;                                         ///< [out] Maximum supported SPIR-V version.
+                                                                            ///< Returns zero if SPIR-V is not supported.
+                                                                            ///< Contains major and minor attributes, use ::ZE_MAJOR_VERSION and ::ZE_MINOR_VERSION.
+    ze_device_module_flags_t flags;                                         ///< [out] 0 or a valid combination of ::ze_device_module_flag_t
+    ze_device_fp_flags_t fp16flags;                                         ///< [out] Capabilities for half-precision floating-point operations.
+                                                                            ///< returns 0 (if ::ZE_DEVICE_MODULE_FLAG_FP16 is not set) or a
+                                                                            ///< combination of ::ze_device_fp_flag_t.
+    ze_device_fp_flags_t fp32flags;                                         ///< [out] Capabilities for single-precision floating-point operations.
+                                                                            ///< returns a combination of ::ze_device_fp_flag_t.
+    ze_device_fp_flags_t fp64flags;                                         ///< [out] Capabilities for double-precision floating-point operations.
+                                                                            ///< returns 0 (if ::ZE_DEVICE_MODULE_FLAG_FP64 is not set) or a
+                                                                            ///< combination of ::ze_device_fp_flag_t.
+    uint32_t maxArgumentsSize;                                              ///< [out] Maximum kernel argument size that is supported.
+    uint32_t printfBufferSize;                                              ///< [out] Maximum size of internal buffer that holds output of printf
+                                                                            ///< calls from kernel.
+    ze_native_kernel_uuid_t nativeKernelSupported;                          ///< [out] Compatibility UUID of supported native kernel.
+                                                                            ///< UUID may or may not be the same across driver release, devices, or
+                                                                            ///< operating systems.
+                                                                            ///< Application is responsible for ensuring UUID matches before creating
+                                                                            ///< module using
+                                                                            ///< previously created native kernel.
 
 } ze_device_module_properties_t;
 
@@ -1464,8 +1610,8 @@ typedef struct _ze_device_module_properties_t
 ///         + `nullptr == pModuleProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetModuleProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_device_module_properties_t* pModuleProperties///< [in,out] query result for module properties
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_device_module_properties_t* pModuleProperties                        ///< [in,out] query result for module properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1473,11 +1619,11 @@ zeDeviceGetModuleProperties(
 typedef uint32_t ze_command_queue_group_property_flags_t;
 typedef enum _ze_command_queue_group_property_flag_t
 {
-    ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE = ZE_BIT(0),   ///< Command queue group supports enqueing compute commands.
-    ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY = ZE_BIT(1),  ///< Command queue group supports enqueing copy commands.
+    ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE = ZE_BIT(0),               ///< Command queue group supports enqueing compute commands.
+    ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY = ZE_BIT(1),                  ///< Command queue group supports enqueing copy commands.
     ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COOPERATIVE_KERNELS = ZE_BIT(2),   ///< Command queue group supports cooperative kernels.
-                                                    ///< See ::zeCommandListAppendLaunchCooperativeKernel for more details.
-    ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_METRICS = ZE_BIT(3),   ///< Command queue groups supports metric queries.
+                                                                            ///< See ::zeCommandListAppendLaunchCooperativeKernel for more details.
+    ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_METRICS = ZE_BIT(3),               ///< Command queue groups supports metric queries.
     ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_command_queue_group_property_flag_t;
@@ -1487,14 +1633,14 @@ typedef enum _ze_command_queue_group_property_flag_t
 ///        ::zeDeviceGetCommandQueueGroupProperties
 typedef struct _ze_command_queue_group_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_command_queue_group_property_flags_t flags;  ///< [out] 0 (none) or a valid combination of
-                                                    ///< ::ze_command_queue_group_property_flag_t
-    size_t maxMemoryFillPatternSize;                ///< [out] maximum `pattern_size` supported by command queue group.
-                                                    ///< See ::zeCommandListAppendMemoryFill for more details.
-    uint32_t numQueues;                             ///< [out] the number of physical engines within the group.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_command_queue_group_property_flags_t flags;                          ///< [out] 0 (none) or a valid combination of
+                                                                            ///< ::ze_command_queue_group_property_flag_t
+    size_t maxMemoryFillPatternSize;                                        ///< [out] maximum `pattern_size` supported by command queue group.
+                                                                            ///< See ::zeCommandListAppendMemoryFill for more details.
+    uint32_t numQueues;                                                     ///< [out] the number of physical engines within the group.
 
 } ze_command_queue_group_properties_t;
 
@@ -1527,18 +1673,18 @@ typedef struct _ze_command_queue_group_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetCommandQueueGroupProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of command queue group properties.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of command queue group properties available.
-                                                    ///< if count is greater than the number of command queue group properties
-                                                    ///< available, then the driver shall update the value with the correct
-                                                    ///< number of command queue group properties available.
-    ze_command_queue_group_properties_t* pCommandQueueGroupProperties   ///< [in,out][optional][range(0, *pCount)] array of query results for
-                                                    ///< command queue group properties.
-                                                    ///< if count is less than the number of command queue group properties
-                                                    ///< available, then driver shall only retrieve that number of command
-                                                    ///< queue group properties.
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of command queue group properties.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of command queue group properties available.
+                                                                            ///< if count is greater than the number of command queue group properties
+                                                                            ///< available, then the driver shall update the value with the correct
+                                                                            ///< number of command queue group properties available.
+    ze_command_queue_group_properties_t* pCommandQueueGroupProperties       ///< [in,out][optional][range(0, *pCount)] array of query results for
+                                                                            ///< command queue group properties.
+                                                                            ///< if count is less than the number of command queue group properties
+                                                                            ///< available, then driver shall only retrieve that number of command
+                                                                            ///< queue group properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1546,7 +1692,7 @@ zeDeviceGetCommandQueueGroupProperties(
 typedef uint32_t ze_device_memory_property_flags_t;
 typedef enum _ze_device_memory_property_flag_t
 {
-    ZE_DEVICE_MEMORY_PROPERTY_FLAG_TBD = ZE_BIT(0), ///< reserved for future use
+    ZE_DEVICE_MEMORY_PROPERTY_FLAG_TBD = ZE_BIT(0),                         ///< reserved for future use
     ZE_DEVICE_MEMORY_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_memory_property_flag_t;
@@ -1556,15 +1702,15 @@ typedef enum _ze_device_memory_property_flag_t
 ///        ::zeDeviceGetMemoryProperties
 typedef struct _ze_device_memory_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_memory_property_flags_t flags;        ///< [out] 0 (none) or a valid combination of
-                                                    ///< ::ze_device_memory_property_flag_t
-    uint32_t maxClockRate;                          ///< [out] Maximum clock rate for device memory.
-    uint32_t maxBusWidth;                           ///< [out] Maximum bus width between device and memory.
-    uint64_t totalSize;                             ///< [out] Total memory size in bytes that is available to the device.
-    char name[ZE_MAX_DEVICE_NAME];                  ///< [out] Memory name
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_memory_property_flags_t flags;                                ///< [out] 0 (none) or a valid combination of
+                                                                            ///< ::ze_device_memory_property_flag_t
+    uint32_t maxClockRate;                                                  ///< [out] Maximum clock rate for device memory.
+    uint32_t maxBusWidth;                                                   ///< [out] Maximum bus width between device and memory.
+    uint64_t totalSize;                                                     ///< [out] Total memory size in bytes that is available to the device.
+    char name[ZE_MAX_DEVICE_NAME];                                          ///< [out] Memory name
 
 } ze_device_memory_properties_t;
 
@@ -1597,17 +1743,17 @@ typedef struct _ze_device_memory_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetMemoryProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of memory properties.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of memory properties available.
-                                                    ///< if count is greater than the number of memory properties available,
-                                                    ///< then the driver shall update the value with the correct number of
-                                                    ///< memory properties available.
-    ze_device_memory_properties_t* pMemProperties   ///< [in,out][optional][range(0, *pCount)] array of query results for
-                                                    ///< memory properties.
-                                                    ///< if count is less than the number of memory properties available, then
-                                                    ///< driver shall only retrieve that number of memory properties.
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of memory properties.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of memory properties available.
+                                                                            ///< if count is greater than the number of memory properties available,
+                                                                            ///< then the driver shall update the value with the correct number of
+                                                                            ///< memory properties available.
+    ze_device_memory_properties_t* pMemProperties                           ///< [in,out][optional][range(0, *pCount)] array of query results for
+                                                                            ///< memory properties.
+                                                                            ///< if count is less than the number of memory properties available, then
+                                                                            ///< driver shall only retrieve that number of memory properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1619,10 +1765,10 @@ zeDeviceGetMemoryProperties(
 typedef uint32_t ze_memory_access_cap_flags_t;
 typedef enum _ze_memory_access_cap_flag_t
 {
-    ZE_MEMORY_ACCESS_CAP_FLAG_RW = ZE_BIT(0),       ///< Supports load/store access
-    ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC = ZE_BIT(1),   ///< Supports atomic access
-    ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT = ZE_BIT(2),   ///< Supports concurrent access
-    ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC = ZE_BIT(3),///< Supports concurrent atomic access
+    ZE_MEMORY_ACCESS_CAP_FLAG_RW = ZE_BIT(0),                               ///< Supports load/store access
+    ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC = ZE_BIT(1),                           ///< Supports atomic access
+    ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT = ZE_BIT(2),                       ///< Supports concurrent access
+    ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC = ZE_BIT(3),                ///< Supports concurrent atomic access
     ZE_MEMORY_ACCESS_CAP_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_memory_access_cap_flag_t;
@@ -1632,19 +1778,19 @@ typedef enum _ze_memory_access_cap_flag_t
 ///        ::zeDeviceGetMemoryAccessProperties
 typedef struct _ze_device_memory_access_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_memory_access_cap_flags_t hostAllocCapabilities; ///< [out] host memory capabilities.
-                                                    ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
-    ze_memory_access_cap_flags_t deviceAllocCapabilities;   ///< [out] device memory capabilities.
-                                                    ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
-    ze_memory_access_cap_flags_t sharedSingleDeviceAllocCapabilities;   ///< [out] shared, single-device memory capabilities.
-                                                    ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
-    ze_memory_access_cap_flags_t sharedCrossDeviceAllocCapabilities;///< [out] shared, cross-device memory capabilities.
-                                                    ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
-    ze_memory_access_cap_flags_t sharedSystemAllocCapabilities; ///< [out] shared, system memory capabilities.
-                                                    ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_memory_access_cap_flags_t hostAllocCapabilities;                     ///< [out] host memory capabilities.
+                                                                            ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
+    ze_memory_access_cap_flags_t deviceAllocCapabilities;                   ///< [out] device memory capabilities.
+                                                                            ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
+    ze_memory_access_cap_flags_t sharedSingleDeviceAllocCapabilities;       ///< [out] shared, single-device memory capabilities.
+                                                                            ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
+    ze_memory_access_cap_flags_t sharedCrossDeviceAllocCapabilities;        ///< [out] shared, cross-device memory capabilities.
+                                                                            ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
+    ze_memory_access_cap_flags_t sharedSystemAllocCapabilities;             ///< [out] shared, system memory capabilities.
+                                                                            ///< returns 0 (unsupported) or a combination of ::ze_memory_access_cap_flag_t.
 
 } ze_device_memory_access_properties_t;
 
@@ -1671,8 +1817,8 @@ typedef struct _ze_device_memory_access_properties_t
 ///         + `nullptr == pMemAccessProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetMemoryAccessProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_device_memory_access_properties_t* pMemAccessProperties  ///< [in,out] query result for memory access properties
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_device_memory_access_properties_t* pMemAccessProperties              ///< [in,out] query result for memory access properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1680,7 +1826,7 @@ zeDeviceGetMemoryAccessProperties(
 typedef uint32_t ze_device_cache_property_flags_t;
 typedef enum _ze_device_cache_property_flag_t
 {
-    ZE_DEVICE_CACHE_PROPERTY_FLAG_USER_CONTROL = ZE_BIT(0), ///< Device support User Cache Control (i.e. SLM section vs Generic Cache)
+    ZE_DEVICE_CACHE_PROPERTY_FLAG_USER_CONTROL = ZE_BIT(0),                 ///< Device support User Cache Control (i.e. SLM section vs Generic Cache)
     ZE_DEVICE_CACHE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_cache_property_flag_t;
@@ -1689,12 +1835,12 @@ typedef enum _ze_device_cache_property_flag_t
 /// @brief Device cache properties queried using ::zeDeviceGetCacheProperties
 typedef struct _ze_device_cache_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_cache_property_flags_t flags;         ///< [out] 0 (none) or a valid combination of
-                                                    ///< ::ze_device_cache_property_flag_t
-    size_t cacheSize;                               ///< [out] Per-cache size, in bytes
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_cache_property_flags_t flags;                                 ///< [out] 0 (none) or a valid combination of
+                                                                            ///< ::ze_device_cache_property_flag_t
+    size_t cacheSize;                                                       ///< [out] Per-cache size, in bytes
 
 } ze_device_cache_properties_t;
 
@@ -1721,43 +1867,43 @@ typedef struct _ze_device_cache_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetCacheProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of cache properties.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of cache properties available.
-                                                    ///< if count is greater than the number of cache properties available,
-                                                    ///< then the driver shall update the value with the correct number of
-                                                    ///< cache properties available.
-    ze_device_cache_properties_t* pCacheProperties  ///< [in,out][optional][range(0, *pCount)] array of query results for cache properties.
-                                                    ///< if count is less than the number of cache properties available, then
-                                                    ///< driver shall only retrieve that number of cache properties.
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of cache properties.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of cache properties available.
+                                                                            ///< if count is greater than the number of cache properties available,
+                                                                            ///< then the driver shall update the value with the correct number of
+                                                                            ///< cache properties available.
+    ze_device_cache_properties_t* pCacheProperties                          ///< [in,out][optional][range(0, *pCount)] array of query results for cache properties.
+                                                                            ///< if count is less than the number of cache properties available, then
+                                                                            ///< driver shall only retrieve that number of cache properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Device image properties queried using ::zeDeviceGetImageProperties
 typedef struct _ze_device_image_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t maxImageDims1D;                        ///< [out] Maximum image dimensions for 1D resources. if 0, then 1D images
-                                                    ///< are unsupported.
-    uint32_t maxImageDims2D;                        ///< [out] Maximum image dimensions for 2D resources. if 0, then 2D images
-                                                    ///< are unsupported.
-    uint32_t maxImageDims3D;                        ///< [out] Maximum image dimensions for 3D resources. if 0, then 3D images
-                                                    ///< are unsupported.
-    uint64_t maxImageBufferSize;                    ///< [out] Maximum image buffer size in bytes. if 0, then buffer images are
-                                                    ///< unsupported.
-    uint32_t maxImageArraySlices;                   ///< [out] Maximum image array slices. if 0, then image arrays are
-                                                    ///< unsupported.
-    uint32_t maxSamplers;                           ///< [out] Max samplers that can be used in kernel. if 0, then sampling is
-                                                    ///< unsupported.
-    uint32_t maxReadImageArgs;                      ///< [out] Returns the maximum number of simultaneous image objects that
-                                                    ///< can be read from by a kernel. if 0, then reading images is
-                                                    ///< unsupported.
-    uint32_t maxWriteImageArgs;                     ///< [out] Returns the maximum number of simultaneous image objects that
-                                                    ///< can be written to by a kernel. if 0, then writing images is
-                                                    ///< unsupported.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t maxImageDims1D;                                                ///< [out] Maximum image dimensions for 1D resources. if 0, then 1D images
+                                                                            ///< are unsupported.
+    uint32_t maxImageDims2D;                                                ///< [out] Maximum image dimensions for 2D resources. if 0, then 2D images
+                                                                            ///< are unsupported.
+    uint32_t maxImageDims3D;                                                ///< [out] Maximum image dimensions for 3D resources. if 0, then 3D images
+                                                                            ///< are unsupported.
+    uint64_t maxImageBufferSize;                                            ///< [out] Maximum image buffer size in bytes. if 0, then buffer images are
+                                                                            ///< unsupported.
+    uint32_t maxImageArraySlices;                                           ///< [out] Maximum image array slices. if 0, then image arrays are
+                                                                            ///< unsupported.
+    uint32_t maxSamplers;                                                   ///< [out] Max samplers that can be used in kernel. if 0, then sampling is
+                                                                            ///< unsupported.
+    uint32_t maxReadImageArgs;                                              ///< [out] Returns the maximum number of simultaneous image objects that
+                                                                            ///< can be read from by a kernel. if 0, then reading images is
+                                                                            ///< unsupported.
+    uint32_t maxWriteImageArgs;                                             ///< [out] Returns the maximum number of simultaneous image objects that
+                                                                            ///< can be written to by a kernel. if 0, then writing images is
+                                                                            ///< unsupported.
 
 } ze_device_image_properties_t;
 
@@ -1781,21 +1927,21 @@ typedef struct _ze_device_image_properties_t
 ///         + `nullptr == pImageProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetImageProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_device_image_properties_t* pImageProperties  ///< [in,out] query result for image properties
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_device_image_properties_t* pImageProperties                          ///< [in,out] query result for image properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Device external memory import and export properties
 typedef struct _ze_device_external_memory_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_external_memory_type_flags_t memoryAllocationImportTypes;///< [out] Supported external memory import types for memory allocations.
-    ze_external_memory_type_flags_t memoryAllocationExportTypes;///< [out] Supported external memory export types for memory allocations.
-    ze_external_memory_type_flags_t imageImportTypes;   ///< [out] Supported external memory import types for images.
-    ze_external_memory_type_flags_t imageExportTypes;   ///< [out] Supported external memory export types for images.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_external_memory_type_flags_t memoryAllocationImportTypes;            ///< [out] Supported external memory import types for memory allocations.
+    ze_external_memory_type_flags_t memoryAllocationExportTypes;            ///< [out] Supported external memory export types for memory allocations.
+    ze_external_memory_type_flags_t imageImportTypes;                       ///< [out] Supported external memory import types for images.
+    ze_external_memory_type_flags_t imageExportTypes;                       ///< [out] Supported external memory export types for images.
 
 } ze_device_external_memory_properties_t;
 
@@ -1818,8 +1964,8 @@ typedef struct _ze_device_external_memory_properties_t
 ///         + `nullptr == pExternalMemoryProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetExternalMemoryProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_device_external_memory_properties_t* pExternalMemoryProperties   ///< [in,out] query result for external memory properties
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_device_external_memory_properties_t* pExternalMemoryProperties       ///< [in,out] query result for external memory properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1827,8 +1973,8 @@ zeDeviceGetExternalMemoryProperties(
 typedef uint32_t ze_device_p2p_property_flags_t;
 typedef enum _ze_device_p2p_property_flag_t
 {
-    ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS = ZE_BIT(0), ///< Device supports access between peer devices.
-    ZE_DEVICE_P2P_PROPERTY_FLAG_ATOMICS = ZE_BIT(1),///< Device supports atomics between peer devices.
+    ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS = ZE_BIT(0),                         ///< Device supports access between peer devices.
+    ZE_DEVICE_P2P_PROPERTY_FLAG_ATOMICS = ZE_BIT(1),                        ///< Device supports atomics between peer devices.
     ZE_DEVICE_P2P_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_p2p_property_flag_t;
@@ -1838,11 +1984,11 @@ typedef enum _ze_device_p2p_property_flag_t
 ///        ::zeDeviceGetP2PProperties
 typedef struct _ze_device_p2p_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_p2p_property_flags_t flags;           ///< [out] 0 (none) or a valid combination of
-                                                    ///< ::ze_device_p2p_property_flag_t
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_p2p_property_flags_t flags;                                   ///< [out] 0 (none) or a valid combination of
+                                                                            ///< ::ze_device_p2p_property_flag_t
 
 } ze_device_p2p_properties_t;
 
@@ -1867,9 +2013,9 @@ typedef struct _ze_device_p2p_properties_t
 ///         + `nullptr == pP2PProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetP2PProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device performing the access
-    ze_device_handle_t hPeerDevice,                 ///< [in] handle of the peer device with the allocation
-    ze_device_p2p_properties_t* pP2PProperties      ///< [in,out] Peer-to-Peer properties between source and peer device
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device performing the access
+    ze_device_handle_t hPeerDevice,                                         ///< [in] handle of the peer device with the allocation
+    ze_device_p2p_properties_t* pP2PProperties                              ///< [in,out] Peer-to-Peer properties between source and peer device
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1906,9 +2052,9 @@ zeDeviceGetP2PProperties(
 ///         + `nullptr == value`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceCanAccessPeer(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device performing the access
-    ze_device_handle_t hPeerDevice,                 ///< [in] handle of the peer device with the allocation
-    ze_bool_t* value                                ///< [out] returned access capability
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device performing the access
+    ze_device_handle_t hPeerDevice,                                         ///< [in] handle of the peer device with the allocation
+    ze_bool_t* value                                                        ///< [out] returned access capability
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1935,7 +2081,7 @@ zeDeviceCanAccessPeer(
 ///         + Device is lost; must be reset for use.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetStatus(
-    ze_device_handle_t hDevice                      ///< [in] handle of the device
+    ze_device_handle_t hDevice                                              ///< [in] handle of the device
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1959,11 +2105,11 @@ zeDeviceGetStatus(
 ///         + `nullptr == deviceTimestamp`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetGlobalTimestamps(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    uint64_t* hostTimestamp,                        ///< [out] value of the Host's global timestamp that correlates with the
-                                                    ///< Device's global timestamp value
-    uint64_t* deviceTimestamp                       ///< [out] value of the Device's global timestamp that correlates with the
-                                                    ///< Host's global timestamp value
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    uint64_t* hostTimestamp,                                                ///< [out] value of the Host's global timestamp that correlates with the
+                                                                            ///< Device's global timestamp value
+    uint64_t* deviceTimestamp                                               ///< [out] value of the Device's global timestamp that correlates with the
+                                                                            ///< Host's global timestamp value
     );
 
 #if !defined(__GNUC__)
@@ -1978,7 +2124,7 @@ zeDeviceGetGlobalTimestamps(
 typedef uint32_t ze_context_flags_t;
 typedef enum _ze_context_flag_t
 {
-    ZE_CONTEXT_FLAG_TBD = ZE_BIT(0),                ///< reserved for future use
+    ZE_CONTEXT_FLAG_TBD = ZE_BIT(0),                                        ///< reserved for future use
     ZE_CONTEXT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_context_flag_t;
@@ -1987,12 +2133,12 @@ typedef enum _ze_context_flag_t
 /// @brief Context descriptor
 typedef struct _ze_context_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_context_flags_t flags;                       ///< [in] creation flags.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_context_flag_t;
-                                                    ///< default behavior may use implicit driver-based heuristics.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_context_flags_t flags;                                               ///< [in] creation flags.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_context_flag_t;
+                                                                            ///< default behavior may use implicit driver-based heuristics.
 
 } ze_context_desc_t;
 
@@ -2020,9 +2166,9 @@ typedef struct _ze_context_desc_t
 ///         + `0x1 < desc->flags`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextCreate(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver object
-    const ze_context_desc_t* desc,                  ///< [in] pointer to context descriptor
-    ze_context_handle_t* phContext                  ///< [out] pointer to handle of context object created
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver object
+    const ze_context_desc_t* desc,                                          ///< [in] pointer to context descriptor
+    ze_context_handle_t* phContext                                          ///< [out] pointer to handle of context object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2051,19 +2197,19 @@ zeContextCreate(
 ///         + `(nullptr == phDevices) && (0 < numDevices)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextCreateEx(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver object
-    const ze_context_desc_t* desc,                  ///< [in] pointer to context descriptor
-    uint32_t numDevices,                            ///< [in][optional] number of device handles; must be 0 if `nullptr ==
-                                                    ///< phDevices`
-    ze_device_handle_t* phDevices,                  ///< [in][optional][range(0, numDevices)] array of device handles which
-                                                    ///< context has visibility.
-                                                    ///< if nullptr, then all devices and any sub-devices supported by the
-                                                    ///< driver instance are
-                                                    ///< visible to the context.
-                                                    ///< otherwise, the context only has visibility to the devices and any
-                                                    ///< sub-devices of the
-                                                    ///< devices in this array.
-    ze_context_handle_t* phContext                  ///< [out] pointer to handle of context object created
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver object
+    const ze_context_desc_t* desc,                                          ///< [in] pointer to context descriptor
+    uint32_t numDevices,                                                    ///< [in][optional] number of device handles; must be 0 if `nullptr ==
+                                                                            ///< phDevices`
+    ze_device_handle_t* phDevices,                                          ///< [in][optional][range(0, numDevices)] array of device handles which
+                                                                            ///< context has visibility.
+                                                                            ///< if nullptr, then all devices and any sub-devices supported by the
+                                                                            ///< driver instance are
+                                                                            ///< visible to the context.
+                                                                            ///< otherwise, the context only has visibility to the devices and any
+                                                                            ///< sub-devices of the
+                                                                            ///< devices in this array.
+    ze_context_handle_t* phContext                                          ///< [out] pointer to handle of context object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2089,7 +2235,7 @@ zeContextCreateEx(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextDestroy(
-    ze_context_handle_t hContext                    ///< [in][release] handle of context object to destroy
+    ze_context_handle_t hContext                                            ///< [in][release] handle of context object to destroy
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2114,7 +2260,7 @@ zeContextDestroy(
 ///         + Context is invalid; due to device lost or reset.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextGetStatus(
-    ze_context_handle_t hContext                    ///< [in] handle of context object
+    ze_context_handle_t hContext                                            ///< [in] handle of context object
     );
 
 #if !defined(__GNUC__)
@@ -2129,11 +2275,20 @@ zeContextGetStatus(
 typedef uint32_t ze_command_queue_flags_t;
 typedef enum _ze_command_queue_flag_t
 {
-    ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY = ZE_BIT(0),///< command queue should be optimized for submission to a single device engine.
-                                                    ///< driver **must** disable any implicit optimizations for distributing
-                                                    ///< work across multiple engines.
-                                                    ///< this flag should be used when applications want full control over
-                                                    ///< multi-engine submission and scheduling.
+    ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY = ZE_BIT(0),                        ///< command queue should be optimized for submission to a single device engine.
+                                                                            ///< driver **must** disable any implicit optimizations for distributing
+                                                                            ///< work across multiple engines.
+                                                                            ///< this flag should be used when applications want full control over
+                                                                            ///< multi-engine submission and scheduling.
+    ZE_COMMAND_QUEUE_FLAG_IN_ORDER = ZE_BIT(1),                             ///< To be used only when creating immediate command lists. Commands
+                                                                            ///< appended to the immediate command
+                                                                            ///< list are executed in-order, with driver implementation enforcing
+                                                                            ///< dependencies between them.
+                                                                            ///< Application is not required to have the signal event of a given
+                                                                            ///< command being the wait event of
+                                                                            ///< the next to define an in-order list, and application is allowed to
+                                                                            ///< pass signal and wait events
+                                                                            ///< to each appended command to implement more complex dependency graphs.
     ZE_COMMAND_QUEUE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_command_queue_flag_t;
@@ -2142,11 +2297,11 @@ typedef enum _ze_command_queue_flag_t
 /// @brief Supported command queue modes
 typedef enum _ze_command_queue_mode_t
 {
-    ZE_COMMAND_QUEUE_MODE_DEFAULT = 0,              ///< implicit default behavior; uses driver-based heuristics
-    ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS = 1,          ///< Device execution always completes immediately on execute;
-                                                    ///< Host thread is blocked using wait on implicit synchronization object
-    ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS = 2,         ///< Device execution is scheduled and will complete in future;
-                                                    ///< explicit synchronization object must be used to determine completeness
+    ZE_COMMAND_QUEUE_MODE_DEFAULT = 0,                                      ///< implicit default behavior; uses driver-based heuristics
+    ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS = 1,                                  ///< Device execution always completes immediately on execute;
+                                                                            ///< Host thread is blocked using wait on implicit synchronization object
+    ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS = 2,                                 ///< Device execution is scheduled and will complete in future;
+                                                                            ///< explicit synchronization object must be used to determine completeness
     ZE_COMMAND_QUEUE_MODE_FORCE_UINT32 = 0x7fffffff
 
 } ze_command_queue_mode_t;
@@ -2155,9 +2310,9 @@ typedef enum _ze_command_queue_mode_t
 /// @brief Supported command queue priorities
 typedef enum _ze_command_queue_priority_t
 {
-    ZE_COMMAND_QUEUE_PRIORITY_NORMAL = 0,           ///< [default] normal priority
-    ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW = 1,     ///< lower priority than normal
-    ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH = 2,    ///< higher priority than normal
+    ZE_COMMAND_QUEUE_PRIORITY_NORMAL = 0,                                   ///< [default] normal priority
+    ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW = 1,                             ///< lower priority than normal
+    ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH = 2,                            ///< higher priority than normal
     ZE_COMMAND_QUEUE_PRIORITY_FORCE_UINT32 = 0x7fffffff
 
 } ze_command_queue_priority_t;
@@ -2166,18 +2321,18 @@ typedef enum _ze_command_queue_priority_t
 /// @brief Command Queue descriptor
 typedef struct _ze_command_queue_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t ordinal;                               ///< [in] command queue group ordinal
-    uint32_t index;                                 ///< [in] command queue index within the group;
-                                                    ///< must be zero if ::ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY is not set
-    ze_command_queue_flags_t flags;                 ///< [in] usage flags.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_command_queue_flag_t;
-                                                    ///< default behavior may use implicit driver-based heuristics to balance
-                                                    ///< latency and throughput.
-    ze_command_queue_mode_t mode;                   ///< [in] operation mode
-    ze_command_queue_priority_t priority;           ///< [in] priority
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t ordinal;                                                       ///< [in] command queue group ordinal
+    uint32_t index;                                                         ///< [in] command queue index within the group;
+                                                                            ///< must be zero if ::ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY is not set
+    ze_command_queue_flags_t flags;                                         ///< [in] usage flags.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_command_queue_flag_t;
+                                                                            ///< default behavior may use implicit driver-based heuristics to balance
+                                                                            ///< latency and throughput.
+    ze_command_queue_mode_t mode;                                           ///< [in] operation mode
+    ze_command_queue_priority_t priority;                                   ///< [in] priority
 
 } ze_command_queue_desc_t;
 
@@ -2209,15 +2364,15 @@ typedef struct _ze_command_queue_desc_t
 ///         + `nullptr == desc`
 ///         + `nullptr == phCommandQueue`
 ///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
-///         + `0x1 < desc->flags`
+///         + `0x3 < desc->flags`
 ///         + `::ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS < desc->mode`
 ///         + `::ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH < desc->priority`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandQueueCreate(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object
-    const ze_command_queue_desc_t* desc,            ///< [in] pointer to command queue descriptor
-    ze_command_queue_handle_t* phCommandQueue       ///< [out] pointer to handle of command queue object created
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    const ze_command_queue_desc_t* desc,                                    ///< [in] pointer to command queue descriptor
+    ze_command_queue_handle_t* phCommandQueue                               ///< [out] pointer to handle of command queue object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2249,7 +2404,7 @@ zeCommandQueueCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandQueueDestroy(
-    ze_command_queue_handle_t hCommandQueue         ///< [in][release] handle of command queue object to destroy
+    ze_command_queue_handle_t hCommandQueue                                 ///< [in][release] handle of command queue object to destroy
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2269,6 +2424,8 @@ zeCommandQueueDestroy(
 ///     - The application must use a fence created using the same command queue.
 ///     - The application must ensure the command queue, command list and fence
 ///       were created on the same context.
+///     - The application must ensure the command lists being executed are not
+///       immediate command lists.
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
 /// 
@@ -2292,11 +2449,11 @@ zeCommandQueueDestroy(
 ///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandQueueExecuteCommandLists(
-    ze_command_queue_handle_t hCommandQueue,        ///< [in] handle of the command queue
-    uint32_t numCommandLists,                       ///< [in] number of command lists to execute
-    ze_command_list_handle_t* phCommandLists,       ///< [in][range(0, numCommandLists)] list of handles of the command lists
-                                                    ///< to execute
-    ze_fence_handle_t hFence                        ///< [in][optional] handle of the fence to signal on completion
+    ze_command_queue_handle_t hCommandQueue,                                ///< [in] handle of the command queue
+    uint32_t numCommandLists,                                               ///< [in] number of command lists to execute
+    ze_command_list_handle_t* phCommandLists,                               ///< [in][range(0, numCommandLists)] list of handles of the command lists
+                                                                            ///< to execute
+    ze_fence_handle_t hFence                                                ///< [in][optional] handle of the fence to signal on completion
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2318,14 +2475,14 @@ zeCommandQueueExecuteCommandLists(
 ///         + timeout expired
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandQueueSynchronize(
-    ze_command_queue_handle_t hCommandQueue,        ///< [in] handle of the command queue
-    uint64_t timeout                                ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to
-                                                    ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
-                                                    ///< if zero, then immediately returns the status of the command queue;
-                                                    ///< if UINT64_MAX, then function will not return until complete or device
-                                                    ///< is lost.
-                                                    ///< Due to external dependencies, timeout may be rounded to the closest
-                                                    ///< value allowed by the accuracy of those dependencies.
+    ze_command_queue_handle_t hCommandQueue,                                ///< [in] handle of the command queue
+    uint64_t timeout                                                        ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to
+                                                                            ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
+                                                                            ///< if zero, then immediately returns the status of the command queue;
+                                                                            ///< if `UINT64_MAX`, then function will not return until complete or
+                                                                            ///< device is lost.
+                                                                            ///< Due to external dependencies, timeout may be rounded to the closest
+                                                                            ///< value allowed by the accuracy of those dependencies.
     );
 
 #if !defined(__GNUC__)
@@ -2340,20 +2497,29 @@ zeCommandQueueSynchronize(
 typedef uint32_t ze_command_list_flags_t;
 typedef enum _ze_command_list_flag_t
 {
-    ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING = ZE_BIT(0),  ///< driver may reorder commands (e.g., kernels, copies) between barriers
-                                                    ///< and synchronization primitives.
-                                                    ///< using this flag may increase Host overhead of ::zeCommandListClose.
-                                                    ///< therefore, this flag should **not** be set for low-latency usage-models.
-    ZE_COMMAND_LIST_FLAG_MAXIMIZE_THROUGHPUT = ZE_BIT(1),   ///< driver may perform additional optimizations that increase execution
-                                                    ///< throughput. 
-                                                    ///< using this flag may increase Host overhead of ::zeCommandListClose and ::zeCommandQueueExecuteCommandLists.
-                                                    ///< therefore, this flag should **not** be set for low-latency usage-models.
-    ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY = ZE_BIT(2), ///< command list should be optimized for submission to a single command
-                                                    ///< queue and device engine.
-                                                    ///< driver **must** disable any implicit optimizations for distributing
-                                                    ///< work across multiple engines.
-                                                    ///< this flag should be used when applications want full control over
-                                                    ///< multi-engine submission and scheduling.
+    ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING = ZE_BIT(0),                      ///< driver may reorder commands (e.g., kernels, copies) between barriers
+                                                                            ///< and synchronization primitives.
+                                                                            ///< using this flag may increase Host overhead of ::zeCommandListClose.
+                                                                            ///< therefore, this flag should **not** be set for low-latency usage-models.
+    ZE_COMMAND_LIST_FLAG_MAXIMIZE_THROUGHPUT = ZE_BIT(1),                   ///< driver may perform additional optimizations that increase execution
+                                                                            ///< throughput. 
+                                                                            ///< using this flag may increase Host overhead of ::zeCommandListClose and ::zeCommandQueueExecuteCommandLists.
+                                                                            ///< therefore, this flag should **not** be set for low-latency usage-models.
+    ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY = ZE_BIT(2),                         ///< command list should be optimized for submission to a single command
+                                                                            ///< queue and device engine.
+                                                                            ///< driver **must** disable any implicit optimizations for distributing
+                                                                            ///< work across multiple engines.
+                                                                            ///< this flag should be used when applications want full control over
+                                                                            ///< multi-engine submission and scheduling.
+    ZE_COMMAND_LIST_FLAG_IN_ORDER = ZE_BIT(3),                              ///< commands appended to this command list are executed in-order, with
+                                                                            ///< driver implementation
+                                                                            ///< enforcing dependencies between them. Application is not required to
+                                                                            ///< have the signal event
+                                                                            ///< of a given command being the wait event of the next to define an
+                                                                            ///< in-order list, and
+                                                                            ///< application is allowed to pass signal and wait events to each appended
+                                                                            ///< command to implement
+                                                                            ///< more complex dependency graphs. Cannot be combined with ::ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING.
     ZE_COMMAND_LIST_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_command_list_flag_t;
@@ -2362,15 +2528,15 @@ typedef enum _ze_command_list_flag_t
 /// @brief Command List descriptor
 typedef struct _ze_command_list_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t commandQueueGroupOrdinal;              ///< [in] command queue group ordinal to which this command list will be
-                                                    ///< submitted
-    ze_command_list_flags_t flags;                  ///< [in] usage flags.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_command_list_flag_t;
-                                                    ///< default behavior may use implicit driver-based heuristics to balance
-                                                    ///< latency and throughput.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t commandQueueGroupOrdinal;                                      ///< [in] command queue group ordinal to which this command list will be
+                                                                            ///< submitted
+    ze_command_list_flags_t flags;                                          ///< [in] usage flags.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_command_list_flag_t;
+                                                                            ///< default behavior may use implicit driver-based heuristics to balance
+                                                                            ///< latency and throughput.
 
 } ze_command_list_desc_t;
 
@@ -2399,13 +2565,13 @@ typedef struct _ze_command_list_desc_t
 ///         + `nullptr == desc`
 ///         + `nullptr == phCommandList`
 ///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
-///         + `0x7 < desc->flags`
+///         + `0xf < desc->flags`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListCreate(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object
-    const ze_command_list_desc_t* desc,             ///< [in] pointer to command list descriptor
-    ze_command_list_handle_t* phCommandList         ///< [out] pointer to handle of command list object created
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    const ze_command_list_desc_t* desc,                                     ///< [in] pointer to command list descriptor
+    ze_command_list_handle_t* phCommandList                                 ///< [out] pointer to handle of command list object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2415,6 +2581,8 @@ zeCommandListCreate(
 ///     - An immediate command list is used for low-latency submission of
 ///       commands.
 ///     - An immediate command list creates an implicit command queue.
+///     - Immediate command lists must not be passed to
+///       ::zeCommandQueueExecuteCommandLists.
 ///     - Commands appended into an immediate command list may execute
 ///       synchronously, by blocking until the command is complete.
 ///     - The command list is created in the 'open' state and never needs to be
@@ -2437,15 +2605,15 @@ zeCommandListCreate(
 ///         + `nullptr == altdesc`
 ///         + `nullptr == phCommandList`
 ///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
-///         + `0x1 < altdesc->flags`
+///         + `0x3 < altdesc->flags`
 ///         + `::ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS < altdesc->mode`
 ///         + `::ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH < altdesc->priority`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListCreateImmediate(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object
-    const ze_command_queue_desc_t* altdesc,         ///< [in] pointer to command queue descriptor
-    ze_command_list_handle_t* phCommandList         ///< [out] pointer to handle of command list object created
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    const ze_command_queue_desc_t* altdesc,                                 ///< [in] pointer to command queue descriptor
+    ze_command_list_handle_t* phCommandList                                 ///< [out] pointer to handle of command list object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2471,7 +2639,7 @@ zeCommandListCreateImmediate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListDestroy(
-    ze_command_list_handle_t hCommandList           ///< [in][release] handle of command list object to destroy
+    ze_command_list_handle_t hCommandList                                   ///< [in][release] handle of command list object to destroy
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2492,7 +2660,7 @@ zeCommandListDestroy(
 ///         + `nullptr == hCommandList`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListClose(
-    ze_command_list_handle_t hCommandList           ///< [in] handle of command list object to close
+    ze_command_list_handle_t hCommandList                                   ///< [in] handle of command list object to close
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2516,7 +2684,7 @@ zeCommandListClose(
 ///         + `nullptr == hCommandList`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListReset(
-    ze_command_list_handle_t hCommandList           ///< [in] handle of command list object to reset
+    ze_command_list_handle_t hCommandList                                   ///< [in] handle of command list object to reset
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2526,10 +2694,10 @@ zeCommandListReset(
 /// @details
 ///     - The application must ensure the events are accessible by the device on
 ///       which the command list was created.
-///     - The timestamp frequency can be queried from
-///       ::ze_device_properties_t.timerResolution.
+///     - The timestamp frequency can be queried from the `timerResolution`
+///       member of ::ze_device_properties_t.
 ///     - The number of valid bits in the timestamp value can be queried from
-///       ::ze_device_properties_t.timestampValidBits.
+///       the `timestampValidBits` member of ::ze_device_properties_t.
 ///     - The application must ensure the memory pointed to by dstptr is
 ///       accessible by the device on which the command list was created.
 ///     - The application must ensure the command list and events were created,
@@ -2553,14 +2721,14 @@ zeCommandListReset(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendWriteGlobalTimestamp(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    uint64_t* dstptr,                               ///< [in,out] pointer to memory where timestamp value will be written; must
-                                                    ///< be 8byte-aligned.
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before executing query;
-                                                    ///< must be 0 if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before executing query
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    uint64_t* dstptr,                                                       ///< [in,out] pointer to memory where timestamp value will be written; must
+                                                                            ///< be 8byte-aligned.
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before executing query;
+                                                                            ///< must be 0 if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before executing query
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2591,14 +2759,14 @@ zeCommandListAppendWriteGlobalTimestamp(
 ///         + handle does not correspond to an immediate command list
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListHostSynchronize(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the immediate command list
-    uint64_t timeout                                ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to
-                                                    ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
-                                                    ///< if zero, then immediately returns the status of the immediate command list;
-                                                    ///< if UINT64_MAX, then function will not return until complete or device
-                                                    ///< is lost.
-                                                    ///< Due to external dependencies, timeout may be rounded to the closest
-                                                    ///< value allowed by the accuracy of those dependencies.
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the immediate command list
+    uint64_t timeout                                                        ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to
+                                                                            ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
+                                                                            ///< if zero, then immediately returns the status of the immediate command list;
+                                                                            ///< if `UINT64_MAX`, then function will not return until complete or
+                                                                            ///< device is lost.
+                                                                            ///< Due to external dependencies, timeout may be rounded to the closest
+                                                                            ///< value allowed by the accuracy of those dependencies.
     );
 
 #if !defined(__GNUC__)
@@ -2643,12 +2811,12 @@ zeCommandListHostSynchronize(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendBarrier(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before executing barrier;
-                                                    ///< must be 0 if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before executing barrier
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before executing barrier;
+                                                                            ///< must be 0 if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before executing barrier
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2683,15 +2851,15 @@ zeCommandListAppendBarrier(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendMemoryRangesBarrier(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    uint32_t numRanges,                             ///< [in] number of memory ranges
-    const size_t* pRangeSizes,                      ///< [in][range(0, numRanges)] array of sizes of memory range
-    const void** pRanges,                           ///< [in][range(0, numRanges)] array of memory ranges
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before executing barrier;
-                                                    ///< must be 0 if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before executing barrier
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    uint32_t numRanges,                                                     ///< [in] number of memory ranges
+    const size_t* pRangeSizes,                                              ///< [in][range(0, numRanges)] array of sizes of memory range
+    const void** pRanges,                                                   ///< [in][range(0, numRanges)] array of memory ranges
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before executing barrier;
+                                                                            ///< must be 0 if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before executing barrier
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2718,8 +2886,8 @@ zeCommandListAppendMemoryRangesBarrier(
 ///         + `nullptr == hDevice`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextSystemBarrier(
-    ze_context_handle_t hContext,                   ///< [in] handle of context object
-    ze_device_handle_t hDevice                      ///< [in] handle of the device
+    ze_context_handle_t hContext,                                           ///< [in] handle of context object
+    ze_device_handle_t hDevice                                              ///< [in] handle of the device
     );
 
 #if !defined(__GNUC__)
@@ -2769,15 +2937,15 @@ zeContextSystemBarrier(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendMemoryCopy(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    void* dstptr,                                   ///< [in] pointer to destination memory to copy to
-    const void* srcptr,                             ///< [in] pointer to source memory to copy from
-    size_t size,                                    ///< [in] size in bytes to copy
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    void* dstptr,                                                           ///< [in] pointer to destination memory to copy to
+    const void* srcptr,                                                     ///< [in] pointer to source memory to copy from
+    size_t size,                                                            ///< [in] size in bytes to copy
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2791,8 +2959,9 @@ zeCommandListAppendMemoryCopy(
 ///       execution.
 ///     - The value to initialize memory to is described by the pattern and the
 ///       pattern size.
-///     - The pattern size must be a power-of-two and less than or equal to
-///       ::ze_command_queue_group_properties_t.maxMemoryFillPatternSize.
+///     - The pattern size must be a power-of-two and less than or equal to the
+///       `maxMemoryFillPatternSize` member of
+///       ::ze_command_queue_group_properties_t.
 ///     - The application must ensure the events are accessible by the device on
 ///       which the command list was created.
 ///     - The application must ensure the command list and events were created,
@@ -2822,29 +2991,29 @@ zeCommandListAppendMemoryCopy(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendMemoryFill(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    void* ptr,                                      ///< [in] pointer to memory to initialize
-    const void* pattern,                            ///< [in] pointer to value to initialize memory to
-    size_t pattern_size,                            ///< [in] size in bytes of the value to initialize memory to
-    size_t size,                                    ///< [in] size in bytes to initialize
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    void* ptr,                                                              ///< [in] pointer to memory to initialize
+    const void* pattern,                                                    ///< [in] pointer to value to initialize memory to
+    size_t pattern_size,                                                    ///< [in] size in bytes of the value to initialize memory to
+    size_t size,                                                            ///< [in] size in bytes to initialize
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Copy region descriptor
 typedef struct _ze_copy_region_t
 {
-    uint32_t originX;                               ///< [in] The origin x offset for region in bytes
-    uint32_t originY;                               ///< [in] The origin y offset for region in rows
-    uint32_t originZ;                               ///< [in] The origin z offset for region in slices
-    uint32_t width;                                 ///< [in] The region width relative to origin in bytes
-    uint32_t height;                                ///< [in] The region height relative to origin in rows
-    uint32_t depth;                                 ///< [in] The region depth relative to origin in slices. Set this to 0 for
-                                                    ///< 2D copy.
+    uint32_t originX;                                                       ///< [in] The origin x offset for region in bytes
+    uint32_t originY;                                                       ///< [in] The origin y offset for region in rows
+    uint32_t originZ;                                                       ///< [in] The origin z offset for region in slices
+    uint32_t width;                                                         ///< [in] The region width relative to origin in bytes
+    uint32_t height;                                                        ///< [in] The region height relative to origin in rows
+    uint32_t depth;                                                         ///< [in] The region depth relative to origin in slices. Set this to 0 for
+                                                                            ///< 2D copy.
 
 } ze_copy_region_t;
 
@@ -2888,24 +3057,24 @@ typedef struct _ze_copy_region_t
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendMemoryCopyRegion(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    void* dstptr,                                   ///< [in] pointer to destination memory to copy to
-    const ze_copy_region_t* dstRegion,              ///< [in] pointer to destination region to copy to
-    uint32_t dstPitch,                              ///< [in] destination pitch in bytes
-    uint32_t dstSlicePitch,                         ///< [in] destination slice pitch in bytes. This is required for 3D region
-                                                    ///< copies where ::ze_copy_region_t.depth is not 0, otherwise it's
-                                                    ///< ignored.
-    const void* srcptr,                             ///< [in] pointer to source memory to copy from
-    const ze_copy_region_t* srcRegion,              ///< [in] pointer to source region to copy from
-    uint32_t srcPitch,                              ///< [in] source pitch in bytes
-    uint32_t srcSlicePitch,                         ///< [in] source slice pitch in bytes. This is required for 3D region
-                                                    ///< copies where ::ze_copy_region_t.depth is not 0, otherwise it's
-                                                    ///< ignored.
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    void* dstptr,                                                           ///< [in] pointer to destination memory to copy to
+    const ze_copy_region_t* dstRegion,                                      ///< [in] pointer to destination region to copy to
+    uint32_t dstPitch,                                                      ///< [in] destination pitch in bytes
+    uint32_t dstSlicePitch,                                                 ///< [in] destination slice pitch in bytes. This is required for 3D region
+                                                                            ///< copies where the `depth` member of ::ze_copy_region_t is not 0,
+                                                                            ///< otherwise it's ignored.
+    const void* srcptr,                                                     ///< [in] pointer to source memory to copy from
+    const ze_copy_region_t* srcRegion,                                      ///< [in] pointer to source region to copy from
+    uint32_t srcPitch,                                                      ///< [in] source pitch in bytes
+    uint32_t srcSlicePitch,                                                 ///< [in] source slice pitch in bytes. This is required for 3D region
+                                                                            ///< copies where the `depth` member of ::ze_copy_region_t is not 0,
+                                                                            ///< otherwise it's ignored.
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2943,16 +3112,16 @@ zeCommandListAppendMemoryCopyRegion(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendMemoryCopyFromContext(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    void* dstptr,                                   ///< [in] pointer to destination memory to copy to
-    ze_context_handle_t hContextSrc,                ///< [in] handle of source context object
-    const void* srcptr,                             ///< [in] pointer to source memory to copy from
-    size_t size,                                    ///< [in] size in bytes to copy
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    void* dstptr,                                                           ///< [in] pointer to destination memory to copy to
+    ze_context_handle_t hContextSrc,                                        ///< [in] handle of source context object
+    const void* srcptr,                                                     ///< [in] pointer to source memory to copy from
+    size_t size,                                                            ///< [in] size in bytes to copy
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2988,27 +3157,27 @@ zeCommandListAppendMemoryCopyFromContext(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendImageCopy(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    ze_image_handle_t hDstImage,                    ///< [in] handle of destination image to copy to
-    ze_image_handle_t hSrcImage,                    ///< [in] handle of source image to copy from
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    ze_image_handle_t hDstImage,                                            ///< [in] handle of destination image to copy to
+    ze_image_handle_t hSrcImage,                                            ///< [in] handle of source image to copy from
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Region descriptor
 typedef struct _ze_image_region_t
 {
-    uint32_t originX;                               ///< [in] The origin x offset for region in pixels
-    uint32_t originY;                               ///< [in] The origin y offset for region in pixels
-    uint32_t originZ;                               ///< [in] The origin z offset for region in pixels
-    uint32_t width;                                 ///< [in] The region width relative to origin in pixels
-    uint32_t height;                                ///< [in] The region height relative to origin in pixels
-    uint32_t depth;                                 ///< [in] The region depth relative to origin. For 1D or 2D images, set
-                                                    ///< this to 1.
+    uint32_t originX;                                                       ///< [in] The origin x offset for region in pixels
+    uint32_t originY;                                                       ///< [in] The origin y offset for region in pixels
+    uint32_t originZ;                                                       ///< [in] The origin z offset for region in pixels
+    uint32_t width;                                                         ///< [in] The region width relative to origin in pixels
+    uint32_t height;                                                        ///< [in] The region height relative to origin in pixels
+    uint32_t depth;                                                         ///< [in] The region depth relative to origin. For 1D or 2D images, set
+                                                                            ///< this to 1.
 
 } ze_image_region_t;
 
@@ -3045,16 +3214,16 @@ typedef struct _ze_image_region_t
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendImageCopyRegion(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    ze_image_handle_t hDstImage,                    ///< [in] handle of destination image to copy to
-    ze_image_handle_t hSrcImage,                    ///< [in] handle of source image to copy from
-    const ze_image_region_t* pDstRegion,            ///< [in][optional] destination region descriptor
-    const ze_image_region_t* pSrcRegion,            ///< [in][optional] source region descriptor
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    ze_image_handle_t hDstImage,                                            ///< [in] handle of destination image to copy to
+    ze_image_handle_t hSrcImage,                                            ///< [in] handle of source image to copy from
+    const ze_image_region_t* pDstRegion,                                    ///< [in][optional] destination region descriptor
+    const ze_image_region_t* pSrcRegion,                                    ///< [in][optional] source region descriptor
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3096,15 +3265,15 @@ zeCommandListAppendImageCopyRegion(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendImageCopyToMemory(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    void* dstptr,                                   ///< [in] pointer to destination memory to copy to
-    ze_image_handle_t hSrcImage,                    ///< [in] handle of source image to copy from
-    const ze_image_region_t* pSrcRegion,            ///< [in][optional] source region descriptor
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    void* dstptr,                                                           ///< [in] pointer to destination memory to copy to
+    ze_image_handle_t hSrcImage,                                            ///< [in] handle of source image to copy from
+    const ze_image_region_t* pSrcRegion,                                    ///< [in][optional] source region descriptor
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3146,15 +3315,15 @@ zeCommandListAppendImageCopyToMemory(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendImageCopyFromMemory(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    ze_image_handle_t hDstImage,                    ///< [in] handle of destination image to copy to
-    const void* srcptr,                             ///< [in] pointer to source memory to copy from
-    const ze_image_region_t* pDstRegion,            ///< [in][optional] destination region descriptor
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    ze_image_handle_t hDstImage,                                            ///< [in] handle of destination image to copy to
+    const void* srcptr,                                                     ///< [in] pointer to source memory to copy from
+    const ze_image_region_t* pDstRegion,                                    ///< [in][optional] destination region descriptor
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3198,23 +3367,26 @@ zeCommandListAppendImageCopyFromMemory(
 ///         + `nullptr == ptr`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendMemoryPrefetch(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    const void* ptr,                                ///< [in] pointer to start of the memory range to prefetch
-    size_t size                                     ///< [in] size in bytes of the memory range to prefetch
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    const void* ptr,                                                        ///< [in] pointer to start of the memory range to prefetch
+    size_t size                                                             ///< [in] size in bytes of the memory range to prefetch
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Supported memory advice hints
 typedef enum _ze_memory_advice_t
 {
-    ZE_MEMORY_ADVICE_SET_READ_MOSTLY = 0,           ///< hint that memory will be read from frequently and written to rarely
-    ZE_MEMORY_ADVICE_CLEAR_READ_MOSTLY = 1,         ///< removes the affect of ::ZE_MEMORY_ADVICE_SET_READ_MOSTLY
-    ZE_MEMORY_ADVICE_SET_PREFERRED_LOCATION = 2,    ///< hint that the preferred memory location is the specified device
-    ZE_MEMORY_ADVICE_CLEAR_PREFERRED_LOCATION = 3,  ///< removes the affect of ::ZE_MEMORY_ADVICE_SET_PREFERRED_LOCATION
-    ZE_MEMORY_ADVICE_SET_NON_ATOMIC_MOSTLY = 4,     ///< hints that memory will mostly be accessed non-atomically
-    ZE_MEMORY_ADVICE_CLEAR_NON_ATOMIC_MOSTLY = 5,   ///< removes the affect of ::ZE_MEMORY_ADVICE_SET_NON_ATOMIC_MOSTLY
-    ZE_MEMORY_ADVICE_BIAS_CACHED = 6,               ///< hints that memory should be cached
-    ZE_MEMORY_ADVICE_BIAS_UNCACHED = 7,             ///< hints that memory should be not be cached
+    ZE_MEMORY_ADVICE_SET_READ_MOSTLY = 0,                                   ///< hint that memory will be read from frequently and written to rarely
+    ZE_MEMORY_ADVICE_CLEAR_READ_MOSTLY = 1,                                 ///< removes the effect of ::ZE_MEMORY_ADVICE_SET_READ_MOSTLY
+    ZE_MEMORY_ADVICE_SET_PREFERRED_LOCATION = 2,                            ///< hint that the preferred memory location is the specified device
+    ZE_MEMORY_ADVICE_CLEAR_PREFERRED_LOCATION = 3,                          ///< removes the effect of ::ZE_MEMORY_ADVICE_SET_PREFERRED_LOCATION
+    ZE_MEMORY_ADVICE_SET_NON_ATOMIC_MOSTLY = 4,                             ///< hints that memory will mostly be accessed non-atomically
+    ZE_MEMORY_ADVICE_CLEAR_NON_ATOMIC_MOSTLY = 5,                           ///< removes the effect of ::ZE_MEMORY_ADVICE_SET_NON_ATOMIC_MOSTLY
+    ZE_MEMORY_ADVICE_BIAS_CACHED = 6,                                       ///< hints that memory should be cached
+    ZE_MEMORY_ADVICE_BIAS_UNCACHED = 7,                                     ///< hints that memory should not be cached
+    ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION = 8,              ///< hint that the preferred memory location is host memory
+    ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION = 9,            ///< removes the effect of
+                                                                            ///< ::ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION
     ZE_MEMORY_ADVICE_FORCE_UINT32 = 0x7fffffff
 
 } ze_memory_advice_t;
@@ -3255,14 +3427,14 @@ typedef enum _ze_memory_advice_t
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == ptr`
 ///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::ZE_MEMORY_ADVICE_BIAS_UNCACHED < advice`
+///         + `::ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION < advice`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendMemAdvise(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    ze_device_handle_t hDevice,                     ///< [in] device associated with the memory advice
-    const void* ptr,                                ///< [in] Pointer to the start of the memory range
-    size_t size,                                    ///< [in] Size in bytes of the memory range
-    ze_memory_advice_t advice                       ///< [in] Memory advice for the memory range
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    ze_device_handle_t hDevice,                                             ///< [in] device associated with the memory advice
+    const void* ptr,                                                        ///< [in] Pointer to the start of the memory range
+    size_t size,                                                            ///< [in] Size in bytes of the memory range
+    ze_memory_advice_t advice                                               ///< [in] Memory advice for the memory range
     );
 
 #if !defined(__GNUC__)
@@ -3277,12 +3449,12 @@ zeCommandListAppendMemAdvise(
 typedef uint32_t ze_event_pool_flags_t;
 typedef enum _ze_event_pool_flag_t
 {
-    ZE_EVENT_POOL_FLAG_HOST_VISIBLE = ZE_BIT(0),    ///< signals and waits are also visible to host
-    ZE_EVENT_POOL_FLAG_IPC = ZE_BIT(1),             ///< signals and waits may be shared across processes
-    ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP = ZE_BIT(2),///< Indicates all events in pool will contain kernel timestamps
-    ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP = ZE_BIT(3), ///< Indicates all events in pool will contain kernel timestamps
-                                                    ///< synchronized to host time domain; cannot be combined with
-                                                    ///< ::ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP
+    ZE_EVENT_POOL_FLAG_HOST_VISIBLE = ZE_BIT(0),                            ///< signals and waits are also visible to host
+    ZE_EVENT_POOL_FLAG_IPC = ZE_BIT(1),                                     ///< signals and waits may be shared across processes
+    ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP = ZE_BIT(2),                        ///< Indicates all events in pool will contain kernel timestamps
+    ZE_EVENT_POOL_FLAG_KERNEL_MAPPED_TIMESTAMP = ZE_BIT(3),                 ///< Indicates all events in pool will contain kernel timestamps
+                                                                            ///< synchronized to host time domain; cannot be combined with
+                                                                            ///< ::ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP
     ZE_EVENT_POOL_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_event_pool_flag_t;
@@ -3291,14 +3463,14 @@ typedef enum _ze_event_pool_flag_t
 /// @brief Event pool descriptor
 typedef struct _ze_event_pool_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_event_pool_flags_t flags;                    ///< [in] creation flags.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_event_pool_flag_t;
-                                                    ///< default behavior is signals and waits are visible to the entire device
-                                                    ///< and peer devices.
-    uint32_t count;                                 ///< [in] number of events within the pool; must be greater than 0
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_event_pool_flags_t flags;                                            ///< [in] creation flags.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_event_pool_flag_t;
+                                                                            ///< default behavior is signals and waits are visible to the entire device
+                                                                            ///< and peer devices.
+    uint32_t count;                                                         ///< [in] number of events within the pool; must be greater than 0
 
 } ze_event_pool_desc_t;
 
@@ -3329,15 +3501,15 @@ typedef struct _ze_event_pool_desc_t
 ///         + `(nullptr == phDevices) && (0 < numDevices)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventPoolCreate(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const ze_event_pool_desc_t* desc,               ///< [in] pointer to event pool descriptor
-    uint32_t numDevices,                            ///< [in][optional] number of device handles; must be 0 if `nullptr ==
-                                                    ///< phDevices`
-    ze_device_handle_t* phDevices,                  ///< [in][optional][range(0, numDevices)] array of device handles which
-                                                    ///< have visibility to the event pool.
-                                                    ///< if nullptr, then event pool is visible to all devices supported by the
-                                                    ///< driver instance.
-    ze_event_pool_handle_t* phEventPool             ///< [out] pointer handle of event pool object created
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const ze_event_pool_desc_t* desc,                                       ///< [in] pointer to event pool descriptor
+    uint32_t numDevices,                                                    ///< [in][optional] number of device handles; must be 0 if `nullptr ==
+                                                                            ///< phDevices`
+    ze_device_handle_t* phDevices,                                          ///< [in][optional][range(0, numDevices)] array of device handles which
+                                                                            ///< have visibility to the event pool.
+                                                                            ///< if nullptr, then event pool is visible to all devices supported by the
+                                                                            ///< driver instance.
+    ze_event_pool_handle_t* phEventPool                                     ///< [out] pointer handle of event pool object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3365,7 +3537,7 @@ zeEventPoolCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventPoolDestroy(
-    ze_event_pool_handle_t hEventPool               ///< [in][release] handle of event pool object to destroy
+    ze_event_pool_handle_t hEventPool                                       ///< [in][release] handle of event pool object to destroy
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3373,12 +3545,12 @@ zeEventPoolDestroy(
 typedef uint32_t ze_event_scope_flags_t;
 typedef enum _ze_event_scope_flag_t
 {
-    ZE_EVENT_SCOPE_FLAG_SUBDEVICE = ZE_BIT(0),      ///< cache hierarchies are flushed or invalidated sufficient for local
-                                                    ///< sub-device access
-    ZE_EVENT_SCOPE_FLAG_DEVICE = ZE_BIT(1),         ///< cache hierarchies are flushed or invalidated sufficient for global
-                                                    ///< device access and peer device access
-    ZE_EVENT_SCOPE_FLAG_HOST = ZE_BIT(2),           ///< cache hierarchies are flushed or invalidated sufficient for device and
-                                                    ///< host access
+    ZE_EVENT_SCOPE_FLAG_SUBDEVICE = ZE_BIT(0),                              ///< cache hierarchies are flushed or invalidated sufficient for local
+                                                                            ///< sub-device access
+    ZE_EVENT_SCOPE_FLAG_DEVICE = ZE_BIT(1),                                 ///< cache hierarchies are flushed or invalidated sufficient for global
+                                                                            ///< device access and peer device access
+    ZE_EVENT_SCOPE_FLAG_HOST = ZE_BIT(2),                                   ///< cache hierarchies are flushed or invalidated sufficient for device and
+                                                                            ///< host access
     ZE_EVENT_SCOPE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_event_scope_flag_t;
@@ -3387,21 +3559,21 @@ typedef enum _ze_event_scope_flag_t
 /// @brief Event descriptor
 typedef struct _ze_event_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t index;                                 ///< [in] index of the event within the pool; must be less than the count
-                                                    ///< specified during pool creation
-    ze_event_scope_flags_t signal;                  ///< [in] defines the scope of relevant cache hierarchies to flush on a
-                                                    ///< signal action before the event is triggered.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t;
-                                                    ///< default behavior is synchronization within the command list only, no
-                                                    ///< additional cache hierarchies are flushed.
-    ze_event_scope_flags_t wait;                    ///< [in] defines the scope of relevant cache hierarchies to invalidate on
-                                                    ///< a wait action after the event is complete.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t;
-                                                    ///< default behavior is synchronization within the command list only, no
-                                                    ///< additional cache hierarchies are invalidated.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t index;                                                         ///< [in] index of the event within the pool; must be less than the count
+                                                                            ///< specified during pool creation
+    ze_event_scope_flags_t signal;                                          ///< [in] defines the scope of relevant cache hierarchies to flush on a
+                                                                            ///< signal action before the event is triggered.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t;
+                                                                            ///< default behavior is synchronization within the command list only, no
+                                                                            ///< additional cache hierarchies are flushed.
+    ze_event_scope_flags_t wait;                                            ///< [in] defines the scope of relevant cache hierarchies to invalidate on
+                                                                            ///< a wait action after the event is complete.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_event_scope_flag_t;
+                                                                            ///< default behavior is synchronization within the command list only, no
+                                                                            ///< additional cache hierarchies are invalidated.
 
 } ze_event_desc_t;
 
@@ -3438,9 +3610,9 @@ typedef struct _ze_event_desc_t
 ///         + `0x7 < desc->wait`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventCreate(
-    ze_event_pool_handle_t hEventPool,              ///< [in] handle of the event pool
-    const ze_event_desc_t* desc,                    ///< [in] pointer to event descriptor
-    ze_event_handle_t* phEvent                      ///< [out] pointer to handle of event object created
+    ze_event_pool_handle_t hEventPool,                                      ///< [in] handle of the event pool
+    const ze_event_desc_t* desc,                                            ///< [in] pointer to event descriptor
+    ze_event_handle_t* phEvent                                              ///< [out] pointer to handle of event object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3471,7 +3643,7 @@ zeEventCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventDestroy(
-    ze_event_handle_t hEvent                        ///< [in][release] handle of event object to destroy
+    ze_event_handle_t hEvent                                                ///< [in][release] handle of event object to destroy
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3495,8 +3667,8 @@ zeEventDestroy(
 ///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventPoolGetIpcHandle(
-    ze_event_pool_handle_t hEventPool,              ///< [in] handle of event pool object
-    ze_ipc_event_pool_handle_t* phIpc               ///< [out] Returned IPC event handle
+    ze_event_pool_handle_t hEventPool,                                      ///< [in] handle of event pool object
+    ze_ipc_event_pool_handle_t* phIpc                                       ///< [out] Returned IPC event handle
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3525,9 +3697,9 @@ zeEventPoolGetIpcHandle(
 ///         + `nullptr == hContext`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventPoolPutIpcHandle(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object associated with the IPC event pool
-                                                    ///< handle
-    ze_ipc_event_pool_handle_t hIpc                 ///< [in] IPC event pool handle
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object associated with the IPC event pool
+                                                                            ///< handle
+    ze_ipc_event_pool_handle_t hIpc                                         ///< [in] IPC event pool handle
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3539,6 +3711,18 @@ zeEventPoolPutIpcHandle(
 ///       unique event pool handles.
 ///     - The event handle in this process should not be freed with
 ///       ::zeEventPoolDestroy, but rather with ::zeEventPoolCloseIpcHandle.
+///     - If the original event pool has been created for a device containing a
+///       number of sub-devices, then the event pool
+///       returned by this call may be used on a device containing the same
+///       number of sub-devices, or on any of
+///       those sub-devices.
+///     - However, if the original event pool has been created for a sub-device,
+///       then the event pool returned by this call
+///       cannot be used on a device containing any number of sub-devices, and
+///       must be used only in a sub-device. This ensures
+///       functional correctness for any implementation or optimizations the
+///       underlying Level Zero driver may do on
+///       event pools and events.
 ///     - The application may call this function from simultaneous threads.
 /// 
 /// @returns
@@ -3553,10 +3737,10 @@ zeEventPoolPutIpcHandle(
 ///         + `nullptr == phEventPool`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventPoolOpenIpcHandle(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object to associate with the IPC event pool
-                                                    ///< handle
-    ze_ipc_event_pool_handle_t hIpc,                ///< [in] IPC event pool handle
-    ze_event_pool_handle_t* phEventPool             ///< [out] pointer handle of event pool object created
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object to associate with the IPC event pool
+                                                                            ///< handle
+    ze_ipc_event_pool_handle_t hIpc,                                        ///< [in] IPC event pool handle
+    ze_event_pool_handle_t* phEventPool                                     ///< [out] pointer handle of event pool object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3578,7 +3762,7 @@ zeEventPoolOpenIpcHandle(
 ///         + `nullptr == hEventPool`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventPoolCloseIpcHandle(
-    ze_event_pool_handle_t hEventPool               ///< [in][release] handle of event pool object
+    ze_event_pool_handle_t hEventPool                                       ///< [in][release] handle of event pool object
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3615,8 +3799,8 @@ zeEventPoolCloseIpcHandle(
 ///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendSignalEvent(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    ze_event_handle_t hEvent                        ///< [in] handle of the event
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    ze_event_handle_t hEvent                                                ///< [in] handle of the event
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3644,10 +3828,10 @@ zeCommandListAppendSignalEvent(
 ///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendWaitOnEvents(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    uint32_t numEvents,                             ///< [in] number of events to wait on before continuing
-    ze_event_handle_t* phEvents                     ///< [in][range(0, numEvents)] handles of the events to wait on before
-                                                    ///< continuing
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    uint32_t numEvents,                                                     ///< [in] number of events to wait on before continuing
+    ze_event_handle_t* phEvents                                             ///< [in][range(0, numEvents)] handles of the events to wait on before
+                                                                            ///< continuing
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3677,7 +3861,7 @@ zeCommandListAppendWaitOnEvents(
 ///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventHostSignal(
-    ze_event_handle_t hEvent                        ///< [in] handle of the event
+    ze_event_handle_t hEvent                                                ///< [in] handle of the event
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3704,14 +3888,14 @@ zeEventHostSignal(
 ///         + timeout expired
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventHostSynchronize(
-    ze_event_handle_t hEvent,                       ///< [in] handle of the event
-    uint64_t timeout                                ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to
-                                                    ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
-                                                    ///< if zero, then operates exactly like ::zeEventQueryStatus;
-                                                    ///< if UINT64_MAX, then function will not return until complete or device
-                                                    ///< is lost.
-                                                    ///< Due to external dependencies, timeout may be rounded to the closest
-                                                    ///< value allowed by the accuracy of those dependencies.
+    ze_event_handle_t hEvent,                                               ///< [in] handle of the event
+    uint64_t timeout                                                        ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to
+                                                                            ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
+                                                                            ///< if zero, then operates exactly like ::zeEventQueryStatus;
+                                                                            ///< if `UINT64_MAX`, then function will not return until complete or
+                                                                            ///< device is lost.
+                                                                            ///< Due to external dependencies, timeout may be rounded to the closest
+                                                                            ///< value allowed by the accuracy of those dependencies.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3739,7 +3923,7 @@ zeEventHostSynchronize(
 ///         + not signaled
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventQueryStatus(
-    ze_event_handle_t hEvent                        ///< [in] handle of the event
+    ze_event_handle_t hEvent                                                ///< [in] handle of the event
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3771,8 +3955,8 @@ zeEventQueryStatus(
 ///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendEventReset(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    ze_event_handle_t hEvent                        ///< [in] handle of the event
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    ze_event_handle_t hEvent                                                ///< [in] handle of the event
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3797,21 +3981,21 @@ zeCommandListAppendEventReset(
 ///     - ::ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventHostReset(
-    ze_event_handle_t hEvent                        ///< [in] handle of the event
+    ze_event_handle_t hEvent                                                ///< [in] handle of the event
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Kernel timestamp clock data
 /// 
 /// @details
-///     - The timestamp frequency can be queried from
-///       ::ze_device_properties_t.timerResolution.
+///     - The timestamp frequency can be queried from the `timerResolution`
+///       member of ::ze_device_properties_t.
 ///     - The number of valid bits in the timestamp value can be queried from
-///       ::ze_device_properties_t.kernelTimestampValidBits.
+///       the `kernelTimestampValidBits` member of ::ze_device_properties_t.
 typedef struct _ze_kernel_timestamp_data_t
 {
-    uint64_t kernelStart;                           ///< [out] device clock at start of kernel execution
-    uint64_t kernelEnd;                             ///< [out] device clock at end of kernel execution
+    uint64_t kernelStart;                                                   ///< [out] device clock at start of kernel execution
+    uint64_t kernelEnd;                                                     ///< [out] device clock at end of kernel execution
 
 } ze_kernel_timestamp_data_t;
 
@@ -3819,9 +4003,9 @@ typedef struct _ze_kernel_timestamp_data_t
 /// @brief Kernel timestamp result
 typedef struct _ze_kernel_timestamp_result_t
 {
-    ze_kernel_timestamp_data_t global;              ///< [out] wall-clock data
-    ze_kernel_timestamp_data_t context;             ///< [out] context-active data; only includes clocks while device context
-                                                    ///< was actively executing.
+    ze_kernel_timestamp_data_t global;                                      ///< [out] wall-clock data
+    ze_kernel_timestamp_data_t context;                                     ///< [out] context-active data; only includes clocks while device context
+                                                                            ///< was actively executing.
 
 } ze_kernel_timestamp_result_t;
 
@@ -3852,8 +4036,8 @@ typedef struct _ze_kernel_timestamp_result_t
 ///         + not signaled
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventQueryKernelTimestamp(
-    ze_event_handle_t hEvent,                       ///< [in] handle of the event
-    ze_kernel_timestamp_result_t* dstptr            ///< [in,out] pointer to memory for where timestamp result will be written.
+    ze_event_handle_t hEvent,                                               ///< [in] handle of the event
+    ze_kernel_timestamp_result_t* dstptr                                    ///< [in,out] pointer to memory for where timestamp result will be written.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3893,19 +4077,19 @@ zeEventQueryKernelTimestamp(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendQueryKernelTimestamps(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    uint32_t numEvents,                             ///< [in] the number of timestamp events to query
-    ze_event_handle_t* phEvents,                    ///< [in][range(0, numEvents)] handles of timestamp events to query
-    void* dstptr,                                   ///< [in,out] pointer to memory where ::ze_kernel_timestamp_result_t will
-                                                    ///< be written; must be size-aligned.
-    const size_t* pOffsets,                         ///< [in][optional][range(0, numEvents)] offset, in bytes, to write
-                                                    ///< results; address must be 4byte-aligned and offsets must be
-                                                    ///< size-aligned.
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before executing query;
-                                                    ///< must be 0 if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before executing query
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    uint32_t numEvents,                                                     ///< [in] the number of timestamp events to query
+    ze_event_handle_t* phEvents,                                            ///< [in][range(0, numEvents)] handles of timestamp events to query
+    void* dstptr,                                                           ///< [in,out] pointer to memory where ::ze_kernel_timestamp_result_t will
+                                                                            ///< be written; must be size-aligned.
+    const size_t* pOffsets,                                                 ///< [in][optional][range(0, numEvents)] offset, in bytes, to write
+                                                                            ///< results; address must be 4byte-aligned and offsets must be
+                                                                            ///< size-aligned.
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before executing query;
+                                                                            ///< must be 0 if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before executing query
     );
 
 #if !defined(__GNUC__)
@@ -3920,7 +4104,7 @@ zeCommandListAppendQueryKernelTimestamps(
 typedef uint32_t ze_fence_flags_t;
 typedef enum _ze_fence_flag_t
 {
-    ZE_FENCE_FLAG_SIGNALED = ZE_BIT(0),             ///< fence is created in the signaled state, otherwise not signaled.
+    ZE_FENCE_FLAG_SIGNALED = ZE_BIT(0),                                     ///< fence is created in the signaled state, otherwise not signaled.
     ZE_FENCE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_fence_flag_t;
@@ -3929,11 +4113,11 @@ typedef enum _ze_fence_flag_t
 /// @brief Fence descriptor
 typedef struct _ze_fence_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_fence_flags_t flags;                         ///< [in] creation flags.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_fence_flag_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_fence_flags_t flags;                                                 ///< [in] creation flags.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_fence_flag_t.
 
 } ze_fence_desc_t;
 
@@ -3967,9 +4151,9 @@ typedef struct _ze_fence_desc_t
 ///         + `0x1 < desc->flags`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFenceCreate(
-    ze_command_queue_handle_t hCommandQueue,        ///< [in] handle of command queue
-    const ze_fence_desc_t* desc,                    ///< [in] pointer to fence descriptor
-    ze_fence_handle_t* phFence                      ///< [out] pointer to handle of fence object created
+    ze_command_queue_handle_t hCommandQueue,                                ///< [in] handle of command queue
+    const ze_fence_desc_t* desc,                                            ///< [in] pointer to fence descriptor
+    ze_fence_handle_t* phFence                                              ///< [out] pointer to handle of fence object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3999,7 +4183,7 @@ zeFenceCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFenceDestroy(
-    ze_fence_handle_t hFence                        ///< [in][release] handle of fence object to destroy
+    ze_fence_handle_t hFence                                                ///< [in][release] handle of fence object to destroy
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4026,14 +4210,14 @@ zeFenceDestroy(
 ///         + timeout expired
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFenceHostSynchronize(
-    ze_fence_handle_t hFence,                       ///< [in] handle of the fence
-    uint64_t timeout                                ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to
-                                                    ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
-                                                    ///< if zero, then operates exactly like ::zeFenceQueryStatus;
-                                                    ///< if UINT64_MAX, then function will not return until complete or device
-                                                    ///< is lost.
-                                                    ///< Due to external dependencies, timeout may be rounded to the closest
-                                                    ///< value allowed by the accuracy of those dependencies.
+    ze_fence_handle_t hFence,                                               ///< [in] handle of the fence
+    uint64_t timeout                                                        ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to
+                                                                            ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
+                                                                            ///< if zero, then operates exactly like ::zeFenceQueryStatus;
+                                                                            ///< if `UINT64_MAX`, then function will not return until complete or
+                                                                            ///< device is lost.
+                                                                            ///< Due to external dependencies, timeout may be rounded to the closest
+                                                                            ///< value allowed by the accuracy of those dependencies.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4060,7 +4244,7 @@ zeFenceHostSynchronize(
 ///         + not signaled
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFenceQueryStatus(
-    ze_fence_handle_t hFence                        ///< [in] handle of the fence
+    ze_fence_handle_t hFence                                                ///< [in] handle of the fence
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4084,7 +4268,7 @@ zeFenceQueryStatus(
 ///         + `nullptr == hFence`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFenceReset(
-    ze_fence_handle_t hFence                        ///< [in] handle of the fence
+    ze_fence_handle_t hFence                                                ///< [in] handle of the fence
     );
 
 #if !defined(__GNUC__)
@@ -4099,8 +4283,8 @@ zeFenceReset(
 typedef uint32_t ze_image_flags_t;
 typedef enum _ze_image_flag_t
 {
-    ZE_IMAGE_FLAG_KERNEL_WRITE = ZE_BIT(0),         ///< kernels will write contents
-    ZE_IMAGE_FLAG_BIAS_UNCACHED = ZE_BIT(1),        ///< device should not cache contents
+    ZE_IMAGE_FLAG_KERNEL_WRITE = ZE_BIT(0),                                 ///< kernels will write contents
+    ZE_IMAGE_FLAG_BIAS_UNCACHED = ZE_BIT(1),                                ///< device should not cache contents
     ZE_IMAGE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_flag_t;
@@ -4109,12 +4293,12 @@ typedef enum _ze_image_flag_t
 /// @brief Supported image types
 typedef enum _ze_image_type_t
 {
-    ZE_IMAGE_TYPE_1D = 0,                           ///< 1D
-    ZE_IMAGE_TYPE_1DARRAY = 1,                      ///< 1D array
-    ZE_IMAGE_TYPE_2D = 2,                           ///< 2D
-    ZE_IMAGE_TYPE_2DARRAY = 3,                      ///< 2D array
-    ZE_IMAGE_TYPE_3D = 4,                           ///< 3D
-    ZE_IMAGE_TYPE_BUFFER = 5,                       ///< Buffer
+    ZE_IMAGE_TYPE_1D = 0,                                                   ///< 1D
+    ZE_IMAGE_TYPE_1DARRAY = 1,                                              ///< 1D array
+    ZE_IMAGE_TYPE_2D = 2,                                                   ///< 2D
+    ZE_IMAGE_TYPE_2DARRAY = 3,                                              ///< 2D array
+    ZE_IMAGE_TYPE_3D = 4,                                                   ///< 3D
+    ZE_IMAGE_TYPE_BUFFER = 5,                                               ///< Buffer
     ZE_IMAGE_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_type_t;
@@ -4123,49 +4307,49 @@ typedef enum _ze_image_type_t
 /// @brief Supported image format layouts
 typedef enum _ze_image_format_layout_t
 {
-    ZE_IMAGE_FORMAT_LAYOUT_8 = 0,                   ///< 8-bit single component layout
-    ZE_IMAGE_FORMAT_LAYOUT_16 = 1,                  ///< 16-bit single component layout
-    ZE_IMAGE_FORMAT_LAYOUT_32 = 2,                  ///< 32-bit single component layout
-    ZE_IMAGE_FORMAT_LAYOUT_8_8 = 3,                 ///< 2-component 8-bit layout
-    ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 = 4,             ///< 4-component 8-bit layout
-    ZE_IMAGE_FORMAT_LAYOUT_16_16 = 5,               ///< 2-component 16-bit layout
-    ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 = 6,         ///< 4-component 16-bit layout
-    ZE_IMAGE_FORMAT_LAYOUT_32_32 = 7,               ///< 2-component 32-bit layout
-    ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 = 8,         ///< 4-component 32-bit layout
-    ZE_IMAGE_FORMAT_LAYOUT_10_10_10_2 = 9,          ///< 4-component 10_10_10_2 layout
-    ZE_IMAGE_FORMAT_LAYOUT_11_11_10 = 10,           ///< 3-component 11_11_10 layout
-    ZE_IMAGE_FORMAT_LAYOUT_5_6_5 = 11,              ///< 3-component 5_6_5 layout
-    ZE_IMAGE_FORMAT_LAYOUT_5_5_5_1 = 12,            ///< 4-component 5_5_5_1 layout
-    ZE_IMAGE_FORMAT_LAYOUT_4_4_4_4 = 13,            ///< 4-component 4_4_4_4 layout
-    ZE_IMAGE_FORMAT_LAYOUT_Y8 = 14,                 ///< Media Format: Y8. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_NV12 = 15,               ///< Media Format: NV12. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_YUYV = 16,               ///< Media Format: YUYV. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_VYUY = 17,               ///< Media Format: VYUY. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_YVYU = 18,               ///< Media Format: YVYU. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_UYVY = 19,               ///< Media Format: UYVY. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_AYUV = 20,               ///< Media Format: AYUV. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_P010 = 21,               ///< Media Format: P010. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_Y410 = 22,               ///< Media Format: Y410. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_P012 = 23,               ///< Media Format: P012. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_Y16 = 24,                ///< Media Format: Y16. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_P016 = 25,               ///< Media Format: P016. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_Y216 = 26,               ///< Media Format: Y216. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_P216 = 27,               ///< Media Format: P216. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_P8 = 28,                 ///< Media Format: P8. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_YUY2 = 29,               ///< Media Format: YUY2. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_A8P8 = 30,               ///< Media Format: A8P8. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_IA44 = 31,               ///< Media Format: IA44. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_AI44 = 32,               ///< Media Format: AI44. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_Y416 = 33,               ///< Media Format: Y416. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_Y210 = 34,               ///< Media Format: Y210. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_I420 = 35,               ///< Media Format: I420. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_YV12 = 36,               ///< Media Format: YV12. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_400P = 37,               ///< Media Format: 400P. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_422H = 38,               ///< Media Format: 422H. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_422V = 39,               ///< Media Format: 422V. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_444P = 40,               ///< Media Format: 444P. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_RGBP = 41,               ///< Media Format: RGBP. Format type and swizzle is ignored for this.
-    ZE_IMAGE_FORMAT_LAYOUT_BRGP = 42,               ///< Media Format: BRGP. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_8 = 0,                                           ///< 8-bit single component layout
+    ZE_IMAGE_FORMAT_LAYOUT_16 = 1,                                          ///< 16-bit single component layout
+    ZE_IMAGE_FORMAT_LAYOUT_32 = 2,                                          ///< 32-bit single component layout
+    ZE_IMAGE_FORMAT_LAYOUT_8_8 = 3,                                         ///< 2-component 8-bit layout
+    ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 = 4,                                     ///< 4-component 8-bit layout
+    ZE_IMAGE_FORMAT_LAYOUT_16_16 = 5,                                       ///< 2-component 16-bit layout
+    ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 = 6,                                 ///< 4-component 16-bit layout
+    ZE_IMAGE_FORMAT_LAYOUT_32_32 = 7,                                       ///< 2-component 32-bit layout
+    ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 = 8,                                 ///< 4-component 32-bit layout
+    ZE_IMAGE_FORMAT_LAYOUT_10_10_10_2 = 9,                                  ///< 4-component 10_10_10_2 layout
+    ZE_IMAGE_FORMAT_LAYOUT_11_11_10 = 10,                                   ///< 3-component 11_11_10 layout
+    ZE_IMAGE_FORMAT_LAYOUT_5_6_5 = 11,                                      ///< 3-component 5_6_5 layout
+    ZE_IMAGE_FORMAT_LAYOUT_5_5_5_1 = 12,                                    ///< 4-component 5_5_5_1 layout
+    ZE_IMAGE_FORMAT_LAYOUT_4_4_4_4 = 13,                                    ///< 4-component 4_4_4_4 layout
+    ZE_IMAGE_FORMAT_LAYOUT_Y8 = 14,                                         ///< Media Format: Y8. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_NV12 = 15,                                       ///< Media Format: NV12. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_YUYV = 16,                                       ///< Media Format: YUYV. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_VYUY = 17,                                       ///< Media Format: VYUY. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_YVYU = 18,                                       ///< Media Format: YVYU. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_UYVY = 19,                                       ///< Media Format: UYVY. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_AYUV = 20,                                       ///< Media Format: AYUV. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_P010 = 21,                                       ///< Media Format: P010. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_Y410 = 22,                                       ///< Media Format: Y410. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_P012 = 23,                                       ///< Media Format: P012. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_Y16 = 24,                                        ///< Media Format: Y16. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_P016 = 25,                                       ///< Media Format: P016. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_Y216 = 26,                                       ///< Media Format: Y216. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_P216 = 27,                                       ///< Media Format: P216. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_P8 = 28,                                         ///< Media Format: P8. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_YUY2 = 29,                                       ///< Media Format: YUY2. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_A8P8 = 30,                                       ///< Media Format: A8P8. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_IA44 = 31,                                       ///< Media Format: IA44. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_AI44 = 32,                                       ///< Media Format: AI44. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_Y416 = 33,                                       ///< Media Format: Y416. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_Y210 = 34,                                       ///< Media Format: Y210. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_I420 = 35,                                       ///< Media Format: I420. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_YV12 = 36,                                       ///< Media Format: YV12. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_400P = 37,                                       ///< Media Format: 400P. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_422H = 38,                                       ///< Media Format: 422H. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_422V = 39,                                       ///< Media Format: 422V. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_444P = 40,                                       ///< Media Format: 444P. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_RGBP = 41,                                       ///< Media Format: RGBP. Format type and swizzle is ignored for this.
+    ZE_IMAGE_FORMAT_LAYOUT_BRGP = 42,                                       ///< Media Format: BRGP. Format type and swizzle is ignored for this.
     ZE_IMAGE_FORMAT_LAYOUT_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_format_layout_t;
@@ -4174,11 +4358,11 @@ typedef enum _ze_image_format_layout_t
 /// @brief Supported image format types
 typedef enum _ze_image_format_type_t
 {
-    ZE_IMAGE_FORMAT_TYPE_UINT = 0,                  ///< Unsigned integer
-    ZE_IMAGE_FORMAT_TYPE_SINT = 1,                  ///< Signed integer
-    ZE_IMAGE_FORMAT_TYPE_UNORM = 2,                 ///< Unsigned normalized integer
-    ZE_IMAGE_FORMAT_TYPE_SNORM = 3,                 ///< Signed normalized integer
-    ZE_IMAGE_FORMAT_TYPE_FLOAT = 4,                 ///< Float
+    ZE_IMAGE_FORMAT_TYPE_UINT = 0,                                          ///< Unsigned integer
+    ZE_IMAGE_FORMAT_TYPE_SINT = 1,                                          ///< Signed integer
+    ZE_IMAGE_FORMAT_TYPE_UNORM = 2,                                         ///< Unsigned normalized integer
+    ZE_IMAGE_FORMAT_TYPE_SNORM = 3,                                         ///< Signed normalized integer
+    ZE_IMAGE_FORMAT_TYPE_FLOAT = 4,                                         ///< Float
     ZE_IMAGE_FORMAT_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_format_type_t;
@@ -4187,13 +4371,13 @@ typedef enum _ze_image_format_type_t
 /// @brief Supported image format component swizzle into channel
 typedef enum _ze_image_format_swizzle_t
 {
-    ZE_IMAGE_FORMAT_SWIZZLE_R = 0,                  ///< Red component
-    ZE_IMAGE_FORMAT_SWIZZLE_G = 1,                  ///< Green component
-    ZE_IMAGE_FORMAT_SWIZZLE_B = 2,                  ///< Blue component
-    ZE_IMAGE_FORMAT_SWIZZLE_A = 3,                  ///< Alpha component
-    ZE_IMAGE_FORMAT_SWIZZLE_0 = 4,                  ///< Zero
-    ZE_IMAGE_FORMAT_SWIZZLE_1 = 5,                  ///< One
-    ZE_IMAGE_FORMAT_SWIZZLE_X = 6,                  ///< Don't care
+    ZE_IMAGE_FORMAT_SWIZZLE_R = 0,                                          ///< Red component
+    ZE_IMAGE_FORMAT_SWIZZLE_G = 1,                                          ///< Green component
+    ZE_IMAGE_FORMAT_SWIZZLE_B = 2,                                          ///< Blue component
+    ZE_IMAGE_FORMAT_SWIZZLE_A = 3,                                          ///< Alpha component
+    ZE_IMAGE_FORMAT_SWIZZLE_0 = 4,                                          ///< Zero
+    ZE_IMAGE_FORMAT_SWIZZLE_1 = 5,                                          ///< One
+    ZE_IMAGE_FORMAT_SWIZZLE_X = 6,                                          ///< Don't care
     ZE_IMAGE_FORMAT_SWIZZLE_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_format_swizzle_t;
@@ -4202,13 +4386,13 @@ typedef enum _ze_image_format_swizzle_t
 /// @brief Image format 
 typedef struct _ze_image_format_t
 {
-    ze_image_format_layout_t layout;                ///< [in] image format component layout (e.g. N-component layouts and media
-                                                    ///< formats)
-    ze_image_format_type_t type;                    ///< [in] image format type
-    ze_image_format_swizzle_t x;                    ///< [in] image component swizzle into channel x
-    ze_image_format_swizzle_t y;                    ///< [in] image component swizzle into channel y
-    ze_image_format_swizzle_t z;                    ///< [in] image component swizzle into channel z
-    ze_image_format_swizzle_t w;                    ///< [in] image component swizzle into channel w
+    ze_image_format_layout_t layout;                                        ///< [in] image format component layout (e.g. N-component layouts and media
+                                                                            ///< formats)
+    ze_image_format_type_t type;                                            ///< [in] image format type
+    ze_image_format_swizzle_t x;                                            ///< [in] image component swizzle into channel x
+    ze_image_format_swizzle_t y;                                            ///< [in] image component swizzle into channel y
+    ze_image_format_swizzle_t z;                                            ///< [in] image component swizzle into channel z
+    ze_image_format_swizzle_t w;                                            ///< [in] image component swizzle into channel w
 
 } ze_image_format_t;
 
@@ -4216,39 +4400,39 @@ typedef struct _ze_image_format_t
 /// @brief Image descriptor
 typedef struct _ze_image_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_image_flags_t flags;                         ///< [in] creation flags.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_image_flag_t;
-                                                    ///< default is read-only, cached access.
-    ze_image_type_t type;                           ///< [in] image type. Media format layouts are unsupported for
-                                                    ///< ::ZE_IMAGE_TYPE_BUFFER
-    ze_image_format_t format;                       ///< [in] image format
-    uint64_t width;                                 ///< [in] width dimension.
-                                                    ///< ::ZE_IMAGE_TYPE_BUFFER: size in bytes; see
-                                                    ///< ::ze_device_image_properties_t.maxImageBufferSize for limits.
-                                                    ///< ::ZE_IMAGE_TYPE_1D, ::ZE_IMAGE_TYPE_1DARRAY: width in pixels; see
-                                                    ///< ::ze_device_image_properties_t.maxImageDims1D for limits.
-                                                    ///< ::ZE_IMAGE_TYPE_2D, ::ZE_IMAGE_TYPE_2DARRAY: width in pixels; see
-                                                    ///< ::ze_device_image_properties_t.maxImageDims2D for limits.
-                                                    ///< ::ZE_IMAGE_TYPE_3D: width in pixels; see
-                                                    ///< ::ze_device_image_properties_t.maxImageDims3D for limits.
-    uint32_t height;                                ///< [in] height dimension.
-                                                    ///< ::ZE_IMAGE_TYPE_2D, ::ZE_IMAGE_TYPE_2DARRAY: height in pixels; see
-                                                    ///< ::ze_device_image_properties_t.maxImageDims2D for limits.
-                                                    ///< ::ZE_IMAGE_TYPE_3D: height in pixels; see
-                                                    ///< ::ze_device_image_properties_t.maxImageDims3D for limits.
-                                                    ///< other: ignored.
-    uint32_t depth;                                 ///< [in] depth dimension.
-                                                    ///< ::ZE_IMAGE_TYPE_3D: depth in pixels; see
-                                                    ///< ::ze_device_image_properties_t.maxImageDims3D for limits.
-                                                    ///< other: ignored.
-    uint32_t arraylevels;                           ///< [in] array levels.
-                                                    ///< ::ZE_IMAGE_TYPE_1DARRAY, ::ZE_IMAGE_TYPE_2DARRAY: see
-                                                    ///< ::ze_device_image_properties_t.maxImageArraySlices for limits.
-                                                    ///< other: ignored.
-    uint32_t miplevels;                             ///< [in] mipmap levels (must be 0)
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_image_flags_t flags;                                                 ///< [in] creation flags.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_image_flag_t;
+                                                                            ///< default is read-only, cached access.
+    ze_image_type_t type;                                                   ///< [in] image type. Media format layouts are unsupported for
+                                                                            ///< ::ZE_IMAGE_TYPE_BUFFER
+    ze_image_format_t format;                                               ///< [in] image format
+    uint64_t width;                                                         ///< [in] width dimension.
+                                                                            ///< ::ZE_IMAGE_TYPE_BUFFER: size in bytes; see the `maxImageBufferSize`
+                                                                            ///< member of ::ze_device_image_properties_t for limits.
+                                                                            ///< ::ZE_IMAGE_TYPE_1D, ::ZE_IMAGE_TYPE_1DARRAY: width in pixels; see the
+                                                                            ///< `maxImageDims1D` member of ::ze_device_image_properties_t for limits.
+                                                                            ///< ::ZE_IMAGE_TYPE_2D, ::ZE_IMAGE_TYPE_2DARRAY: width in pixels; see the
+                                                                            ///< `maxImageDims2D` member of ::ze_device_image_properties_t for limits.
+                                                                            ///< ::ZE_IMAGE_TYPE_3D: width in pixels; see the `maxImageDims3D` member
+                                                                            ///< of ::ze_device_image_properties_t for limits.
+    uint32_t height;                                                        ///< [in] height dimension.
+                                                                            ///< ::ZE_IMAGE_TYPE_2D, ::ZE_IMAGE_TYPE_2DARRAY: height in pixels; see the
+                                                                            ///< `maxImageDims2D` member of ::ze_device_image_properties_t for limits.
+                                                                            ///< ::ZE_IMAGE_TYPE_3D: height in pixels; see the `maxImageDims3D` member
+                                                                            ///< of ::ze_device_image_properties_t for limits.
+                                                                            ///< other: ignored.
+    uint32_t depth;                                                         ///< [in] depth dimension.
+                                                                            ///< ::ZE_IMAGE_TYPE_3D: depth in pixels; see the `maxImageDims3D` member
+                                                                            ///< of ::ze_device_image_properties_t for limits.
+                                                                            ///< other: ignored.
+    uint32_t arraylevels;                                                   ///< [in] array levels.
+                                                                            ///< ::ZE_IMAGE_TYPE_1DARRAY, ::ZE_IMAGE_TYPE_2DARRAY: see the
+                                                                            ///< `maxImageArraySlices` member of ::ze_device_image_properties_t for limits.
+                                                                            ///< other: ignored.
+    uint32_t miplevels;                                                     ///< [in] mipmap levels (must be 0)
 
 } ze_image_desc_t;
 
@@ -4257,8 +4441,8 @@ typedef struct _ze_image_desc_t
 typedef uint32_t ze_image_sampler_filter_flags_t;
 typedef enum _ze_image_sampler_filter_flag_t
 {
-    ZE_IMAGE_SAMPLER_FILTER_FLAG_POINT = ZE_BIT(0), ///< device supports point filtering
-    ZE_IMAGE_SAMPLER_FILTER_FLAG_LINEAR = ZE_BIT(1),///< device supports linear filtering
+    ZE_IMAGE_SAMPLER_FILTER_FLAG_POINT = ZE_BIT(0),                         ///< device supports point filtering
+    ZE_IMAGE_SAMPLER_FILTER_FLAG_LINEAR = ZE_BIT(1),                        ///< device supports linear filtering
     ZE_IMAGE_SAMPLER_FILTER_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_sampler_filter_flag_t;
@@ -4267,11 +4451,11 @@ typedef enum _ze_image_sampler_filter_flag_t
 /// @brief Image properties
 typedef struct _ze_image_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_image_sampler_filter_flags_t samplerFilterFlags; ///< [out] supported sampler filtering.
-                                                    ///< returns 0 (unsupported) or a combination of ::ze_image_sampler_filter_flag_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_image_sampler_filter_flags_t samplerFilterFlags;                     ///< [out] supported sampler filtering.
+                                                                            ///< returns 0 (unsupported) or a combination of ::ze_image_sampler_filter_flag_t.
 
 } ze_image_properties_t;
 
@@ -4298,9 +4482,9 @@ typedef struct _ze_image_properties_t
 ///         + `::ZE_IMAGE_TYPE_BUFFER < desc->type`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeImageGetProperties(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    const ze_image_desc_t* desc,                    ///< [in] pointer to image descriptor
-    ze_image_properties_t* pImageProperties         ///< [out] pointer to image properties
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    const ze_image_desc_t* desc,                                            ///< [in] pointer to image descriptor
+    ze_image_properties_t* pImageProperties                                 ///< [out] pointer to image properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4334,10 +4518,10 @@ zeImageGetProperties(
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeImageCreate(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    const ze_image_desc_t* desc,                    ///< [in] pointer to image descriptor
-    ze_image_handle_t* phImage                      ///< [out] pointer to handle of image object created
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    const ze_image_desc_t* desc,                                            ///< [in] pointer to image descriptor
+    ze_image_handle_t* phImage                                              ///< [out] pointer to handle of image object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4363,7 +4547,7 @@ zeImageCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeImageDestroy(
-    ze_image_handle_t hImage                        ///< [in][release] handle of image object to destroy
+    ze_image_handle_t hImage                                                ///< [in][release] handle of image object to destroy
     );
 
 #if !defined(__GNUC__)
@@ -4378,9 +4562,9 @@ zeImageDestroy(
 typedef uint32_t ze_device_mem_alloc_flags_t;
 typedef enum _ze_device_mem_alloc_flag_t
 {
-    ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED = ZE_BIT(0),   ///< device should cache allocation
-    ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED = ZE_BIT(1), ///< device should not cache allocation (UC)
-    ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_INITIAL_PLACEMENT = ZE_BIT(2),///< optimize shared allocation for first access on the device
+    ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED = ZE_BIT(0),                       ///< device should cache allocation
+    ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED = ZE_BIT(1),                     ///< device should not cache allocation (UC)
+    ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_INITIAL_PLACEMENT = ZE_BIT(2),            ///< optimize shared allocation for first access on the device
     ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_mem_alloc_flag_t;
@@ -4389,14 +4573,14 @@ typedef enum _ze_device_mem_alloc_flag_t
 /// @brief Device memory allocation descriptor
 typedef struct _ze_device_mem_alloc_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_mem_alloc_flags_t flags;              ///< [in] flags specifying additional allocation controls.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_device_mem_alloc_flag_t;
-                                                    ///< default behavior may use implicit driver-based heuristics.
-    uint32_t ordinal;                               ///< [in] ordinal of the device's local memory to allocate from.
-                                                    ///< must be less than the count returned from ::zeDeviceGetMemoryProperties.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_mem_alloc_flags_t flags;                                      ///< [in] flags specifying additional allocation controls.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_device_mem_alloc_flag_t;
+                                                                            ///< default behavior may use implicit driver-based heuristics.
+    uint32_t ordinal;                                                       ///< [in] ordinal of the device's local memory to allocate from.
+                                                                            ///< must be less than the count returned from ::zeDeviceGetMemoryProperties.
 
 } ze_device_mem_alloc_desc_t;
 
@@ -4405,10 +4589,10 @@ typedef struct _ze_device_mem_alloc_desc_t
 typedef uint32_t ze_host_mem_alloc_flags_t;
 typedef enum _ze_host_mem_alloc_flag_t
 {
-    ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED = ZE_BIT(0), ///< host should cache allocation
-    ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED = ZE_BIT(1),   ///< host should not cache allocation (UC)
-    ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED = ZE_BIT(2), ///< host memory should be allocated write-combined (WC)
-    ZE_HOST_MEM_ALLOC_FLAG_BIAS_INITIAL_PLACEMENT = ZE_BIT(3),  ///< optimize shared allocation for first access on the host
+    ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED = ZE_BIT(0),                         ///< host should cache allocation
+    ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED = ZE_BIT(1),                       ///< host should not cache allocation (UC)
+    ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED = ZE_BIT(2),                 ///< host memory should be allocated write-combined (WC)
+    ZE_HOST_MEM_ALLOC_FLAG_BIAS_INITIAL_PLACEMENT = ZE_BIT(3),              ///< optimize shared allocation for first access on the host
     ZE_HOST_MEM_ALLOC_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_host_mem_alloc_flag_t;
@@ -4417,12 +4601,12 @@ typedef enum _ze_host_mem_alloc_flag_t
 /// @brief Host memory allocation descriptor
 typedef struct _ze_host_mem_alloc_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_host_mem_alloc_flags_t flags;                ///< [in] flags specifying additional allocation controls.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_host_mem_alloc_flag_t;
-                                                    ///< default behavior may use implicit driver-based heuristics.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_host_mem_alloc_flags_t flags;                                        ///< [in] flags specifying additional allocation controls.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_host_mem_alloc_flag_t;
+                                                                            ///< default behavior may use implicit driver-based heuristics.
 
 } ze_host_mem_alloc_desc_t;
 
@@ -4470,15 +4654,15 @@ typedef struct _ze_host_mem_alloc_desc_t
 ///         + `0 != (alignment & (alignment - 1))`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemAllocShared(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const ze_device_mem_alloc_desc_t* device_desc,  ///< [in] pointer to device memory allocation descriptor
-    const ze_host_mem_alloc_desc_t* host_desc,      ///< [in] pointer to host memory allocation descriptor
-    size_t size,                                    ///< [in] size in bytes to allocate; must be less than or equal to
-                                                    ///< ::ze_device_properties_t.maxMemAllocSize.
-    size_t alignment,                               ///< [in] minimum alignment in bytes for the allocation; must be a power of
-                                                    ///< two.
-    ze_device_handle_t hDevice,                     ///< [in][optional] device handle to associate with
-    void** pptr                                     ///< [out] pointer to shared allocation
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const ze_device_mem_alloc_desc_t* device_desc,                          ///< [in] pointer to device memory allocation descriptor
+    const ze_host_mem_alloc_desc_t* host_desc,                              ///< [in] pointer to host memory allocation descriptor
+    size_t size,                                                            ///< [in] size in bytes to allocate; must be less than or equal to the
+                                                                            ///< `maxMemAllocSize` member of ::ze_device_properties_t
+    size_t alignment,                                                       ///< [in] minimum alignment in bytes for the allocation; must be a power of
+                                                                            ///< two
+    ze_device_handle_t hDevice,                                             ///< [in][optional] device handle to associate with
+    void** pptr                                                             ///< [out] pointer to shared allocation
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4514,14 +4698,14 @@ zeMemAllocShared(
 ///         + `0 != (alignment & (alignment - 1))`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemAllocDevice(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const ze_device_mem_alloc_desc_t* device_desc,  ///< [in] pointer to device memory allocation descriptor
-    size_t size,                                    ///< [in] size in bytes to allocate; must be less than or equal to
-                                                    ///< ::ze_device_properties_t.maxMemAllocSize.
-    size_t alignment,                               ///< [in] minimum alignment in bytes for the allocation; must be a power of
-                                                    ///< two.
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    void** pptr                                     ///< [out] pointer to device allocation
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const ze_device_mem_alloc_desc_t* device_desc,                          ///< [in] pointer to device memory allocation descriptor
+    size_t size,                                                            ///< [in] size in bytes to allocate; must be less than or equal to the
+                                                                            ///< `maxMemAllocSize` member of ::ze_device_properties_t
+    size_t alignment,                                                       ///< [in] minimum alignment in bytes for the allocation; must be a power of
+                                                                            ///< two
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    void** pptr                                                             ///< [out] pointer to device allocation
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4558,13 +4742,13 @@ zeMemAllocDevice(
 ///         + `0 != (alignment & (alignment - 1))`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemAllocHost(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const ze_host_mem_alloc_desc_t* host_desc,      ///< [in] pointer to host memory allocation descriptor
-    size_t size,                                    ///< [in] size in bytes to allocate; must be less than or equal to
-                                                    ///< ::ze_device_properties_t.maxMemAllocSize.
-    size_t alignment,                               ///< [in] minimum alignment in bytes for the allocation; must be a power of
-                                                    ///< two.
-    void** pptr                                     ///< [out] pointer to host allocation
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const ze_host_mem_alloc_desc_t* host_desc,                              ///< [in] pointer to host memory allocation descriptor
+    size_t size,                                                            ///< [in] size in bytes to allocate; must be less than or equal to the
+                                                                            ///< `maxMemAllocSize` member of ::ze_device_properties_t
+    size_t alignment,                                                       ///< [in] minimum alignment in bytes for the allocation; must be a power of
+                                                                            ///< two
+    void** pptr                                                             ///< [out] pointer to host allocation
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4592,18 +4776,18 @@ zeMemAllocHost(
 ///         + `nullptr == ptr`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemFree(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    void* ptr                                       ///< [in][release] pointer to memory to free
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    void* ptr                                                               ///< [in][release] pointer to memory to free
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Memory allocation type
 typedef enum _ze_memory_type_t
 {
-    ZE_MEMORY_TYPE_UNKNOWN = 0,                     ///< the memory pointed to is of unknown type
-    ZE_MEMORY_TYPE_HOST = 1,                        ///< the memory pointed to is a host allocation
-    ZE_MEMORY_TYPE_DEVICE = 2,                      ///< the memory pointed to is a device allocation
-    ZE_MEMORY_TYPE_SHARED = 3,                      ///< the memory pointed to is a shared ownership allocation
+    ZE_MEMORY_TYPE_UNKNOWN = 0,                                             ///< the memory pointed to is of unknown type
+    ZE_MEMORY_TYPE_HOST = 1,                                                ///< the memory pointed to is a host allocation
+    ZE_MEMORY_TYPE_DEVICE = 2,                                              ///< the memory pointed to is a device allocation
+    ZE_MEMORY_TYPE_SHARED = 3,                                              ///< the memory pointed to is a shared ownership allocation
     ZE_MEMORY_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } ze_memory_type_t;
@@ -4612,12 +4796,12 @@ typedef enum _ze_memory_type_t
 /// @brief Memory allocation properties queried using ::zeMemGetAllocProperties
 typedef struct _ze_memory_allocation_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_memory_type_t type;                          ///< [out] type of allocated memory
-    uint64_t id;                                    ///< [out] identifier for this allocation
-    uint64_t pageSize;                              ///< [out] page size used for allocation
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_memory_type_t type;                                                  ///< [out] type of allocated memory
+    uint64_t id;                                                            ///< [out] identifier for this allocation
+    uint64_t pageSize;                                                      ///< [out] page size used for allocation
 
 } ze_memory_allocation_properties_t;
 
@@ -4645,10 +4829,10 @@ typedef struct _ze_memory_allocation_properties_t
 ///         + `nullptr == pMemAllocProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemGetAllocProperties(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr,                                ///< [in] memory pointer to query
-    ze_memory_allocation_properties_t* pMemAllocProperties, ///< [in,out] query result for memory allocation properties
-    ze_device_handle_t* phDevice                    ///< [out][optional] device associated with this allocation
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr,                                                        ///< [in] memory pointer to query
+    ze_memory_allocation_properties_t* pMemAllocProperties,                 ///< [in,out] query result for memory allocation properties
+    ze_device_handle_t* phDevice                                            ///< [out][optional] device associated with this allocation
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4669,10 +4853,10 @@ zeMemGetAllocProperties(
 ///         + `nullptr == ptr`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemGetAddressRange(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr,                                ///< [in] memory pointer to query
-    void** pBase,                                   ///< [in,out][optional] base address of the allocation
-    size_t* pSize                                   ///< [in,out][optional] size of the allocation
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr,                                                        ///< [in] memory pointer to query
+    void** pBase,                                                           ///< [in,out][optional] base address of the allocation
+    size_t* pSize                                                           ///< [in,out][optional] size of the allocation
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4700,9 +4884,9 @@ zeMemGetAddressRange(
 ///         + `nullptr == pIpcHandle`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemGetIpcHandle(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr,                                ///< [in] pointer to the device memory allocation
-    ze_ipc_mem_handle_t* pIpcHandle                 ///< [out] Returned IPC memory handle
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr,                                                        ///< [in] pointer to the device memory allocation
+    ze_ipc_mem_handle_t* pIpcHandle                                         ///< [out] Returned IPC memory handle
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4728,9 +4912,9 @@ zeMemGetIpcHandle(
 ///         + `nullptr == pIpcHandle`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemGetIpcHandleFromFileDescriptorExp(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    uint64_t handle,                                ///< [in] file descriptor
-    ze_ipc_mem_handle_t* pIpcHandle                 ///< [out] Returned IPC memory handle
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    uint64_t handle,                                                        ///< [in] file descriptor
+    ze_ipc_mem_handle_t* pIpcHandle                                         ///< [out] Returned IPC memory handle
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4754,9 +4938,9 @@ zeMemGetIpcHandleFromFileDescriptorExp(
 ///         + `nullptr == pHandle`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemGetFileDescriptorFromIpcHandleExp(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_ipc_mem_handle_t ipcHandle,                  ///< [in] IPC memory handle
-    uint64_t* pHandle                               ///< [out] Returned file descriptor
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_ipc_mem_handle_t ipcHandle,                                          ///< [in] IPC memory handle
+    uint64_t* pHandle                                                       ///< [out] Returned file descriptor
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4785,8 +4969,8 @@ zeMemGetFileDescriptorFromIpcHandleExp(
 ///         + `nullptr == hContext`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemPutIpcHandle(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_ipc_mem_handle_t handle                      ///< [in] IPC memory handle
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_ipc_mem_handle_t handle                                              ///< [in] IPC memory handle
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4794,8 +4978,8 @@ zeMemPutIpcHandle(
 typedef uint32_t ze_ipc_memory_flags_t;
 typedef enum _ze_ipc_memory_flag_t
 {
-    ZE_IPC_MEMORY_FLAG_BIAS_CACHED = ZE_BIT(0),     ///< device should cache allocation
-    ZE_IPC_MEMORY_FLAG_BIAS_UNCACHED = ZE_BIT(1),   ///< device should not cache allocation (UC)
+    ZE_IPC_MEMORY_FLAG_BIAS_CACHED = ZE_BIT(0),                             ///< device should cache allocation
+    ZE_IPC_MEMORY_FLAG_BIAS_UNCACHED = ZE_BIT(1),                           ///< device should not cache allocation (UC)
     ZE_IPC_MEMORY_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_ipc_memory_flag_t;
@@ -4829,12 +5013,12 @@ typedef enum _ze_ipc_memory_flag_t
 ///         + `nullptr == pptr`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemOpenIpcHandle(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device to associate with the IPC memory handle
-    ze_ipc_mem_handle_t handle,                     ///< [in] IPC memory handle
-    ze_ipc_memory_flags_t flags,                    ///< [in] flags controlling the operation.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_ipc_memory_flag_t.
-    void** pptr                                     ///< [out] pointer to device allocation in this process
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device to associate with the IPC memory handle
+    ze_ipc_mem_handle_t handle,                                             ///< [in] IPC memory handle
+    ze_ipc_memory_flags_t flags,                                            ///< [in] flags controlling the operation.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_ipc_memory_flag_t.
+    void** pptr                                                             ///< [out] pointer to device allocation in this process
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4859,8 +5043,8 @@ zeMemOpenIpcHandle(
 ///         + `nullptr == ptr`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemCloseIpcHandle(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr                                 ///< [in][release] pointer to device allocation in this process
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr                                                         ///< [in][release] pointer to device allocation in this process
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4875,11 +5059,11 @@ zeMemCloseIpcHandle(
 ///       member of ::ze_image_desc_t, to indicate an exportable image.
 typedef struct _ze_external_memory_export_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_external_memory_type_flags_t flags;          ///< [in] flags specifying memory export types for this allocation.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_external_memory_type_flags_t flags;                                  ///< [in] flags specifying memory export types for this allocation.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
 
 } ze_external_memory_export_desc_t;
 
@@ -4896,12 +5080,12 @@ typedef struct _ze_external_memory_export_desc_t
 ///       member of ::ze_image_desc_t, to import memory from a file descriptor.
 typedef struct _ze_external_memory_import_fd_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_external_memory_type_flags_t flags;          ///< [in] flags specifying the memory import type for the file descriptor.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
-    int fd;                                         ///< [in] the file descriptor handle to import
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_external_memory_type_flags_t flags;                                  ///< [in] flags specifying the memory import type for the file descriptor.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
+    int fd;                                                                 ///< [in] the file descriptor handle to import
 
 } ze_external_memory_import_fd_t;
 
@@ -4919,12 +5103,12 @@ typedef struct _ze_external_memory_import_fd_t
 ///       allocation was made.
 typedef struct _ze_external_memory_export_fd_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_external_memory_type_flags_t flags;          ///< [in] flags specifying the memory export type for the file descriptor.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
-    int fd;                                         ///< [out] the exported file descriptor handle representing the allocation.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_external_memory_type_flags_t flags;                                  ///< [in] flags specifying the memory export type for the file descriptor.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
+    int fd;                                                                 ///< [out] the exported file descriptor handle representing the allocation.
 
 } ze_external_memory_export_fd_t;
 
@@ -4945,13 +5129,13 @@ typedef struct _ze_external_memory_export_fd_t
 ///       member of ::ze_image_desc_t, to import memory from a Win32 handle.
 typedef struct _ze_external_memory_import_win32_handle_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_external_memory_type_flags_t flags;          ///< [in] flags specifying the memory import type for the Win32 handle.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
-    void* handle;                                   ///< [in][optional] the Win32 handle to import
-    const void* name;                               ///< [in][optional] name of a memory object to import
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_external_memory_type_flags_t flags;                                  ///< [in] flags specifying the memory import type for the Win32 handle.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
+    void* handle;                                                           ///< [in][optional] the Win32 handle to import
+    const void* name;                                                       ///< [in][optional] name of a memory object to import
 
 } ze_external_memory_import_win32_handle_t;
 
@@ -4969,15 +5153,114 @@ typedef struct _ze_external_memory_import_win32_handle_t
 ///       allocation was made.
 typedef struct _ze_external_memory_export_win32_handle_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_external_memory_type_flags_t flags;          ///< [in] flags specifying the memory export type for the Win32 handle.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
-    void* handle;                                   ///< [out] the exported Win32 handle representing the allocation.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_external_memory_type_flags_t flags;                                  ///< [in] flags specifying the memory export type for the Win32 handle.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_external_memory_type_flags_t
+    void* handle;                                                           ///< [out] the exported Win32 handle representing the allocation.
 
 } ze_external_memory_export_win32_handle_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief atomic access attribute flags
+typedef uint32_t ze_memory_atomic_attr_exp_flags_t;
+typedef enum _ze_memory_atomic_attr_exp_flag_t
+{
+    ZE_MEMORY_ATOMIC_ATTR_EXP_FLAG_NO_ATOMICS = ZE_BIT(0),                  ///< Atomics on the pointer are not allowed
+    ZE_MEMORY_ATOMIC_ATTR_EXP_FLAG_NO_HOST_ATOMICS = ZE_BIT(1),             ///< Host atomics on the pointer are not allowed
+    ZE_MEMORY_ATOMIC_ATTR_EXP_FLAG_HOST_ATOMICS = ZE_BIT(2),                ///< Host atomics on the pointer are allowed. Requires
+                                                                            ///< ::ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC returned by
+                                                                            ///< ::zeDeviceGetMemoryAccessProperties.
+    ZE_MEMORY_ATOMIC_ATTR_EXP_FLAG_NO_DEVICE_ATOMICS = ZE_BIT(3),           ///< Device atomics on the pointer are not allowed
+    ZE_MEMORY_ATOMIC_ATTR_EXP_FLAG_DEVICE_ATOMICS = ZE_BIT(4),              ///< Device atomics on the pointer are allowed. Requires
+                                                                            ///< ::ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC returned by
+                                                                            ///< ::zeDeviceGetMemoryAccessProperties.
+    ZE_MEMORY_ATOMIC_ATTR_EXP_FLAG_NO_SYSTEM_ATOMICS = ZE_BIT(5),           ///< Concurrent atomics on the pointer from both host and device are not
+                                                                            ///< allowed
+    ZE_MEMORY_ATOMIC_ATTR_EXP_FLAG_SYSTEM_ATOMICS = ZE_BIT(6),              ///< Concurrent atomics on the pointer from both host and device are
+                                                                            ///< allowed. Requires ::ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC
+                                                                            ///< returned by ::zeDeviceGetMemoryAccessProperties.
+    ZE_MEMORY_ATOMIC_ATTR_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_memory_atomic_attr_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Sets atomic access attributes for a shared allocation
+/// 
+/// @details
+///     - If the shared-allocation is owned by multiple devices (i.e. nullptr
+///       was passed to ::zeMemAllocShared when creating it), then hDevice may be
+///       passed to set the attributes in that specific device. If nullptr is
+///       passed in hDevice, then the atomic attributes are set in all devices
+///       associated with the allocation.
+///     - If the atomic access attribute select is not supported by the driver,
+///       ::ZE_RESULT_INVALID_ARGUMENT is returned.
+///     - The atomic access attribute may be only supported at a device-specific
+///       granularity, such as at a page boundary. In this case, the memory range
+///       may be expanded such that the start and end of the range satisfy granularity
+///       requirements.
+///     - When calling this function multiple times with different flags, only the
+///       attributes from last call are honored.
+///     - The application must not call this function for shared-allocations currently
+///       being used by the device.
+///     - The application must **not** call this function from simultaneous threads
+///       with the same pointer.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hContext`
+///         + `nullptr == hDevice`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == ptr`
+///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
+///         + `0x7f < attr`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeMemSetAtomicAccessAttributeExp(
+    ze_context_handle_t hContext,                                           ///< [in] handle of context
+    ze_device_handle_t hDevice,                                             ///< [in] device associated with the memory advice
+    const void* ptr,                                                        ///< [in] Pointer to the start of the memory range
+    size_t size,                                                            ///< [in] Size in bytes of the memory range
+    ze_memory_atomic_attr_exp_flags_t attr                                  ///< [in] Atomic access attributes to set for the specified range.
+                                                                            ///< Must be 0 (default) or a valid combination of ::ze_memory_atomic_attr_exp_flag_t.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Retrieves the atomic access attributes previously set for a shared
+///        allocation
+/// 
+/// @details
+///     - The application may call this function from simultaneous threads
+///       with the same pointer.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hContext`
+///         + `nullptr == hDevice`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == ptr`
+///         + `nullptr == pAttr`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeMemGetAtomicAccessAttributeExp(
+    ze_context_handle_t hContext,                                           ///< [in] handle of context
+    ze_device_handle_t hDevice,                                             ///< [in] device associated with the memory advice
+    const void* ptr,                                                        ///< [in] Pointer to the start of the memory range
+    size_t size,                                                            ///< [in] Size in bytes of the memory range
+    ze_memory_atomic_attr_exp_flags_t* pAttr                                ///< [out] Atomic access attributes for the specified range
+    );
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
@@ -4989,8 +5272,8 @@ typedef struct _ze_external_memory_export_win32_handle_t
 /// @brief Supported module creation input formats
 typedef enum _ze_module_format_t
 {
-    ZE_MODULE_FORMAT_IL_SPIRV = 0,                  ///< Format is SPIRV IL format
-    ZE_MODULE_FORMAT_NATIVE = 1,                    ///< Format is device native format
+    ZE_MODULE_FORMAT_IL_SPIRV = 0,                                          ///< Format is SPIRV IL format
+    ZE_MODULE_FORMAT_NATIVE = 1,                                            ///< Format is device native format
     ZE_MODULE_FORMAT_FORCE_UINT32 = 0x7fffffff
 
 } ze_module_format_t;
@@ -4999,11 +5282,11 @@ typedef enum _ze_module_format_t
 /// @brief Specialization constants - User defined constants
 typedef struct _ze_module_constants_t
 {
-    uint32_t numConstants;                          ///< [in] Number of specialization constants.
-    const uint32_t* pConstantIds;                   ///< [in][range(0, numConstants)] Array of IDs that is sized to
-                                                    ///< numConstants.
-    const void** pConstantValues;                   ///< [in][range(0, numConstants)] Array of pointers to values that is sized
-                                                    ///< to numConstants.
+    uint32_t numConstants;                                                  ///< [in] Number of specialization constants.
+    const uint32_t* pConstantIds;                                           ///< [in][range(0, numConstants)] Array of IDs that is sized to
+                                                                            ///< numConstants.
+    const void** pConstantValues;                                           ///< [in][range(0, numConstants)] Array of pointers to values that is sized
+                                                                            ///< to numConstants.
 
 } ze_module_constants_t;
 
@@ -5011,35 +5294,35 @@ typedef struct _ze_module_constants_t
 /// @brief Module descriptor
 typedef struct _ze_module_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_module_format_t format;                      ///< [in] Module format passed in with pInputModule
-    size_t inputSize;                               ///< [in] size of input IL or ISA from pInputModule.
-    const uint8_t* pInputModule;                    ///< [in] pointer to IL or ISA
-    const char* pBuildFlags;                        ///< [in][optional] string containing one or more (comma-separated)
-                                                    ///< compiler flags. If unsupported, flag is ignored with a warning.
-                                                    ///<  - "-ze-opt-disable"
-                                                    ///<       - Disable optimizations
-                                                    ///<  - "-ze-opt-level"
-                                                    ///<       - Specifies optimization level for compiler. Levels are
-                                                    ///< implementation specific.
-                                                    ///<           - 0 is no optimizations (equivalent to -ze-opt-disable)
-                                                    ///<           - 1 is optimize minimally (may be the same as 2)
-                                                    ///<           - 2 is optimize more (default)
-                                                    ///<  - "-ze-opt-greater-than-4GB-buffer-required"
-                                                    ///<       - Use 64-bit offset calculations for buffers.
-                                                    ///<  - "-ze-opt-large-register-file"
-                                                    ///<       - Increase number of registers available to threads.
-                                                    ///<  - "-ze-opt-has-buffer-offset-arg"
-                                                    ///<       - Extend stateless to stateful optimization to more
-                                                    ///<         cases with the use of additional offset (e.g. 64-bit
-                                                    ///<         pointer to binding table with 32-bit offset).
-                                                    ///<  - "-g"
-                                                    ///<       - Include debugging information.
-    const ze_module_constants_t* pConstants;        ///< [in][optional] pointer to specialization constants. Valid only for
-                                                    ///< SPIR-V input. This must be set to nullptr if no specialization
-                                                    ///< constants are provided.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_module_format_t format;                                              ///< [in] Module format passed in with pInputModule
+    size_t inputSize;                                                       ///< [in] size of input IL or ISA from pInputModule.
+    const uint8_t* pInputModule;                                            ///< [in] pointer to IL or ISA
+    const char* pBuildFlags;                                                ///< [in][optional] string containing one or more (comma-separated)
+                                                                            ///< compiler flags. If unsupported, flag is ignored with a warning.
+                                                                            ///<  - "-ze-opt-disable"
+                                                                            ///<       - Disable optimizations
+                                                                            ///<  - "-ze-opt-level"
+                                                                            ///<       - Specifies optimization level for compiler. Levels are
+                                                                            ///< implementation specific.
+                                                                            ///<           - 0 is no optimizations (equivalent to -ze-opt-disable)
+                                                                            ///<           - 1 is optimize minimally (may be the same as 2)
+                                                                            ///<           - 2 is optimize more (default)
+                                                                            ///<  - "-ze-opt-greater-than-4GB-buffer-required"
+                                                                            ///<       - Use 64-bit offset calculations for buffers.
+                                                                            ///<  - "-ze-opt-large-register-file"
+                                                                            ///<       - Increase number of registers available to threads.
+                                                                            ///<  - "-ze-opt-has-buffer-offset-arg"
+                                                                            ///<       - Extend stateless to stateful optimization to more
+                                                                            ///<         cases with the use of additional offset (e.g. 64-bit
+                                                                            ///<         pointer to binding table with 32-bit offset).
+                                                                            ///<  - "-g"
+                                                                            ///<       - Include debugging information.
+    const ze_module_constants_t* pConstants;                                ///< [in][optional] pointer to specialization constants. Valid only for
+                                                                            ///< SPIR-V input. This must be set to nullptr if no specialization
+                                                                            ///< constants are provided.
 
 } ze_module_desc_t;
 
@@ -5080,19 +5363,19 @@ typedef struct _ze_module_desc_t
 ///     - ::ZE_RESULT_ERROR_MODULE_BUILD_FAILURE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleCreate(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    const ze_module_desc_t* desc,                   ///< [in] pointer to module descriptor
-    ze_module_handle_t* phModule,                   ///< [out] pointer to handle of module object created
-    ze_module_build_log_handle_t* phBuildLog        ///< [out][optional] pointer to handle of module's build log.
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    const ze_module_desc_t* desc,                                           ///< [in] pointer to module descriptor
+    ze_module_handle_t* phModule,                                           ///< [out] pointer to handle of module object created
+    ze_module_build_log_handle_t* phBuildLog                                ///< [out][optional] pointer to handle of module's build log.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Destroys module
 /// 
 /// @details
-///     - The application must destroy all kernel and build log handles created
-///       from the module before destroying the module itself.
+///     - The application must destroy all kernel handles created from the
+///       module before destroying the module itself.
 ///     - The application must ensure the device is not currently referencing
 ///       the module before it is deleted.
 ///     - The implementation of this function may immediately free all Host and
@@ -5112,7 +5395,7 @@ zeModuleCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleDestroy(
-    ze_module_handle_t hModule                      ///< [in][release] handle of the module
+    ze_module_handle_t hModule                                              ///< [in][release] handle of the module
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5159,10 +5442,10 @@ zeModuleDestroy(
 ///     - ::ZE_RESULT_ERROR_MODULE_LINK_FAILURE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleDynamicLink(
-    uint32_t numModules,                            ///< [in] number of modules to be linked pointed to by phModules.
-    ze_module_handle_t* phModules,                  ///< [in][range(0, numModules)] pointer to an array of modules to
-                                                    ///< dynamically link together.
-    ze_module_build_log_handle_t* phLinkLog         ///< [out][optional] pointer to handle of dynamic link log.
+    uint32_t numModules,                                                    ///< [in] number of modules to be linked pointed to by phModules.
+    ze_module_handle_t* phModules,                                          ///< [in][range(0, numModules)] pointer to an array of modules to
+                                                                            ///< dynamically link together.
+    ze_module_build_log_handle_t* phLinkLog                                 ///< [out][optional] pointer to handle of dynamic link log.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5188,7 +5471,7 @@ zeModuleDynamicLink(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleBuildLogDestroy(
-    ze_module_build_log_handle_t hModuleBuildLog    ///< [in][release] handle of the module build log object.
+    ze_module_build_log_handle_t hModuleBuildLog                            ///< [in][release] handle of the module build log object.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5212,9 +5495,9 @@ zeModuleBuildLogDestroy(
 ///         + `nullptr == pSize`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleBuildLogGetString(
-    ze_module_build_log_handle_t hModuleBuildLog,   ///< [in] handle of the module build log object.
-    size_t* pSize,                                  ///< [in,out] size of build log string.
-    char* pBuildLog                                 ///< [in,out][optional] pointer to null-terminated string of the log.
+    ze_module_build_log_handle_t hModuleBuildLog,                           ///< [in] handle of the module build log object.
+    size_t* pSize,                                                          ///< [in,out] size of build log string.
+    char* pBuildLog                                                         ///< [in,out][optional] pointer to null-terminated string of the log.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5244,9 +5527,9 @@ zeModuleBuildLogGetString(
 ///         + `nullptr == pSize`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleGetNativeBinary(
-    ze_module_handle_t hModule,                     ///< [in] handle of the module
-    size_t* pSize,                                  ///< [in,out] size of native binary in bytes.
-    uint8_t* pModuleNativeBinary                    ///< [in,out][optional] byte pointer to native binary
+    ze_module_handle_t hModule,                                             ///< [in] handle of the module
+    size_t* pSize,                                                          ///< [in,out] size of native binary in bytes.
+    uint8_t* pModuleNativeBinary                                            ///< [in,out][optional] byte pointer to native binary
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5273,10 +5556,10 @@ zeModuleGetNativeBinary(
 ///     - ::ZE_RESULT_ERROR_INVALID_GLOBAL_NAME
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleGetGlobalPointer(
-    ze_module_handle_t hModule,                     ///< [in] handle of the module
-    const char* pGlobalName,                        ///< [in] name of global variable in module
-    size_t* pSize,                                  ///< [in,out][optional] size of global variable
-    void** pptr                                     ///< [in,out][optional] device visible pointer
+    ze_module_handle_t hModule,                                             ///< [in] handle of the module
+    const char* pGlobalName,                                                ///< [in] name of global variable in module
+    size_t* pSize,                                                          ///< [in,out][optional] size of global variable
+    void** pptr                                                             ///< [in,out][optional] device visible pointer
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5298,15 +5581,15 @@ zeModuleGetGlobalPointer(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleGetKernelNames(
-    ze_module_handle_t hModule,                     ///< [in] handle of the module
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of names.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of names available.
-                                                    ///< if count is greater than the number of names available, then the
-                                                    ///< driver shall update the value with the correct number of names available.
-    const char** pNames                             ///< [in,out][optional][range(0, *pCount)] array of names of functions.
-                                                    ///< if count is less than the number of names available, then driver shall
-                                                    ///< only retrieve that number of names.
+    ze_module_handle_t hModule,                                             ///< [in] handle of the module
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of names.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of names available.
+                                                                            ///< if count is greater than the number of names available, then the
+                                                                            ///< driver shall update the value with the correct number of names available.
+    const char** pNames                                                     ///< [in,out][optional][range(0, *pCount)] array of names of functions.
+                                                                            ///< if count is less than the number of names available, then driver shall
+                                                                            ///< only retrieve that number of names.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5314,8 +5597,8 @@ zeModuleGetKernelNames(
 typedef uint32_t ze_module_property_flags_t;
 typedef enum _ze_module_property_flag_t
 {
-    ZE_MODULE_PROPERTY_FLAG_IMPORTS = ZE_BIT(0),    ///< Module has imports (i.e. imported global variables and/or kernels).
-                                                    ///< See ::zeModuleDynamicLink.
+    ZE_MODULE_PROPERTY_FLAG_IMPORTS = ZE_BIT(0),                            ///< Module has imports (i.e. imported global variables and/or kernels).
+                                                                            ///< See ::zeModuleDynamicLink.
     ZE_MODULE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_module_property_flag_t;
@@ -5324,10 +5607,10 @@ typedef enum _ze_module_property_flag_t
 /// @brief Module properties
 typedef struct _ze_module_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_module_property_flags_t flags;               ///< [out] 0 (none) or a valid combination of ::ze_module_property_flag_t
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_module_property_flags_t flags;                                       ///< [out] 0 (none) or a valid combination of ::ze_module_property_flag_t
 
 } ze_module_properties_t;
 
@@ -5350,8 +5633,8 @@ typedef struct _ze_module_properties_t
 ///         + `nullptr == pModuleProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleGetProperties(
-    ze_module_handle_t hModule,                     ///< [in] handle of the module
-    ze_module_properties_t* pModuleProperties       ///< [in,out] query result for module properties.
+    ze_module_handle_t hModule,                                             ///< [in] handle of the module
+    ze_module_properties_t* pModuleProperties                               ///< [in,out] query result for module properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5359,9 +5642,9 @@ zeModuleGetProperties(
 typedef uint32_t ze_kernel_flags_t;
 typedef enum _ze_kernel_flag_t
 {
-    ZE_KERNEL_FLAG_FORCE_RESIDENCY = ZE_BIT(0),     ///< force all device allocations to be resident during execution
-    ZE_KERNEL_FLAG_EXPLICIT_RESIDENCY = ZE_BIT(1),  ///< application is responsible for all residency of device allocations.
-                                                    ///< driver may disable implicit residency management.
+    ZE_KERNEL_FLAG_FORCE_RESIDENCY = ZE_BIT(0),                             ///< force all device allocations to be resident during execution
+    ZE_KERNEL_FLAG_EXPLICIT_RESIDENCY = ZE_BIT(1),                          ///< application is responsible for all residency of device allocations.
+                                                                            ///< driver may disable implicit residency management.
     ZE_KERNEL_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_kernel_flag_t;
@@ -5370,13 +5653,13 @@ typedef enum _ze_kernel_flag_t
 /// @brief Kernel descriptor
 typedef struct _ze_kernel_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_kernel_flags_t flags;                        ///< [in] creation flags.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_kernel_flag_t;
-                                                    ///< default behavior may use driver-based residency.
-    const char* pKernelName;                        ///< [in] null-terminated name of kernel in module
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_kernel_flags_t flags;                                                ///< [in] creation flags.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_kernel_flag_t;
+                                                                            ///< default behavior may use driver-based residency.
+    const char* pKernelName;                                                ///< [in] null-terminated name of kernel in module
 
 } ze_kernel_desc_t;
 
@@ -5407,9 +5690,9 @@ typedef struct _ze_kernel_desc_t
 ///     - ::ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelCreate(
-    ze_module_handle_t hModule,                     ///< [in] handle of the module
-    const ze_kernel_desc_t* desc,                   ///< [in] pointer to kernel descriptor
-    ze_kernel_handle_t* phKernel                    ///< [out] handle of the Function object
+    ze_module_handle_t hModule,                                             ///< [in] handle of the module
+    const ze_kernel_desc_t* desc,                                           ///< [in] pointer to kernel descriptor
+    ze_kernel_handle_t* phKernel                                            ///< [out] handle of the Function object
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5435,7 +5718,7 @@ zeKernelCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelDestroy(
-    ze_kernel_handle_t hKernel                      ///< [in][release] handle of the kernel object
+    ze_kernel_handle_t hKernel                                              ///< [in][release] handle of the kernel object
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5464,9 +5747,9 @@ zeKernelDestroy(
 ///     - ::ZE_RESULT_ERROR_INVALID_FUNCTION_NAME
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleGetFunctionPointer(
-    ze_module_handle_t hModule,                     ///< [in] handle of the module
-    const char* pFunctionName,                      ///< [in] Name of function to retrieve function pointer for.
-    void** pfnFunction                              ///< [out] pointer to function.
+    ze_module_handle_t hModule,                                             ///< [in] handle of the module
+    const char* pFunctionName,                                              ///< [in] Name of function to retrieve function pointer for.
+    void** pfnFunction                                                      ///< [out] pointer to function.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5490,10 +5773,10 @@ zeModuleGetFunctionPointer(
 ///     - ::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelSetGroupSize(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    uint32_t groupSizeX,                            ///< [in] group size for X dimension to use for this kernel
-    uint32_t groupSizeY,                            ///< [in] group size for Y dimension to use for this kernel
-    uint32_t groupSizeZ                             ///< [in] group size for Z dimension to use for this kernel
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    uint32_t groupSizeX,                                                    ///< [in] group size for X dimension to use for this kernel
+    uint32_t groupSizeY,                                                    ///< [in] group size for Y dimension to use for this kernel
+    uint32_t groupSizeZ                                                     ///< [in] group size for Z dimension to use for this kernel
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5521,13 +5804,13 @@ zeKernelSetGroupSize(
 ///     - ::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelSuggestGroupSize(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    uint32_t globalSizeX,                           ///< [in] global width for X dimension
-    uint32_t globalSizeY,                           ///< [in] global width for Y dimension
-    uint32_t globalSizeZ,                           ///< [in] global width for Z dimension
-    uint32_t* groupSizeX,                           ///< [out] recommended size of group for X dimension
-    uint32_t* groupSizeY,                           ///< [out] recommended size of group for Y dimension
-    uint32_t* groupSizeZ                            ///< [out] recommended size of group for Z dimension
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    uint32_t globalSizeX,                                                   ///< [in] global width for X dimension
+    uint32_t globalSizeY,                                                   ///< [in] global width for Y dimension
+    uint32_t globalSizeZ,                                                   ///< [in] global width for Z dimension
+    uint32_t* groupSizeX,                                                   ///< [out] recommended size of group for X dimension
+    uint32_t* groupSizeY,                                                   ///< [out] recommended size of group for Y dimension
+    uint32_t* groupSizeZ                                                    ///< [out] recommended size of group for Z dimension
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5549,8 +5832,8 @@ zeKernelSuggestGroupSize(
 ///         + `nullptr == totalGroupCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelSuggestMaxCooperativeGroupCount(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    uint32_t* totalGroupCount                       ///< [out] recommended total group count.
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    uint32_t* totalGroupCount                                               ///< [out] recommended total group count.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5575,11 +5858,11 @@ zeKernelSuggestMaxCooperativeGroupCount(
 ///     - ::ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelSetArgumentValue(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    uint32_t argIndex,                              ///< [in] argument index in range [0, num args - 1]
-    size_t argSize,                                 ///< [in] size of argument type
-    const void* pArgValue                           ///< [in][optional] argument value represented as matching arg type. If
-                                                    ///< null then argument value is considered null.
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    uint32_t argIndex,                                                      ///< [in] argument index in range [0, num args - 1]
+    size_t argSize,                                                         ///< [in] size of argument type
+    const void* pArgValue                                                   ///< [in][optional] argument value represented as matching arg type. If
+                                                                            ///< null then argument value is considered null.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5587,9 +5870,9 @@ zeKernelSetArgumentValue(
 typedef uint32_t ze_kernel_indirect_access_flags_t;
 typedef enum _ze_kernel_indirect_access_flag_t
 {
-    ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST = ZE_BIT(0),///< Indicates that the kernel accesses host allocations indirectly.
-    ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE = ZE_BIT(1),  ///< Indicates that the kernel accesses device allocations indirectly.
-    ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED = ZE_BIT(2),  ///< Indicates that the kernel accesses shared allocations indirectly.
+    ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST = ZE_BIT(0),                        ///< Indicates that the kernel accesses host allocations indirectly.
+    ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE = ZE_BIT(1),                      ///< Indicates that the kernel accesses device allocations indirectly.
+    ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED = ZE_BIT(2),                      ///< Indicates that the kernel accesses shared allocations indirectly.
     ZE_KERNEL_INDIRECT_ACCESS_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_kernel_indirect_access_flag_t;
@@ -5617,8 +5900,8 @@ typedef enum _ze_kernel_indirect_access_flag_t
 ///         + `0x7 < flags`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelSetIndirectAccess(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    ze_kernel_indirect_access_flags_t flags         ///< [in] kernel indirect access flags
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    ze_kernel_indirect_access_flags_t flags                                 ///< [in] kernel indirect access flags
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5641,8 +5924,8 @@ zeKernelSetIndirectAccess(
 ///         + `nullptr == pFlags`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelGetIndirectAccess(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    ze_kernel_indirect_access_flags_t* pFlags       ///< [out] query result for kernel indirect access flags.
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    ze_kernel_indirect_access_flags_t* pFlags                               ///< [out] query result for kernel indirect access flags.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5666,16 +5949,16 @@ zeKernelGetIndirectAccess(
 ///         + `nullptr == pSize`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelGetSourceAttributes(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    uint32_t* pSize,                                ///< [in,out] pointer to size of string in bytes, including
-                                                    ///< null-terminating character.
-    char** pString                                  ///< [in,out][optional] pointer to application-managed character array
-                                                    ///< (string data).
-                                                    ///< If NULL, the string length of the kernel source attributes, including
-                                                    ///< a null-terminating character, is returned in pSize.
-                                                    ///< Otherwise, pString must point to valid application memory that is
-                                                    ///< greater than or equal to *pSize bytes in length, and on return the
-                                                    ///< pointed-to string will contain a space-separated list of kernel source attributes.
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    uint32_t* pSize,                                                        ///< [in,out] pointer to size of string in bytes, including
+                                                                            ///< null-terminating character.
+    char** pString                                                          ///< [in,out][optional] pointer to application-managed character array
+                                                                            ///< (string data).
+                                                                            ///< If NULL, the string length of the kernel source attributes, including
+                                                                            ///< a null-terminating character, is returned in pSize.
+                                                                            ///< Otherwise, pString must point to valid application memory that is
+                                                                            ///< greater than or equal to *pSize bytes in length, and on return the
+                                                                            ///< pointed-to string will contain a space-separated list of kernel source attributes.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5683,8 +5966,8 @@ zeKernelGetSourceAttributes(
 typedef uint32_t ze_cache_config_flags_t;
 typedef enum _ze_cache_config_flag_t
 {
-    ZE_CACHE_CONFIG_FLAG_LARGE_SLM = ZE_BIT(0),     ///< Large SLM size
-    ZE_CACHE_CONFIG_FLAG_LARGE_DATA = ZE_BIT(1),    ///< Large General Data size
+    ZE_CACHE_CONFIG_FLAG_LARGE_SLM = ZE_BIT(0),                             ///< Large SLM size
+    ZE_CACHE_CONFIG_FLAG_LARGE_DATA = ZE_BIT(1),                            ///< Large General Data size
     ZE_CACHE_CONFIG_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_cache_config_flag_t;
@@ -5712,9 +5995,9 @@ typedef enum _ze_cache_config_flag_t
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelSetCacheConfig(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    ze_cache_config_flags_t flags                   ///< [in] cache configuration.
-                                                    ///< must be 0 (default configuration) or a valid combination of ::ze_cache_config_flag_t.
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    ze_cache_config_flags_t flags                                           ///< [in] cache configuration.
+                                                                            ///< must be 0 (default configuration) or a valid combination of ::ze_cache_config_flag_t.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5733,8 +6016,8 @@ zeKernelSetCacheConfig(
 /// @brief Kernel universal unique id (UUID)
 typedef struct _ze_kernel_uuid_t
 {
-    uint8_t kid[ZE_MAX_KERNEL_UUID_SIZE];           ///< [out] opaque data representing a kernel UUID
-    uint8_t mid[ZE_MAX_MODULE_UUID_SIZE];           ///< [out] opaque data representing the kernel's module UUID
+    uint8_t kid[ZE_MAX_KERNEL_UUID_SIZE];                                   ///< [out] opaque data representing a kernel UUID
+    uint8_t mid[ZE_MAX_MODULE_UUID_SIZE];                                   ///< [out] opaque data representing the kernel's module UUID
 
 } ze_kernel_uuid_t;
 
@@ -5742,26 +6025,26 @@ typedef struct _ze_kernel_uuid_t
 /// @brief Kernel properties
 typedef struct _ze_kernel_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t numKernelArgs;                         ///< [out] number of kernel arguments.
-    uint32_t requiredGroupSizeX;                    ///< [out] required group size in the X dimension,
-                                                    ///< or zero if there is no required group size
-    uint32_t requiredGroupSizeY;                    ///< [out] required group size in the Y dimension,
-                                                    ///< or zero if there is no required group size
-    uint32_t requiredGroupSizeZ;                    ///< [out] required group size in the Z dimension,
-                                                    ///< or zero if there is no required group size
-    uint32_t requiredNumSubGroups;                  ///< [out] required number of subgroups per thread group,
-                                                    ///< or zero if there is no required number of subgroups
-    uint32_t requiredSubgroupSize;                  ///< [out] required subgroup size,
-                                                    ///< or zero if there is no required subgroup size
-    uint32_t maxSubgroupSize;                       ///< [out] maximum subgroup size
-    uint32_t maxNumSubgroups;                       ///< [out] maximum number of subgroups per thread group
-    uint32_t localMemSize;                          ///< [out] local memory size used by each thread group
-    uint32_t privateMemSize;                        ///< [out] private memory size allocated by compiler used by each thread
-    uint32_t spillMemSize;                          ///< [out] spill memory size allocated by compiler
-    ze_kernel_uuid_t uuid;                          ///< [out] universal unique identifier.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t numKernelArgs;                                                 ///< [out] number of kernel arguments.
+    uint32_t requiredGroupSizeX;                                            ///< [out] required group size in the X dimension,
+                                                                            ///< or zero if there is no required group size
+    uint32_t requiredGroupSizeY;                                            ///< [out] required group size in the Y dimension,
+                                                                            ///< or zero if there is no required group size
+    uint32_t requiredGroupSizeZ;                                            ///< [out] required group size in the Z dimension,
+                                                                            ///< or zero if there is no required group size
+    uint32_t requiredNumSubGroups;                                          ///< [out] required number of subgroups per thread group,
+                                                                            ///< or zero if there is no required number of subgroups
+    uint32_t requiredSubgroupSize;                                          ///< [out] required subgroup size,
+                                                                            ///< or zero if there is no required subgroup size
+    uint32_t maxSubgroupSize;                                               ///< [out] maximum subgroup size
+    uint32_t maxNumSubgroups;                                               ///< [out] maximum number of subgroups per thread group
+    uint32_t localMemSize;                                                  ///< [out] local memory size used by each thread group
+    uint32_t privateMemSize;                                                ///< [out] private memory size allocated by compiler used by each thread
+    uint32_t spillMemSize;                                                  ///< [out] spill memory size allocated by compiler
+    ze_kernel_uuid_t uuid;                                                  ///< [out] universal unique identifier.
 
 } ze_kernel_properties_t;
 
@@ -5774,10 +6057,10 @@ typedef struct _ze_kernel_properties_t
 ///       preferred group size properties.
 typedef struct _ze_kernel_preferred_group_size_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t preferredMultiple;                     ///< [out] preferred group size multiple
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t preferredMultiple;                                             ///< [out] preferred group size multiple
 
 } ze_kernel_preferred_group_size_properties_t;
 
@@ -5800,8 +6083,8 @@ typedef struct _ze_kernel_preferred_group_size_properties_t
 ///         + `nullptr == pKernelProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelGetProperties(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    ze_kernel_properties_t* pKernelProperties       ///< [in,out] query result for kernel properties.
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    ze_kernel_properties_t* pKernelProperties                               ///< [in,out] query result for kernel properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5826,19 +6109,19 @@ zeKernelGetProperties(
 ///         + `nullptr == pSize`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelGetName(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    size_t* pSize,                                  ///< [in,out] size of kernel name string, including null terminator, in
-                                                    ///< bytes.
-    char* pName                                     ///< [in,out][optional] char pointer to kernel name.
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    size_t* pSize,                                                          ///< [in,out] size of kernel name string, including null terminator, in
+                                                                            ///< bytes.
+    char* pName                                                             ///< [in,out][optional] char pointer to kernel name.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Kernel dispatch group count.
 typedef struct _ze_group_count_t
 {
-    uint32_t groupCountX;                           ///< [in] number of thread groups in X dimension
-    uint32_t groupCountY;                           ///< [in] number of thread groups in Y dimension
-    uint32_t groupCountZ;                           ///< [in] number of thread groups in Z dimension
+    uint32_t groupCountX;                                                   ///< [in] number of thread groups in X dimension
+    uint32_t groupCountY;                                                   ///< [in] number of thread groups in Y dimension
+    uint32_t groupCountZ;                                                   ///< [in] number of thread groups in Z dimension
 
 } ze_group_count_t;
 
@@ -5872,14 +6155,14 @@ typedef struct _ze_group_count_t
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendLaunchKernel(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    const ze_group_count_t* pLaunchFuncArgs,        ///< [in] thread group launch arguments
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    const ze_group_count_t* pLaunchFuncArgs,                                ///< [in] thread group launch arguments
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5916,14 +6199,14 @@ zeCommandListAppendLaunchKernel(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendLaunchCooperativeKernel(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    const ze_group_count_t* pLaunchFuncArgs,        ///< [in] thread group launch arguments
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    const ze_group_count_t* pLaunchFuncArgs,                                ///< [in] thread group launch arguments
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5961,15 +6244,15 @@ zeCommandListAppendLaunchCooperativeKernel(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendLaunchKernelIndirect(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    const ze_group_count_t* pLaunchArgumentsBuffer, ///< [in] pointer to device buffer that will contain thread group launch
-                                                    ///< arguments
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    const ze_group_count_t* pLaunchArgumentsBuffer,                         ///< [in] pointer to device buffer that will contain thread group launch
+                                                                            ///< arguments
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6010,19 +6293,19 @@ zeCommandListAppendLaunchKernelIndirect(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendLaunchMultipleKernelsIndirect(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of the command list
-    uint32_t numKernels,                            ///< [in] maximum number of kernels to launch
-    ze_kernel_handle_t* phKernels,                  ///< [in][range(0, numKernels)] handles of the kernel objects
-    const uint32_t* pCountBuffer,                   ///< [in] pointer to device memory location that will contain the actual
-                                                    ///< number of kernels to launch; value must be less than or equal to
-                                                    ///< numKernels
-    const ze_group_count_t* pLaunchArgumentsBuffer, ///< [in][range(0, numKernels)] pointer to device buffer that will contain
-                                                    ///< a contiguous array of thread group launch arguments
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of the command list
+    uint32_t numKernels,                                                    ///< [in] maximum number of kernels to launch
+    ze_kernel_handle_t* phKernels,                                          ///< [in][range(0, numKernels)] handles of the kernel objects
+    const uint32_t* pCountBuffer,                                           ///< [in] pointer to device memory location that will contain the actual
+                                                                            ///< number of kernels to launch; value must be less than or equal to
+                                                                            ///< numKernels
+    const ze_group_count_t* pLaunchArgumentsBuffer,                         ///< [in][range(0, numKernels)] pointer to device buffer that will contain
+                                                                            ///< a contiguous array of thread group launch arguments
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 #if !defined(__GNUC__)
@@ -6042,8 +6325,8 @@ zeCommandListAppendLaunchMultipleKernelsIndirect(
 /// @brief Module Program Extension Version(s)
 typedef enum _ze_module_program_exp_version_t
 {
-    ZE_MODULE_PROGRAM_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),///< version 1.0
-    ZE_MODULE_PROGRAM_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),///< latest known version
+    ZE_MODULE_PROGRAM_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),            ///< version 1.0
+    ZE_MODULE_PROGRAM_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),        ///< latest known version
     ZE_MODULE_PROGRAM_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_module_program_exp_version_t;
@@ -6063,18 +6346,18 @@ typedef enum _ze_module_program_exp_version_t
 ///       ::ZE_MODULE_FORMAT_IL_SPIRV.
 typedef struct _ze_module_program_exp_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t count;                                 ///< [in] Count of input modules
-    const size_t* inputSizes;                       ///< [in][range(0, count)] sizes of each input IL module in pInputModules.
-    const uint8_t** pInputModules;                  ///< [in][range(0, count)] pointer to an array of IL (e.g. SPIR-V modules).
-                                                    ///< Valid only for SPIR-V input.
-    const char** pBuildFlags;                       ///< [in][optional][range(0, count)] array of strings containing build
-                                                    ///< flags. See pBuildFlags in ::ze_module_desc_t.
-    const ze_module_constants_t** pConstants;       ///< [in][optional][range(0, count)] pointer to array of specialization
-                                                    ///< constant strings. Valid only for SPIR-V input. This must be set to
-                                                    ///< nullptr if no specialization constants are provided.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t count;                                                         ///< [in] Count of input modules
+    const size_t* inputSizes;                                               ///< [in][range(0, count)] sizes of each input IL module in pInputModules.
+    const uint8_t** pInputModules;                                          ///< [in][range(0, count)] pointer to an array of IL (e.g. SPIR-V modules).
+                                                                            ///< Valid only for SPIR-V input.
+    const char** pBuildFlags;                                               ///< [in][optional][range(0, count)] array of strings containing build
+                                                                            ///< flags. See pBuildFlags in ::ze_module_desc_t.
+    const ze_module_constants_t** pConstants;                               ///< [in][optional][range(0, count)] pointer to array of specialization
+                                                                            ///< constant strings. Valid only for SPIR-V input. This must be set to
+                                                                            ///< nullptr if no specialization constants are provided.
 
 } ze_module_program_exp_desc_t;
 
@@ -6095,8 +6378,8 @@ typedef struct _ze_module_program_exp_desc_t
 /// @brief Raytracing Extension Version(s)
 typedef enum _ze_raytracing_ext_version_t
 {
-    ZE_RAYTRACING_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),///< version 1.0
-    ZE_RAYTRACING_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),///< latest known version
+    ZE_RAYTRACING_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                ///< version 1.0
+    ZE_RAYTRACING_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),            ///< latest known version
     ZE_RAYTRACING_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_raytracing_ext_version_t;
@@ -6106,7 +6389,7 @@ typedef enum _ze_raytracing_ext_version_t
 typedef uint32_t ze_device_raytracing_ext_flags_t;
 typedef enum _ze_device_raytracing_ext_flag_t
 {
-    ZE_DEVICE_RAYTRACING_EXT_FLAG_RAYQUERY = ZE_BIT(0), ///< Supports rayquery
+    ZE_DEVICE_RAYTRACING_EXT_FLAG_RAYQUERY = ZE_BIT(0),                     ///< Supports rayquery
     ZE_DEVICE_RAYTRACING_EXT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_raytracing_ext_flag_t;
@@ -6116,14 +6399,14 @@ typedef enum _ze_device_raytracing_ext_flag_t
 /// 
 /// @details
 ///     - This structure may be returned from ::zeDeviceGetModuleProperties, via
-///       `pNext` member of ::ze_device_module_properties_t.
+///       the `pNext` member of ::ze_device_module_properties_t.
 typedef struct _ze_device_raytracing_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_raytracing_ext_flags_t flags;         ///< [out] 0 or a valid combination of ::ze_device_raytracing_ext_flags_t
-    uint32_t maxBVHLevels;                          ///< [out] Maximum number of BVH levels supported
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_raytracing_ext_flags_t flags;                                 ///< [out] 0 or a valid combination of ::ze_device_raytracing_ext_flags_t
+    uint32_t maxBVHLevels;                                                  ///< [out] Maximum number of BVH levels supported
 
 } ze_device_raytracing_ext_properties_t;
 
@@ -6132,7 +6415,7 @@ typedef struct _ze_device_raytracing_ext_properties_t
 typedef uint32_t ze_raytracing_mem_alloc_ext_flags_t;
 typedef enum _ze_raytracing_mem_alloc_ext_flag_t
 {
-    ZE_RAYTRACING_MEM_ALLOC_EXT_FLAG_TBD = ZE_BIT(0),   ///< reserved for future use
+    ZE_RAYTRACING_MEM_ALLOC_EXT_FLAG_TBD = ZE_BIT(0),                       ///< reserved for future use
     ZE_RAYTRACING_MEM_ALLOC_EXT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_raytracing_mem_alloc_ext_flag_t;
@@ -6142,17 +6425,17 @@ typedef enum _ze_raytracing_mem_alloc_ext_flag_t
 /// 
 /// @details
 ///     - This structure must be passed to ::zeMemAllocShared or
-///       ::zeMemAllocDevice, via `pNext` member of
+///       ::zeMemAllocDevice, via the `pNext` member of
 ///       ::ze_device_mem_alloc_desc_t, for any memory allocation that is to be
 ///       accessed by raytracing fixed-function of the device.
 typedef struct _ze_raytracing_mem_alloc_ext_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_raytracing_mem_alloc_ext_flags_t flags;      ///< [in] flags specifying additional allocation controls.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_raytracing_mem_alloc_ext_flag_t;
-                                                    ///< default behavior may use implicit driver-based heuristics.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_raytracing_mem_alloc_ext_flags_t flags;                              ///< [in] flags specifying additional allocation controls.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_raytracing_mem_alloc_ext_flag_t;
+                                                                            ///< default behavior may use implicit driver-based heuristics.
 
 } ze_raytracing_mem_alloc_ext_desc_t;
 
@@ -6185,10 +6468,10 @@ typedef struct _ze_raytracing_mem_alloc_ext_desc_t
 ///         + `nullptr == ptr`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextMakeMemoryResident(
-    ze_context_handle_t hContext,                   ///< [in] handle of context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    void* ptr,                                      ///< [in] pointer to memory to make resident
-    size_t size                                     ///< [in] size in bytes to make resident
+    ze_context_handle_t hContext,                                           ///< [in] handle of context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    void* ptr,                                                              ///< [in] pointer to memory to make resident
+    size_t size                                                             ///< [in] size in bytes to make resident
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6215,10 +6498,10 @@ zeContextMakeMemoryResident(
 ///         + `nullptr == ptr`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextEvictMemory(
-    ze_context_handle_t hContext,                   ///< [in] handle of context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    void* ptr,                                      ///< [in] pointer to memory to evict
-    size_t size                                     ///< [in] size in bytes to evict
+    ze_context_handle_t hContext,                                           ///< [in] handle of context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    void* ptr,                                                              ///< [in] pointer to memory to evict
+    size_t size                                                             ///< [in] size in bytes to evict
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6242,9 +6525,9 @@ zeContextEvictMemory(
 ///         + `nullptr == hImage`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextMakeImageResident(
-    ze_context_handle_t hContext,                   ///< [in] handle of context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_image_handle_t hImage                        ///< [in] handle of image to make resident
+    ze_context_handle_t hContext,                                           ///< [in] handle of context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_image_handle_t hImage                                                ///< [in] handle of image to make resident
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6270,9 +6553,9 @@ zeContextMakeImageResident(
 ///         + `nullptr == hImage`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeContextEvictImage(
-    ze_context_handle_t hContext,                   ///< [in] handle of context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_image_handle_t hImage                        ///< [in] handle of image to make evict
+    ze_context_handle_t hContext,                                           ///< [in] handle of context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_image_handle_t hImage                                                ///< [in] handle of image to make evict
     );
 
 #if !defined(__GNUC__)
@@ -6286,13 +6569,13 @@ zeContextEvictImage(
 /// @brief Sampler addressing modes
 typedef enum _ze_sampler_address_mode_t
 {
-    ZE_SAMPLER_ADDRESS_MODE_NONE = 0,               ///< No coordinate modifications for out-of-bounds image access.
-    ZE_SAMPLER_ADDRESS_MODE_REPEAT = 1,             ///< Out-of-bounds coordinates are wrapped back around.
-    ZE_SAMPLER_ADDRESS_MODE_CLAMP = 2,              ///< Out-of-bounds coordinates are clamped to edge.
-    ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER = 3,    ///< Out-of-bounds coordinates are clamped to border color which is (0.0f,
-                                                    ///< 0.0f, 0.0f, 0.0f) if image format swizzle contains alpha, otherwise
-                                                    ///< (0.0f, 0.0f, 0.0f, 1.0f).
-    ZE_SAMPLER_ADDRESS_MODE_MIRROR = 4,             ///< Out-of-bounds coordinates are mirrored starting from edge.
+    ZE_SAMPLER_ADDRESS_MODE_NONE = 0,                                       ///< No coordinate modifications for out-of-bounds image access.
+    ZE_SAMPLER_ADDRESS_MODE_REPEAT = 1,                                     ///< Out-of-bounds coordinates are wrapped back around.
+    ZE_SAMPLER_ADDRESS_MODE_CLAMP = 2,                                      ///< Out-of-bounds coordinates are clamped to edge.
+    ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER = 3,                            ///< Out-of-bounds coordinates are clamped to border color which is (0.0f,
+                                                                            ///< 0.0f, 0.0f, 0.0f) if image format swizzle contains alpha, otherwise
+                                                                            ///< (0.0f, 0.0f, 0.0f, 1.0f).
+    ZE_SAMPLER_ADDRESS_MODE_MIRROR = 4,                                     ///< Out-of-bounds coordinates are mirrored starting from edge.
     ZE_SAMPLER_ADDRESS_MODE_FORCE_UINT32 = 0x7fffffff
 
 } ze_sampler_address_mode_t;
@@ -6301,8 +6584,8 @@ typedef enum _ze_sampler_address_mode_t
 /// @brief Sampler filtering modes
 typedef enum _ze_sampler_filter_mode_t
 {
-    ZE_SAMPLER_FILTER_MODE_NEAREST = 0,             ///< No coordinate modifications for out of bounds image access.
-    ZE_SAMPLER_FILTER_MODE_LINEAR = 1,              ///< Out-of-bounds coordinates are wrapped back around.
+    ZE_SAMPLER_FILTER_MODE_NEAREST = 0,                                     ///< No coordinate modifications for out of bounds image access.
+    ZE_SAMPLER_FILTER_MODE_LINEAR = 1,                                      ///< Out-of-bounds coordinates are wrapped back around.
     ZE_SAMPLER_FILTER_MODE_FORCE_UINT32 = 0x7fffffff
 
 } ze_sampler_filter_mode_t;
@@ -6311,13 +6594,13 @@ typedef enum _ze_sampler_filter_mode_t
 /// @brief Sampler descriptor
 typedef struct _ze_sampler_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_sampler_address_mode_t addressMode;          ///< [in] Sampler addressing mode to determine how out-of-bounds
-                                                    ///< coordinates are handled.
-    ze_sampler_filter_mode_t filterMode;            ///< [in] Sampler filter mode to determine how samples are filtered.
-    ze_bool_t isNormalized;                         ///< [in] Are coordinates normalized [0, 1] or not.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_sampler_address_mode_t addressMode;                                  ///< [in] Sampler addressing mode to determine how out-of-bounds
+                                                                            ///< coordinates are handled.
+    ze_sampler_filter_mode_t filterMode;                                    ///< [in] Sampler filter mode to determine how samples are filtered.
+    ze_bool_t isNormalized;                                                 ///< [in] Are coordinates normalized [0, 1] or not.
 
 } ze_sampler_desc_t;
 
@@ -6347,10 +6630,10 @@ typedef struct _ze_sampler_desc_t
 ///         + `::ZE_SAMPLER_FILTER_MODE_LINEAR < desc->filterMode`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeSamplerCreate(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    const ze_sampler_desc_t* desc,                  ///< [in] pointer to sampler descriptor
-    ze_sampler_handle_t* phSampler                  ///< [out] handle of the sampler
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    const ze_sampler_desc_t* desc,                                          ///< [in] pointer to sampler descriptor
+    ze_sampler_handle_t* phSampler                                          ///< [out] handle of the sampler
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6376,7 +6659,7 @@ zeSamplerCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeSamplerDestroy(
-    ze_sampler_handle_t hSampler                    ///< [in][release] handle of the sampler
+    ze_sampler_handle_t hSampler                                            ///< [in][release] handle of the sampler
     );
 
 #if !defined(__GNUC__)
@@ -6390,9 +6673,9 @@ zeSamplerDestroy(
 /// @brief Virtual memory page access attributes
 typedef enum _ze_memory_access_attribute_t
 {
-    ZE_MEMORY_ACCESS_ATTRIBUTE_NONE = 0,            ///< Indicates the memory page is inaccessible.
-    ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE = 1,       ///< Indicates the memory page supports read write access.
-    ZE_MEMORY_ACCESS_ATTRIBUTE_READONLY = 2,        ///< Indicates the memory page supports read-only access.
+    ZE_MEMORY_ACCESS_ATTRIBUTE_NONE = 0,                                    ///< Indicates the memory page is inaccessible.
+    ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE = 1,                               ///< Indicates the memory page supports read write access.
+    ZE_MEMORY_ACCESS_ATTRIBUTE_READONLY = 2,                                ///< Indicates the memory page supports read-only access.
     ZE_MEMORY_ACCESS_ATTRIBUTE_FORCE_UINT32 = 0x7fffffff
 
 } ze_memory_access_attribute_t;
@@ -6427,11 +6710,11 @@ typedef enum _ze_memory_access_attribute_t
 ///         + `0 == size`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeVirtualMemReserve(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* pStart,                             ///< [in][optional] pointer to start of region to reserve. If nullptr then
-                                                    ///< implementation will choose a start address.
-    size_t size,                                    ///< [in] size in bytes to reserve; must be page aligned.
-    void** pptr                                     ///< [out] pointer to virtual reservation.
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* pStart,                                                     ///< [in][optional] pointer to start of region to reserve. If nullptr then
+                                                                            ///< implementation will choose a start address.
+    size_t size,                                                            ///< [in] size in bytes to reserve; must be page aligned.
+    void** pptr                                                             ///< [out] pointer to virtual reservation.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6459,9 +6742,9 @@ zeVirtualMemReserve(
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeVirtualMemFree(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr,                                ///< [in] pointer to start of region to free.
-    size_t size                                     ///< [in] size in bytes to free; must be page aligned.
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr,                                                        ///< [in] pointer to start of region to free.
+    size_t size                                                             ///< [in] size in bytes to free; must be page aligned.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6487,11 +6770,11 @@ zeVirtualMemFree(
 ///         + `0 == size`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeVirtualMemQueryPageSize(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object
-    size_t size,                                    ///< [in] unaligned allocation size in bytes
-    size_t* pagesize                                ///< [out] pointer to page size to use for start address and size
-                                                    ///< alignments.
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    size_t size,                                                            ///< [in] unaligned allocation size in bytes
+    size_t* pagesize                                                        ///< [out] pointer to page size to use for start address and size
+                                                                            ///< alignments.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6499,7 +6782,7 @@ zeVirtualMemQueryPageSize(
 typedef uint32_t ze_physical_mem_flags_t;
 typedef enum _ze_physical_mem_flag_t
 {
-    ZE_PHYSICAL_MEM_FLAG_TBD = ZE_BIT(0),           ///< reserved for future use.
+    ZE_PHYSICAL_MEM_FLAG_TBD = ZE_BIT(0),                                   ///< reserved for future use.
     ZE_PHYSICAL_MEM_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_physical_mem_flag_t;
@@ -6508,12 +6791,12 @@ typedef enum _ze_physical_mem_flag_t
 /// @brief Physical memory descriptor
 typedef struct _ze_physical_mem_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_physical_mem_flags_t flags;                  ///< [in] creation flags.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_physical_mem_flag_t.
-    size_t size;                                    ///< [in] size in bytes to reserve; must be page aligned.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_physical_mem_flags_t flags;                                          ///< [in] creation flags.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_physical_mem_flag_t.
+    size_t size;                                                            ///< [in] size in bytes to reserve; must be page aligned.
 
 } ze_physical_mem_desc_t;
 
@@ -6546,10 +6829,10 @@ typedef struct _ze_physical_mem_desc_t
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zePhysicalMemCreate(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object
-    ze_physical_mem_desc_t* desc,                   ///< [in] pointer to physical memory descriptor.
-    ze_physical_mem_handle_t* phPhysicalMemory      ///< [out] pointer to handle of physical memory object created
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    ze_physical_mem_desc_t* desc,                                           ///< [in] pointer to physical memory descriptor.
+    ze_physical_mem_handle_t* phPhysicalMemory                              ///< [out] pointer to handle of physical memory object created
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6574,8 +6857,8 @@ zePhysicalMemCreate(
 ///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zePhysicalMemDestroy(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_physical_mem_handle_t hPhysicalMemory        ///< [in][release] handle of physical memory object to destroy
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_physical_mem_handle_t hPhysicalMemory                                ///< [in][release] handle of physical memory object to destroy
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6612,15 +6895,15 @@ zePhysicalMemDestroy(
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeVirtualMemMap(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr,                                ///< [in] pointer to start of virtual address range to map.
-    size_t size,                                    ///< [in] size in bytes of virtual address range to map; must be page
-                                                    ///< aligned.
-    ze_physical_mem_handle_t hPhysicalMemory,       ///< [in] handle to physical memory object.
-    size_t offset,                                  ///< [in] offset into physical memory allocation object; must be page
-                                                    ///< aligned.
-    ze_memory_access_attribute_t access             ///< [in] specifies page access attributes to apply to the virtual address
-                                                    ///< range.
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr,                                                        ///< [in] pointer to start of virtual address range to map.
+    size_t size,                                                            ///< [in] size in bytes of virtual address range to map; must be page
+                                                                            ///< aligned.
+    ze_physical_mem_handle_t hPhysicalMemory,                               ///< [in] handle to physical memory object.
+    size_t offset,                                                          ///< [in] offset into physical memory allocation object; must be page
+                                                                            ///< aligned.
+    ze_memory_access_attribute_t access                                     ///< [in] specifies page access attributes to apply to the virtual address
+                                                                            ///< range.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6643,15 +6926,16 @@ zeVirtualMemMap(
 ///         + `nullptr == hContext`
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == ptr`
-///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - "Address must be page aligned"
+///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT
+///         + Address must be page aligned
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_SIZE
 ///         + `0 == size`
 ///         + Size must be page aligned
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeVirtualMemUnmap(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr,                                ///< [in] pointer to start of region to unmap.
-    size_t size                                     ///< [in] size in bytes to unmap; must be page aligned.
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr,                                                        ///< [in] pointer to start of region to unmap.
+    size_t size                                                             ///< [in] size in bytes to unmap; must be page aligned.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6674,17 +6958,18 @@ zeVirtualMemUnmap(
 ///         + `nullptr == ptr`
 ///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
 ///         + `::ZE_MEMORY_ACCESS_ATTRIBUTE_READONLY < access`
-///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - "Address must be page aligned"
+///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT
+///         + Address must be page aligned
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_SIZE
 ///         + `0 == size`
 ///         + Size must be page aligned
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeVirtualMemSetAccessAttribute(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr,                                ///< [in] pointer to start of reserved virtual address region.
-    size_t size,                                    ///< [in] size in bytes; must be page aligned.
-    ze_memory_access_attribute_t access             ///< [in] specifies page access attributes to apply to the virtual address
-                                                    ///< range.
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr,                                                        ///< [in] pointer to start of reserved virtual address region.
+    size_t size,                                                            ///< [in] size in bytes; must be page aligned.
+    ze_memory_access_attribute_t access                                     ///< [in] specifies page access attributes to apply to the virtual address
+                                                                            ///< range.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6709,18 +6994,19 @@ zeVirtualMemSetAccessAttribute(
 ///         + `nullptr == ptr`
 ///         + `nullptr == access`
 ///         + `nullptr == outSize`
-///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - "Address must be page aligned"
+///     - ::ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT
+///         + Address must be page aligned
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_SIZE
 ///         + `0 == size`
 ///         + Size must be page aligned
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeVirtualMemGetAccessAttribute(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const void* ptr,                                ///< [in] pointer to start of virtual address region for query.
-    size_t size,                                    ///< [in] size in bytes; must be page aligned.
-    ze_memory_access_attribute_t* access,           ///< [out] query result for page access attribute.
-    size_t* outSize                                 ///< [out] query result for size of virtual address range, starting at ptr,
-                                                    ///< that shares same access attribute.
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const void* ptr,                                                        ///< [in] pointer to start of virtual address region for query.
+    size_t size,                                                            ///< [in] size in bytes; must be page aligned.
+    ze_memory_access_attribute_t* access,                                   ///< [out] query result for page access attribute.
+    size_t* outSize                                                         ///< [out] query result for size of virtual address range, starting at ptr,
+                                                                            ///< that shares same access attribute.
     );
 
 #if !defined(__GNUC__)
@@ -6740,8 +7026,8 @@ zeVirtualMemGetAccessAttribute(
 /// @brief Floating-Point Atomics Extension Version(s)
 typedef enum _ze_float_atomics_ext_version_t
 {
-    ZE_FLOAT_ATOMICS_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ), ///< version 1.0
-    ZE_FLOAT_ATOMICS_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ), ///< latest known version
+    ZE_FLOAT_ATOMICS_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),             ///< version 1.0
+    ZE_FLOAT_ATOMICS_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),         ///< latest known version
     ZE_FLOAT_ATOMICS_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_float_atomics_ext_version_t;
@@ -6751,12 +7037,12 @@ typedef enum _ze_float_atomics_ext_version_t
 typedef uint32_t ze_device_fp_atomic_ext_flags_t;
 typedef enum _ze_device_fp_atomic_ext_flag_t
 {
-    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE = ZE_BIT(0), ///< Supports atomic load, store, and exchange
-    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD = ZE_BIT(1),///< Supports atomic add and subtract
-    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX = ZE_BIT(2),///< Supports atomic min and max
-    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE = ZE_BIT(16), ///< Supports atomic load, store, and exchange
-    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD = ZE_BIT(17),///< Supports atomic add and subtract
-    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX = ZE_BIT(18),///< Supports atomic min and max
+    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_LOAD_STORE = ZE_BIT(0),             ///< Supports atomic load, store, and exchange
+    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_ADD = ZE_BIT(1),                    ///< Supports atomic add and subtract
+    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_GLOBAL_MIN_MAX = ZE_BIT(2),                ///< Supports atomic min and max
+    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_LOAD_STORE = ZE_BIT(16),             ///< Supports atomic load, store, and exchange
+    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_ADD = ZE_BIT(17),                    ///< Supports atomic add and subtract
+    ZE_DEVICE_FP_ATOMIC_EXT_FLAG_LOCAL_MIN_MAX = ZE_BIT(18),                ///< Supports atomic min and max
     ZE_DEVICE_FP_ATOMIC_EXT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_fp_atomic_ext_flag_t;
@@ -6767,17 +7053,17 @@ typedef enum _ze_device_fp_atomic_ext_flag_t
 /// 
 /// @details
 ///     - This structure may be returned from ::zeDeviceGetModuleProperties, via
-///       `pNext` member of ::ze_device_module_properties_t.
+///       the `pNext` member of ::ze_device_module_properties_t.
 typedef struct _ze_float_atomic_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_fp_atomic_ext_flags_t fp16Flags;      ///< [out] Capabilities for half-precision floating-point atomic operations
-    ze_device_fp_atomic_ext_flags_t fp32Flags;      ///< [out] Capabilities for single-precision floating-point atomic
-                                                    ///< operations
-    ze_device_fp_atomic_ext_flags_t fp64Flags;      ///< [out] Capabilities for double-precision floating-point atomic
-                                                    ///< operations
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_fp_atomic_ext_flags_t fp16Flags;                              ///< [out] Capabilities for half-precision floating-point atomic operations
+    ze_device_fp_atomic_ext_flags_t fp32Flags;                              ///< [out] Capabilities for single-precision floating-point atomic
+                                                                            ///< operations
+    ze_device_fp_atomic_ext_flags_t fp64Flags;                              ///< [out] Capabilities for double-precision floating-point atomic
+                                                                            ///< operations
 
 } ze_float_atomic_ext_properties_t;
 
@@ -6798,8 +7084,8 @@ typedef struct _ze_float_atomic_ext_properties_t
 /// @brief Global Offset Extension Version(s)
 typedef enum _ze_global_offset_exp_version_t
 {
-    ZE_GLOBAL_OFFSET_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ), ///< version 1.0
-    ZE_GLOBAL_OFFSET_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ), ///< latest known version
+    ZE_GLOBAL_OFFSET_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),             ///< version 1.0
+    ZE_GLOBAL_OFFSET_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),         ///< latest known version
     ZE_GLOBAL_OFFSET_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_global_offset_exp_version_t;
@@ -6824,10 +7110,10 @@ typedef enum _ze_global_offset_exp_version_t
 ///         + `nullptr == hKernel`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelSetGlobalOffsetExp(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    uint32_t offsetX,                               ///< [in] global offset for X dimension to use for this kernel
-    uint32_t offsetY,                               ///< [in] global offset for Y dimension to use for this kernel
-    uint32_t offsetZ                                ///< [in] global offset for Z dimension to use for this kernel
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    uint32_t offsetX,                                                       ///< [in] global offset for X dimension to use for this kernel
+    uint32_t offsetY,                                                       ///< [in] global offset for Y dimension to use for this kernel
+    uint32_t offsetZ                                                        ///< [in] global offset for Z dimension to use for this kernel
     );
 
 #if !defined(__GNUC__)
@@ -6858,7 +7144,8 @@ typedef enum _ze_relaxed_allocation_limits_exp_version_t
 typedef uint32_t ze_relaxed_allocation_limits_exp_flags_t;
 typedef enum _ze_relaxed_allocation_limits_exp_flag_t
 {
-    ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE = ZE_BIT(0), ///< Allocation size may exceed ::ze_device_properties_t.maxMemAllocSize
+    ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE = ZE_BIT(0),             ///< Allocation size may exceed the `maxMemAllocSize` member of
+                                                                            ///< ::ze_device_properties_t.
     ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_relaxed_allocation_limits_exp_flag_t;
@@ -6868,17 +7155,17 @@ typedef enum _ze_relaxed_allocation_limits_exp_flag_t
 /// 
 /// @details
 ///     - This structure may be passed to ::zeMemAllocShared or
-///       ::zeMemAllocDevice, via `pNext` member of
+///       ::zeMemAllocDevice, via the `pNext` member of
 ///       ::ze_device_mem_alloc_desc_t.
-///     - This structure may also be passed to ::zeMemAllocHost, via `pNext`
+///     - This structure may also be passed to ::zeMemAllocHost, via the `pNext`
 ///       member of ::ze_host_mem_alloc_desc_t.
 typedef struct _ze_relaxed_allocation_limits_exp_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_relaxed_allocation_limits_exp_flags_t flags; ///< [in] flags specifying allocation limits to relax.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_relaxed_allocation_limits_exp_flag_t;
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_relaxed_allocation_limits_exp_flags_t flags;                         ///< [in] flags specifying allocation limits to relax.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_relaxed_allocation_limits_exp_flag_t;
 
 } ze_relaxed_allocation_limits_exp_desc_t;
 
@@ -6899,8 +7186,8 @@ typedef struct _ze_relaxed_allocation_limits_exp_desc_t
 /// @brief Cache_Reservation Extension Version(s)
 typedef enum _ze_cache_reservation_ext_version_t
 {
-    ZE_CACHE_RESERVATION_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ), ///< version 1.0
-    ZE_CACHE_RESERVATION_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ), ///< latest known version
+    ZE_CACHE_RESERVATION_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),         ///< version 1.0
+    ZE_CACHE_RESERVATION_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),     ///< latest known version
     ZE_CACHE_RESERVATION_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_cache_reservation_ext_version_t;
@@ -6909,9 +7196,15 @@ typedef enum _ze_cache_reservation_ext_version_t
 /// @brief Cache Reservation Region
 typedef enum _ze_cache_ext_region_t
 {
-    ZE_CACHE_EXT_REGION_ZE_CACHE_REGION_DEFAULT = 0,///< utilize driver default scheme
-    ZE_CACHE_EXT_REGION_ZE_CACHE_RESERVE_REGION = 1,///< Utilize reserver region
-    ZE_CACHE_EXT_REGION_ZE_CACHE_NON_RESERVED_REGION = 2,   ///< Utilize non-reserverd region
+    ZE_CACHE_EXT_REGION_ZE_CACHE_REGION_DEFAULT = 0,                        ///< [DEPRECATED] utilize driver default scheme. Use
+                                                                            ///< ::ZE_CACHE_EXT_REGION_DEFAULT.
+    ZE_CACHE_EXT_REGION_ZE_CACHE_RESERVE_REGION = 1,                        ///< [DEPRECATED] utilize reserved region. Use
+                                                                            ///< ::ZE_CACHE_EXT_REGION_RESERVED.
+    ZE_CACHE_EXT_REGION_ZE_CACHE_NON_RESERVED_REGION = 2,                   ///< [DEPRECATED] utilize non-reserverd region. Use
+                                                                            ///< ::ZE_CACHE_EXT_REGION_NON_RESERVED.
+    ZE_CACHE_EXT_REGION_DEFAULT = 0,                                        ///< utilize driver default scheme
+    ZE_CACHE_EXT_REGION_RESERVED = 1,                                       ///< utilize reserved region
+    ZE_CACHE_EXT_REGION_NON_RESERVED = 2,                                   ///< utilize non-reserverd region
     ZE_CACHE_EXT_REGION_FORCE_UINT32 = 0x7fffffff
 
 } ze_cache_ext_region_t;
@@ -6920,16 +7213,16 @@ typedef enum _ze_cache_ext_region_t
 /// @brief CacheReservation structure
 /// 
 /// @details
-///     - This structure must be passed to ::zeDeviceGetCacheProperties via
+///     - This structure must be passed to ::zeDeviceGetCacheProperties via the
 ///       `pNext` member of ::ze_device_cache_properties_t
 ///     - Used for determining the max cache reservation allowed on device. Size
 ///       of zero means no reservation available.
 typedef struct _ze_cache_reservation_ext_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    size_t maxCacheReservationSize;                 ///< [out] max cache reservation size
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    size_t maxCacheReservationSize;                                         ///< [out] max cache reservation size
 
 } ze_cache_reservation_ext_desc_t;
 
@@ -6954,12 +7247,12 @@ typedef struct _ze_cache_reservation_ext_desc_t
 ///         + `nullptr == hDevice`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceReserveCacheExt(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object
-    size_t cacheLevel,                              ///< [in] cache level where application want to reserve. If zero, then the
-                                                    ///< driver shall default to last level of cache and attempt to reserve in
-                                                    ///< that cache.
-    size_t cacheReservationSize                     ///< [in] value for reserving size, in bytes. If zero, then the driver
-                                                    ///< shall remove prior reservation
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    size_t cacheLevel,                                                      ///< [in] cache level where application want to reserve. If zero, then the
+                                                                            ///< driver shall default to last level of cache and attempt to reserve in
+                                                                            ///< that cache.
+    size_t cacheReservationSize                                             ///< [in] value for reserving size, in bytes. If zero, then the driver
+                                                                            ///< shall remove prior reservation
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6980,13 +7273,13 @@ zeDeviceReserveCacheExt(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == ptr`
 ///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::ZE_CACHE_EXT_REGION_::ZE_CACHE_NON_RESERVED_REGION < cacheRegion`
+///         + `::ZE_CACHE_EXT_REGION_NON_RESERVED < cacheRegion`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceSetCacheAdviceExt(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object
-    void* ptr,                                      ///< [in] memory pointer to query
-    size_t regionSize,                              ///< [in] region size, in pages
-    ze_cache_ext_region_t cacheRegion               ///< [in] reservation region
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object
+    void* ptr,                                                              ///< [in] memory pointer to query
+    size_t regionSize,                                                      ///< [in] region size, in pages
+    ze_cache_ext_region_t cacheRegion                                       ///< [in] reservation region
     );
 
 #if !defined(__GNUC__)
@@ -7006,7 +7299,7 @@ zeDeviceSetCacheAdviceExt(
 /// @brief Event Query Timestamps Extension Version(s)
 typedef enum _ze_event_query_timestamps_exp_version_t
 {
-    ZE_EVENT_QUERY_TIMESTAMPS_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),///< version 1.0
+    ZE_EVENT_QUERY_TIMESTAMPS_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),    ///< version 1.0
     ZE_EVENT_QUERY_TIMESTAMPS_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),///< latest known version
     ZE_EVENT_QUERY_TIMESTAMPS_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 
@@ -7044,16 +7337,16 @@ typedef enum _ze_event_query_timestamps_exp_version_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventQueryTimestampsExp(
-    ze_event_handle_t hEvent,                       ///< [in] handle of the event
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device to query
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of timestamp results.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of timestamps available.
-                                                    ///< if count is greater than the number of timestamps available, then the
-                                                    ///< driver shall update the value with the correct number of timestamps available.
-    ze_kernel_timestamp_result_t* pTimestamps       ///< [in,out][optional][range(0, *pCount)] array of timestamp results.
-                                                    ///< if count is less than the number of timestamps available, then driver
-                                                    ///< shall only retrieve that number of timestamps.
+    ze_event_handle_t hEvent,                                               ///< [in] handle of the event
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device to query
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of timestamp results.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of timestamps available.
+                                                                            ///< if count is greater than the number of timestamps available, then the
+                                                                            ///< driver shall update the value with the correct number of timestamps available.
+    ze_kernel_timestamp_result_t* pTimestamps                               ///< [in,out][optional][range(0, *pCount)] array of timestamp results.
+                                                                            ///< if count is less than the number of timestamps available, then driver
+                                                                            ///< shall only retrieve that number of timestamps.
     );
 
 #if !defined(__GNUC__)
@@ -7083,12 +7376,12 @@ typedef enum _ze_image_memory_properties_exp_version_t
 /// @brief Image memory properties
 typedef struct _ze_image_memory_properties_exp_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint64_t size;                                  ///< [out] size of image allocation in bytes.
-    uint64_t rowPitch;                              ///< [out] size of image row in bytes.
-    uint64_t slicePitch;                            ///< [out] size of image slice in bytes.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint64_t size;                                                          ///< [out] size of image allocation in bytes.
+    uint64_t rowPitch;                                                      ///< [out] size of image row in bytes.
+    uint64_t slicePitch;                                                    ///< [out] size of image slice in bytes.
 
 } ze_image_memory_properties_exp_t;
 
@@ -7117,8 +7410,8 @@ typedef struct _ze_image_memory_properties_exp_t
 ///         + `nullptr == pMemoryProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeImageGetMemoryPropertiesExp(
-    ze_image_handle_t hImage,                       ///< [in] handle of image object
-    ze_image_memory_properties_exp_t* pMemoryProperties ///< [in,out] query result for image memory properties.
+    ze_image_handle_t hImage,                                               ///< [in] handle of image object
+    ze_image_memory_properties_exp_t* pMemoryProperties                     ///< [in,out] query result for image memory properties.
     );
 
 #if !defined(__GNUC__)
@@ -7138,8 +7431,8 @@ zeImageGetMemoryPropertiesExp(
 /// @brief Image View Extension Version(s)
 typedef enum _ze_image_view_ext_version_t
 {
-    ZE_IMAGE_VIEW_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),///< version 1.0
-    ZE_IMAGE_VIEW_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),///< latest known version
+    ZE_IMAGE_VIEW_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                ///< version 1.0
+    ZE_IMAGE_VIEW_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),            ///< latest known version
     ZE_IMAGE_VIEW_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_view_ext_version_t;
@@ -7183,11 +7476,11 @@ typedef enum _ze_image_view_ext_version_t
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeImageViewCreateExt(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    const ze_image_desc_t* desc,                    ///< [in] pointer to image descriptor
-    ze_image_handle_t hImage,                       ///< [in] handle of image object to create view from
-    ze_image_handle_t* phImageView                  ///< [out] pointer to handle of image object created for view
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    const ze_image_desc_t* desc,                                            ///< [in] pointer to image descriptor
+    ze_image_handle_t hImage,                                               ///< [in] handle of image object to create view from
+    ze_image_handle_t* phImageView                                          ///< [out] pointer to handle of image object created for view
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -7200,8 +7493,8 @@ zeImageViewCreateExt(
 /// @brief Image View Extension Version(s)
 typedef enum _ze_image_view_exp_version_t
 {
-    ZE_IMAGE_VIEW_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),///< version 1.0
-    ZE_IMAGE_VIEW_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),///< latest known version
+    ZE_IMAGE_VIEW_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                ///< version 1.0
+    ZE_IMAGE_VIEW_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),            ///< latest known version
     ZE_IMAGE_VIEW_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_view_exp_version_t;
@@ -7248,11 +7541,11 @@ typedef enum _ze_image_view_exp_version_t
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeImageViewCreateExp(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    const ze_image_desc_t* desc,                    ///< [in] pointer to image descriptor
-    ze_image_handle_t hImage,                       ///< [in] handle of image object to create view from
-    ze_image_handle_t* phImageView                  ///< [out] pointer to handle of image object created for view
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    const ze_image_desc_t* desc,                                            ///< [in] pointer to image descriptor
+    ze_image_handle_t hImage,                                               ///< [in] handle of image object to create view from
+    ze_image_handle_t* phImageView                                          ///< [out] pointer to handle of image object created for view
     );
 
 #if !defined(__GNUC__)
@@ -7272,8 +7565,8 @@ zeImageViewCreateExp(
 /// @brief Image View Planar Extension Version(s)
 typedef enum _ze_image_view_planar_ext_version_t
 {
-    ZE_IMAGE_VIEW_PLANAR_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ), ///< version 1.0
-    ZE_IMAGE_VIEW_PLANAR_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ), ///< latest known version
+    ZE_IMAGE_VIEW_PLANAR_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),         ///< version 1.0
+    ZE_IMAGE_VIEW_PLANAR_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),     ///< latest known version
     ZE_IMAGE_VIEW_PLANAR_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_view_planar_ext_version_t;
@@ -7282,10 +7575,10 @@ typedef enum _ze_image_view_planar_ext_version_t
 /// @brief Image view planar descriptor
 typedef struct _ze_image_view_planar_ext_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t planeIndex;                            ///< [in] the 0-based plane index (e.g. NV12 is 0 = Y plane, 1 UV plane)
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t planeIndex;                                                    ///< [in] the 0-based plane index (e.g. NV12 is 0 = Y plane, 1 UV plane)
 
 } ze_image_view_planar_ext_desc_t;
 
@@ -7299,8 +7592,8 @@ typedef struct _ze_image_view_planar_ext_desc_t
 /// @brief Image View Planar Extension Version(s)
 typedef enum _ze_image_view_planar_exp_version_t
 {
-    ZE_IMAGE_VIEW_PLANAR_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ), ///< version 1.0
-    ZE_IMAGE_VIEW_PLANAR_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ), ///< latest known version
+    ZE_IMAGE_VIEW_PLANAR_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),         ///< version 1.0
+    ZE_IMAGE_VIEW_PLANAR_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),     ///< latest known version
     ZE_IMAGE_VIEW_PLANAR_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_view_planar_exp_version_t;
@@ -7309,10 +7602,10 @@ typedef enum _ze_image_view_planar_exp_version_t
 /// @brief Image view planar descriptor
 typedef struct _ze_image_view_planar_exp_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t planeIndex;                            ///< [in] the 0-based plane index (e.g. NV12 is 0 = Y plane, 1 UV plane)
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t planeIndex;                                                    ///< [in] the 0-based plane index (e.g. NV12 is 0 = Y plane, 1 UV plane)
 
 } ze_image_view_planar_exp_desc_t;
 
@@ -7333,8 +7626,8 @@ typedef struct _ze_image_view_planar_exp_desc_t
 /// @brief Kernel Scheduling Hints Extension Version(s)
 typedef enum _ze_scheduling_hints_exp_version_t
 {
-    ZE_SCHEDULING_HINTS_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
-    ZE_SCHEDULING_HINTS_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
+    ZE_SCHEDULING_HINTS_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),          ///< version 1.0
+    ZE_SCHEDULING_HINTS_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),      ///< latest known version
     ZE_SCHEDULING_HINTS_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_scheduling_hints_exp_version_t;
@@ -7344,9 +7637,9 @@ typedef enum _ze_scheduling_hints_exp_version_t
 typedef uint32_t ze_scheduling_hint_exp_flags_t;
 typedef enum _ze_scheduling_hint_exp_flag_t
 {
-    ZE_SCHEDULING_HINT_EXP_FLAG_OLDEST_FIRST = ZE_BIT(0),   ///< Hint that the kernel prefers oldest-first scheduling
-    ZE_SCHEDULING_HINT_EXP_FLAG_ROUND_ROBIN = ZE_BIT(1),///< Hint that the kernel prefers round-robin scheduling
-    ZE_SCHEDULING_HINT_EXP_FLAG_STALL_BASED_ROUND_ROBIN = ZE_BIT(2),///< Hint that the kernel prefers stall-based round-robin scheduling
+    ZE_SCHEDULING_HINT_EXP_FLAG_OLDEST_FIRST = ZE_BIT(0),                   ///< Hint that the kernel prefers oldest-first scheduling
+    ZE_SCHEDULING_HINT_EXP_FLAG_ROUND_ROBIN = ZE_BIT(1),                    ///< Hint that the kernel prefers round-robin scheduling
+    ZE_SCHEDULING_HINT_EXP_FLAG_STALL_BASED_ROUND_ROBIN = ZE_BIT(2),        ///< Hint that the kernel prefers stall-based round-robin scheduling
     ZE_SCHEDULING_HINT_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_scheduling_hint_exp_flag_t;
@@ -7357,14 +7650,14 @@ typedef enum _ze_scheduling_hint_exp_flag_t
 /// 
 /// @details
 ///     - This structure may be returned from ::zeDeviceGetModuleProperties, via
-///       `pNext` member of ::ze_device_module_properties_t.
+///       the `pNext` member of ::ze_device_module_properties_t.
 typedef struct _ze_scheduling_hint_exp_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_scheduling_hint_exp_flags_t schedulingHintFlags; ///< [out] Supported kernel scheduling hints.
-                                                    ///< May be 0 (none) or a valid combination of ::ze_scheduling_hint_exp_flag_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_scheduling_hint_exp_flags_t schedulingHintFlags;                     ///< [out] Supported kernel scheduling hints.
+                                                                            ///< May be 0 (none) or a valid combination of ::ze_scheduling_hint_exp_flag_t.
 
 } ze_scheduling_hint_exp_properties_t;
 
@@ -7375,11 +7668,11 @@ typedef struct _ze_scheduling_hint_exp_properties_t
 ///     - This structure may be passed to ::zeKernelSchedulingHintExp.
 typedef struct _ze_scheduling_hint_exp_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_scheduling_hint_exp_flags_t flags;           ///< [in] flags specifying kernel scheduling hints.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_scheduling_hint_exp_flag_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_scheduling_hint_exp_flags_t flags;                                   ///< [in] flags specifying kernel scheduling hints.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_scheduling_hint_exp_flag_t.
 
 } ze_scheduling_hint_exp_desc_t;
 
@@ -7411,8 +7704,8 @@ typedef struct _ze_scheduling_hint_exp_desc_t
 ///         + `0x7 < pHint->flags`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeKernelSchedulingHintExp(
-    ze_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
-    ze_scheduling_hint_exp_desc_t* pHint            ///< [in] pointer to kernel scheduling hint descriptor
+    ze_kernel_handle_t hKernel,                                             ///< [in] handle of the kernel object
+    ze_scheduling_hint_exp_desc_t* pHint                                    ///< [in] pointer to kernel scheduling hint descriptor
     );
 
 #if !defined(__GNUC__)
@@ -7432,8 +7725,8 @@ zeKernelSchedulingHintExp(
 /// @brief Linkonce ODR Extension Version(s)
 typedef enum _ze_linkonce_odr_ext_version_t
 {
-    ZE_LINKONCE_ODR_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
-    ZE_LINKONCE_ODR_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
+    ZE_LINKONCE_ODR_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),              ///< version 1.0
+    ZE_LINKONCE_ODR_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),          ///< latest known version
     ZE_LINKONCE_ODR_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_linkonce_odr_ext_version_t;
@@ -7455,8 +7748,8 @@ typedef enum _ze_linkonce_odr_ext_version_t
 /// @brief Power Saving Hint Extension Version(s)
 typedef enum _ze_power_saving_hint_exp_version_t
 {
-    ZE_POWER_SAVING_HINT_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ), ///< version 1.0
-    ZE_POWER_SAVING_HINT_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ), ///< latest known version
+    ZE_POWER_SAVING_HINT_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),         ///< version 1.0
+    ZE_POWER_SAVING_HINT_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),     ///< latest known version
     ZE_POWER_SAVING_HINT_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_power_saving_hint_exp_version_t;
@@ -7465,10 +7758,10 @@ typedef enum _ze_power_saving_hint_exp_version_t
 /// @brief Supported device types
 typedef enum _ze_power_saving_hint_type_t
 {
-    ZE_POWER_SAVING_HINT_TYPE_MIN = 0,              ///< Minumum power savings. The device will make no attempt to save power
-                                                    ///< while executing work submitted to this context.
-    ZE_POWER_SAVING_HINT_TYPE_MAX = 100,            ///< Maximum power savings. The device will do everything to bring power to
-                                                    ///< a minimum while executing work submitted to this context.
+    ZE_POWER_SAVING_HINT_TYPE_MIN = 0,                                      ///< Minumum power savings. The device will make no attempt to save power
+                                                                            ///< while executing work submitted to this context.
+    ZE_POWER_SAVING_HINT_TYPE_MAX = 100,                                    ///< Maximum power savings. The device will do everything to bring power to
+                                                                            ///< a minimum while executing work submitted to this context.
     ZE_POWER_SAVING_HINT_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } ze_power_saving_hint_type_t;
@@ -7477,11 +7770,11 @@ typedef enum _ze_power_saving_hint_type_t
 /// @brief Extended context descriptor containing power saving hint.
 typedef struct _ze_context_power_saving_hint_exp_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t hint;                                  ///< [in] power saving hint (default value = 0). This is value from [0,100]
-                                                    ///< and can use pre-defined settings from ::ze_power_saving_hint_type_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t hint;                                                          ///< [in] power saving hint (default value = 0). This is value from [0,100]
+                                                                            ///< and can use pre-defined settings from ::ze_power_saving_hint_type_t.
 
 } ze_context_power_saving_hint_exp_desc_t;
 
@@ -7502,8 +7795,8 @@ typedef struct _ze_context_power_saving_hint_exp_desc_t
 /// @brief Subgroups Extension Version(s)
 typedef enum _ze_subgroup_ext_version_t
 {
-    ZE_SUBGROUP_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
-    ZE_SUBGROUP_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
+    ZE_SUBGROUP_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                  ///< version 1.0
+    ZE_SUBGROUP_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),              ///< latest known version
     ZE_SUBGROUP_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_subgroup_ext_version_t;
@@ -7525,8 +7818,8 @@ typedef enum _ze_subgroup_ext_version_t
 /// @brief EU Count Extension Version(s)
 typedef enum _ze_eu_count_ext_version_t
 {
-    ZE_EU_COUNT_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
-    ZE_EU_COUNT_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
+    ZE_EU_COUNT_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                  ///< version 1.0
+    ZE_EU_COUNT_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),              ///< latest known version
     ZE_EU_COUNT_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_eu_count_ext_version_t;
@@ -7535,15 +7828,15 @@ typedef enum _ze_eu_count_ext_version_t
 /// @brief EU count queried using ::zeDeviceGetProperties
 /// 
 /// @details
-///     - This structure may be returned from ::zeDeviceGetProperties via
-///       `pNext` member of ::ze_device_properties_t
+///     - This structure may be returned from ::zeDeviceGetProperties via the
+///       `pNext` member of ::ze_device_properties_t.
 ///     - Used for determining the total number of EUs available on device.
 typedef struct _ze_eu_count_ext_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t numTotalEUs;                           ///< [out] Total number of EUs available
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t numTotalEUs;                                                   ///< [out] Total number of EUs available
 
 } ze_eu_count_ext_t;
 
@@ -7564,8 +7857,8 @@ typedef struct _ze_eu_count_ext_t
 /// @brief PCI Properties Extension Version(s)
 typedef enum _ze_pci_properties_ext_version_t
 {
-    ZE_PCI_PROPERTIES_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),///< version 1.0
-    ZE_PCI_PROPERTIES_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),///< latest known version
+    ZE_PCI_PROPERTIES_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),            ///< version 1.0
+    ZE_PCI_PROPERTIES_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),        ///< latest known version
     ZE_PCI_PROPERTIES_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_pci_properties_ext_version_t;
@@ -7580,10 +7873,10 @@ typedef enum _ze_pci_properties_ext_version_t
 ///       is useful for locating the device in the PCI switch fabric.
 typedef struct _ze_pci_address_ext_t
 {
-    uint32_t domain;                                ///< [out] PCI domain number
-    uint32_t bus;                                   ///< [out] PCI BDF bus number
-    uint32_t device;                                ///< [out] PCI BDF device number
-    uint32_t function;                              ///< [out] PCI BDF function number
+    uint32_t domain;                                                        ///< [out] PCI domain number
+    uint32_t bus;                                                           ///< [out] PCI BDF bus number
+    uint32_t device;                                                        ///< [out] PCI BDF device number
+    uint32_t function;                                                      ///< [out] PCI BDF function number
 
 } ze_pci_address_ext_t;
 
@@ -7591,12 +7884,12 @@ typedef struct _ze_pci_address_ext_t
 /// @brief Device PCI speed
 typedef struct _ze_pci_speed_ext_t
 {
-    int32_t genVersion;                             ///< [out] The link generation. A value of -1 means that this property is
-                                                    ///< unknown.
-    int32_t width;                                  ///< [out] The number of lanes. A value of -1 means that this property is
-                                                    ///< unknown.
-    int64_t maxBandwidth;                           ///< [out] The theoretical maximum bandwidth in bytes/sec (sum of all
-                                                    ///< lanes). A value of -1 means that this property is unknown.
+    int32_t genVersion;                                                     ///< [out] The link generation. A value of -1 means that this property is
+                                                                            ///< unknown.
+    int32_t width;                                                          ///< [out] The number of lanes. A value of -1 means that this property is
+                                                                            ///< unknown.
+    int64_t maxBandwidth;                                                   ///< [out] The theoretical maximum bandwidth in bytes/sec (sum of all
+                                                                            ///< lanes). A value of -1 means that this property is unknown.
 
 } ze_pci_speed_ext_t;
 
@@ -7604,12 +7897,12 @@ typedef struct _ze_pci_speed_ext_t
 /// @brief Static PCI properties
 typedef struct _ze_pci_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_pci_address_ext_t address;                   ///< [out] The BDF address
-    ze_pci_speed_ext_t maxSpeed;                    ///< [out] Fastest port configuration supported by the device (sum of all
-                                                    ///< lanes)
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_pci_address_ext_t address;                                           ///< [out] The BDF address
+    ze_pci_speed_ext_t maxSpeed;                                            ///< [out] Fastest port configuration supported by the device (sum of all
+                                                                            ///< lanes)
 
 } ze_pci_ext_properties_t;
 
@@ -7636,8 +7929,8 @@ typedef struct _ze_pci_ext_properties_t
 ///         + `nullptr == pPciProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDevicePciGetPropertiesExt(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device object.
-    ze_pci_ext_properties_t* pPciProperties         ///< [in,out] returns the PCI properties of the device.
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device object.
+    ze_pci_ext_properties_t* pPciProperties                                 ///< [in,out] returns the PCI properties of the device.
     );
 
 #if !defined(__GNUC__)
@@ -7657,8 +7950,8 @@ zeDevicePciGetPropertiesExt(
 /// @brief sRGB Extension Version(s)
 typedef enum _ze_srgb_ext_version_t
 {
-    ZE_SRGB_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
-    ZE_SRGB_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
+    ZE_SRGB_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                      ///< version 1.0
+    ZE_SRGB_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),                  ///< latest known version
     ZE_SRGB_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_srgb_ext_version_t;
@@ -7672,10 +7965,10 @@ typedef enum _ze_srgb_ext_version_t
 ///     - Used for specifying that the image is in sRGB format.
 typedef struct _ze_srgb_ext_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t sRGB;                                 ///< [in] Is sRGB.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t sRGB;                                                         ///< [in] Is sRGB.
 
 } ze_srgb_ext_desc_t;
 
@@ -7696,8 +7989,8 @@ typedef struct _ze_srgb_ext_desc_t
 /// @brief Image Copy Extension Version(s)
 typedef enum _ze_image_copy_ext_version_t
 {
-    ZE_IMAGE_COPY_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),///< version 1.0
-    ZE_IMAGE_COPY_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),///< latest known version
+    ZE_IMAGE_COPY_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                ///< version 1.0
+    ZE_IMAGE_COPY_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),            ///< latest known version
     ZE_IMAGE_COPY_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_image_copy_ext_version_t;
@@ -7751,19 +8044,19 @@ typedef enum _ze_image_copy_ext_version_t
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendImageCopyToMemoryExt(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    void* dstptr,                                   ///< [in] pointer to destination memory to copy to
-    ze_image_handle_t hSrcImage,                    ///< [in] handle of source image to copy from
-    const ze_image_region_t* pSrcRegion,            ///< [in][optional] source region descriptor
-    uint32_t destRowPitch,                          ///< [in] size in bytes of the 1D slice of the 2D region of a 2D or 3D
-                                                    ///< image or each image of a 1D or 2D image array being written
-    uint32_t destSlicePitch,                        ///< [in] size in bytes of the 2D slice of the 3D region of a 3D image or
-                                                    ///< each image of a 1D or 2D image array being written
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    void* dstptr,                                                           ///< [in] pointer to destination memory to copy to
+    ze_image_handle_t hSrcImage,                                            ///< [in] handle of source image to copy from
+    const ze_image_region_t* pSrcRegion,                                    ///< [in][optional] source region descriptor
+    uint32_t destRowPitch,                                                  ///< [in] size in bytes of the 1D slice of the 2D region of a 2D or 3D
+                                                                            ///< image or each image of a 1D or 2D image array being written
+    uint32_t destSlicePitch,                                                ///< [in] size in bytes of the 2D slice of the 3D region of a 3D image or
+                                                                            ///< each image of a 1D or 2D image array being written
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -7815,19 +8108,19 @@ zeCommandListAppendImageCopyToMemoryExt(
 ///         + `(nullptr == phWaitEvents) && (0 < numWaitEvents)`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeCommandListAppendImageCopyFromMemoryExt(
-    ze_command_list_handle_t hCommandList,          ///< [in] handle of command list
-    ze_image_handle_t hDstImage,                    ///< [in] handle of destination image to copy to
-    const void* srcptr,                             ///< [in] pointer to source memory to copy from
-    const ze_image_region_t* pDstRegion,            ///< [in][optional] destination region descriptor
-    uint32_t srcRowPitch,                           ///< [in] size in bytes of the 1D slice of the 2D region of a 2D or 3D
-                                                    ///< image or each image of a 1D or 2D image array being read
-    uint32_t srcSlicePitch,                         ///< [in] size in bytes of the 2D slice of the 3D region of a 3D image or
-                                                    ///< each image of a 1D or 2D image array being read
-    ze_event_handle_t hSignalEvent,                 ///< [in][optional] handle of the event to signal on completion
-    uint32_t numWaitEvents,                         ///< [in][optional] number of events to wait on before launching; must be 0
-                                                    ///< if `nullptr == phWaitEvents`
-    ze_event_handle_t* phWaitEvents                 ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
-                                                    ///< on before launching
+    ze_command_list_handle_t hCommandList,                                  ///< [in] handle of command list
+    ze_image_handle_t hDstImage,                                            ///< [in] handle of destination image to copy to
+    const void* srcptr,                                                     ///< [in] pointer to source memory to copy from
+    const ze_image_region_t* pDstRegion,                                    ///< [in][optional] destination region descriptor
+    uint32_t srcRowPitch,                                                   ///< [in] size in bytes of the 1D slice of the 2D region of a 2D or 3D
+                                                                            ///< image or each image of a 1D or 2D image array being read
+    uint32_t srcSlicePitch,                                                 ///< [in] size in bytes of the 2D slice of the 3D region of a 3D image or
+                                                                            ///< each image of a 1D or 2D image array being read
+    ze_event_handle_t hSignalEvent,                                         ///< [in][optional] handle of the event to signal on completion
+    uint32_t numWaitEvents,                                                 ///< [in][optional] number of events to wait on before launching; must be 0
+                                                                            ///< if `nullptr == phWaitEvents`
+    ze_event_handle_t* phWaitEvents                                         ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait
+                                                                            ///< on before launching
     );
 
 #if !defined(__GNUC__)
@@ -7858,10 +8151,10 @@ typedef enum _ze_image_query_alloc_properties_ext_version_t
 ///        ::zeImageGetAllocPropertiesExt
 typedef struct _ze_image_allocation_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint64_t id;                                    ///< [out] identifier for this allocation
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint64_t id;                                                            ///< [out] identifier for this allocation
 
 } ze_image_allocation_ext_properties_t;
 
@@ -7884,9 +8177,9 @@ typedef struct _ze_image_allocation_ext_properties_t
 ///         + `nullptr == pImageAllocProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeImageGetAllocPropertiesExt(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    ze_image_handle_t hImage,                       ///< [in] handle of image object to query
-    ze_image_allocation_ext_properties_t* pImageAllocProperties ///< [in,out] query result for image allocation properties
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    ze_image_handle_t hImage,                                               ///< [in] handle of image object to query
+    ze_image_allocation_ext_properties_t* pImageAllocProperties             ///< [in,out] query result for image allocation properties
     );
 
 #if !defined(__GNUC__)
@@ -7906,8 +8199,8 @@ zeImageGetAllocPropertiesExt(
 /// @brief Linkage Inspection Extension Version(s)
 typedef enum _ze_linkage_inspection_ext_version_t
 {
-    ZE_LINKAGE_INSPECTION_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),///< version 1.0
-    ZE_LINKAGE_INSPECTION_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),///< latest known version
+    ZE_LINKAGE_INSPECTION_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),        ///< version 1.0
+    ZE_LINKAGE_INSPECTION_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),    ///< latest known version
     ZE_LINKAGE_INSPECTION_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_linkage_inspection_ext_version_t;
@@ -7917,9 +8210,9 @@ typedef enum _ze_linkage_inspection_ext_version_t
 typedef uint32_t ze_linkage_inspection_ext_flags_t;
 typedef enum _ze_linkage_inspection_ext_flag_t
 {
-    ZE_LINKAGE_INSPECTION_EXT_FLAG_IMPORTS = ZE_BIT(0), ///< List all imports of modules
-    ZE_LINKAGE_INSPECTION_EXT_FLAG_UNRESOLVABLE_IMPORTS = ZE_BIT(1),///< List all imports of modules that do not have a corresponding export
-    ZE_LINKAGE_INSPECTION_EXT_FLAG_EXPORTS = ZE_BIT(2), ///< List all exports of modules
+    ZE_LINKAGE_INSPECTION_EXT_FLAG_IMPORTS = ZE_BIT(0),                     ///< List all imports of modules
+    ZE_LINKAGE_INSPECTION_EXT_FLAG_UNRESOLVABLE_IMPORTS = ZE_BIT(1),        ///< List all imports of modules that do not have a corresponding export
+    ZE_LINKAGE_INSPECTION_EXT_FLAG_EXPORTS = ZE_BIT(2),                     ///< List all exports of modules
     ZE_LINKAGE_INSPECTION_EXT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_linkage_inspection_ext_flag_t;
@@ -7931,11 +8224,11 @@ typedef enum _ze_linkage_inspection_ext_flag_t
 ///     - This structure may be passed to ::zeModuleInspectLinkageExt.
 typedef struct _ze_linkage_inspection_ext_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_linkage_inspection_ext_flags_t flags;        ///< [in] flags specifying module linkage inspection.
-                                                    ///< must be 0 (default) or a valid combination of ::ze_linkage_inspection_ext_flag_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_linkage_inspection_ext_flags_t flags;                                ///< [in] flags specifying module linkage inspection.
+                                                                            ///< must be 0 (default) or a valid combination of ::ze_linkage_inspection_ext_flag_t.
 
 } ze_linkage_inspection_ext_desc_t;
 
@@ -7964,12 +8257,12 @@ typedef struct _ze_linkage_inspection_ext_desc_t
 ///         + `0x7 < pInspectDesc->flags`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeModuleInspectLinkageExt(
-    ze_linkage_inspection_ext_desc_t* pInspectDesc, ///< [in] pointer to linkage inspection descriptor structure.
-    uint32_t numModules,                            ///< [in] number of modules to be inspected pointed to by phModules.
-    ze_module_handle_t* phModules,                  ///< [in][range(0, numModules)] pointer to an array of modules to be
-                                                    ///< inspected for import dependencies.
-    ze_module_build_log_handle_t* phLog             ///< [out] pointer to handle of linkage inspection log. Log object will
-                                                    ///< contain separate lists of imports, un-resolvable imports, and exports.
+    ze_linkage_inspection_ext_desc_t* pInspectDesc,                         ///< [in] pointer to linkage inspection descriptor structure.
+    uint32_t numModules,                                                    ///< [in] number of modules to be inspected pointed to by phModules.
+    ze_module_handle_t* phModules,                                          ///< [in][range(0, numModules)] pointer to an array of modules to be
+                                                                            ///< inspected for import dependencies.
+    ze_module_build_log_handle_t* phLog                                     ///< [out] pointer to handle of linkage inspection log. Log object will
+                                                                            ///< contain separate lists of imports, un-resolvable imports, and exports.
     );
 
 #if !defined(__GNUC__)
@@ -8000,8 +8293,8 @@ typedef enum _ze_memory_compression_hints_ext_version_t
 typedef uint32_t ze_memory_compression_hints_ext_flags_t;
 typedef enum _ze_memory_compression_hints_ext_flag_t
 {
-    ZE_MEMORY_COMPRESSION_HINTS_EXT_FLAG_COMPRESSED = ZE_BIT(0),///< Hint Driver implementation to make allocation compressible
-    ZE_MEMORY_COMPRESSION_HINTS_EXT_FLAG_UNCOMPRESSED = ZE_BIT(1),  ///< Hint Driver implementation to make allocation not compressible
+    ZE_MEMORY_COMPRESSION_HINTS_EXT_FLAG_COMPRESSED = ZE_BIT(0),            ///< Hint Driver implementation to make allocation compressible
+    ZE_MEMORY_COMPRESSION_HINTS_EXT_FLAG_UNCOMPRESSED = ZE_BIT(1),          ///< Hint Driver implementation to make allocation not compressible
     ZE_MEMORY_COMPRESSION_HINTS_EXT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_memory_compression_hints_ext_flag_t;
@@ -8011,19 +8304,19 @@ typedef enum _ze_memory_compression_hints_ext_flag_t
 /// 
 /// @details
 ///     - This structure may be passed to ::zeMemAllocShared or
-///       ::zeMemAllocDevice, via `pNext` member of
+///       ::zeMemAllocDevice, via the `pNext` member of
 ///       ::ze_device_mem_alloc_desc_t.
-///     - This structure may be passed to ::zeMemAllocHost, via `pNext` member
-///       of ::ze_host_mem_alloc_desc_t.
-///     - This structure may be passed to ::zeImageCreate, via `pNext` member of
-///       ::ze_image_desc_t.
+///     - This structure may be passed to ::zeMemAllocHost, via the `pNext`
+///       member of ::ze_host_mem_alloc_desc_t.
+///     - This structure may be passed to ::zeImageCreate, via the `pNext`
+///       member of ::ze_image_desc_t.
 typedef struct _ze_memory_compression_hints_ext_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_memory_compression_hints_ext_flags_t flags;  ///< [in] flags specifying if allocation should be compressible or not.
-                                                    ///< Must be set to one of the ::ze_memory_compression_hints_ext_flag_t;
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_memory_compression_hints_ext_flags_t flags;                          ///< [in] flags specifying if allocation should be compressible or not.
+                                                                            ///< Must be set to one of the ::ze_memory_compression_hints_ext_flag_t;
 
 } ze_memory_compression_hints_ext_desc_t;
 
@@ -8044,7 +8337,7 @@ typedef struct _ze_memory_compression_hints_ext_desc_t
 /// @brief Memory Free Policies Extension Version(s)
 typedef enum _ze_memory_free_policies_ext_version_t
 {
-    ZE_MEMORY_FREE_POLICIES_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
+    ZE_MEMORY_FREE_POLICIES_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),      ///< version 1.0
     ZE_MEMORY_FREE_POLICIES_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
     ZE_MEMORY_FREE_POLICIES_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
@@ -8055,8 +8348,8 @@ typedef enum _ze_memory_free_policies_ext_version_t
 typedef uint32_t ze_driver_memory_free_policy_ext_flags_t;
 typedef enum _ze_driver_memory_free_policy_ext_flag_t
 {
-    ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_BLOCKING_FREE = ZE_BIT(0),///< blocks until all commands using the memory are complete before freeing
-    ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_DEFER_FREE = ZE_BIT(1),   ///< schedules the memory to be freed but does not free immediately
+    ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_BLOCKING_FREE = ZE_BIT(0),        ///< blocks until all commands using the memory are complete before freeing
+    ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_DEFER_FREE = ZE_BIT(1),           ///< schedules the memory to be freed but does not free immediately
     ZE_DRIVER_MEMORY_FREE_POLICY_EXT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_driver_memory_free_policy_ext_flag_t;
@@ -8067,15 +8360,15 @@ typedef enum _ze_driver_memory_free_policy_ext_flag_t
 /// @details
 ///     - All drivers must support an immediate free policy, which is the
 ///       default free policy.
-///     - This structure may be returned from ::zeDriverGetProperties, via
+///     - This structure may be returned from ::zeDriverGetProperties, via the
 ///       `pNext` member of ::ze_driver_properties_t.
 typedef struct _ze_driver_memory_free_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_driver_memory_free_policy_ext_flags_t freePolicies;  ///< [out] Supported memory free policies.
-                                                    ///< must be 0 or a combination of ::ze_driver_memory_free_policy_ext_flag_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_driver_memory_free_policy_ext_flags_t freePolicies;                  ///< [out] Supported memory free policies.
+                                                                            ///< must be 0 or a combination of ::ze_driver_memory_free_policy_ext_flag_t.
 
 } ze_driver_memory_free_ext_properties_t;
 
@@ -8083,12 +8376,12 @@ typedef struct _ze_driver_memory_free_ext_properties_t
 /// @brief Memory free descriptor with free policy
 typedef struct _ze_memory_free_ext_desc_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_driver_memory_free_policy_ext_flags_t freePolicy;///< [in] flags specifying the memory free policy.
-                                                    ///< must be 0 (default) or a supported ::ze_driver_memory_free_policy_ext_flag_t;
-                                                    ///< default behavior is to free immediately.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_driver_memory_free_policy_ext_flags_t freePolicy;                    ///< [in] flags specifying the memory free policy.
+                                                                            ///< must be 0 (default) or a supported ::ze_driver_memory_free_policy_ext_flag_t;
+                                                                            ///< default behavior is to free immediately.
 
 } ze_memory_free_ext_desc_t;
 
@@ -8117,9 +8410,9 @@ typedef struct _ze_memory_free_ext_desc_t
 ///         + `0x3 < pMemFreeDesc->freePolicy`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeMemFreeExt(
-    ze_context_handle_t hContext,                   ///< [in] handle of the context object
-    const ze_memory_free_ext_desc_t* pMemFreeDesc,  ///< [in] pointer to memory free descriptor
-    void* ptr                                       ///< [in][release] pointer to memory to free
+    ze_context_handle_t hContext,                                           ///< [in] handle of the context object
+    const ze_memory_free_ext_desc_t* pMemFreeDesc,                          ///< [in] pointer to memory free descriptor
+    void* ptr                                                               ///< [in][release] pointer to memory to free
     );
 
 #if !defined(__GNUC__)
@@ -8143,19 +8436,19 @@ zeMemFreeExt(
 ///       the pNext member of ::ze_device_p2p_properties_t point at this struct.
 typedef struct _ze_device_p2p_bandwidth_exp_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t logicalBandwidth;                      ///< [out] total logical design bandwidth for all links connecting the two
-                                                    ///< devices
-    uint32_t physicalBandwidth;                     ///< [out] total physical design bandwidth for all links connecting the two
-                                                    ///< devices
-    ze_bandwidth_unit_t bandwidthUnit;              ///< [out] bandwidth unit
-    uint32_t logicalLatency;                        ///< [out] average logical design latency for all links connecting the two
-                                                    ///< devices
-    uint32_t physicalLatency;                       ///< [out] average physical design latency for all links connecting the two
-                                                    ///< devices
-    ze_latency_unit_t latencyUnit;                  ///< [out] latency unit
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t logicalBandwidth;                                              ///< [out] total logical design bandwidth for all links connecting the two
+                                                                            ///< devices
+    uint32_t physicalBandwidth;                                             ///< [out] total physical design bandwidth for all links connecting the two
+                                                                            ///< devices
+    ze_bandwidth_unit_t bandwidthUnit;                                      ///< [out] bandwidth unit
+    uint32_t logicalLatency;                                                ///< [out] average logical design latency for all links connecting the two
+                                                                            ///< devices
+    uint32_t physicalLatency;                                               ///< [out] average physical design latency for all links connecting the two
+                                                                            ///< devices
+    ze_latency_unit_t latencyUnit;                                          ///< [out] latency unit
 
 } ze_device_p2p_bandwidth_exp_properties_t;
 
@@ -8168,12 +8461,12 @@ typedef struct _ze_device_p2p_bandwidth_exp_properties_t
 ///       ::ze_command_queue_group_properties_t point at this struct.
 typedef struct _ze_copy_bandwidth_exp_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t copyBandwidth;                         ///< [out] design bandwidth supported by this engine type for copy
-                                                    ///< operations
-    ze_bandwidth_unit_t copyBandwidthUnit;          ///< [out] copy bandwidth unit
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t copyBandwidth;                                                 ///< [out] design bandwidth supported by this engine type for copy
+                                                                            ///< operations
+    ze_bandwidth_unit_t copyBandwidthUnit;                                  ///< [out] copy bandwidth unit
 
 } ze_copy_bandwidth_exp_properties_t;
 
@@ -8194,8 +8487,8 @@ typedef struct _ze_copy_bandwidth_exp_properties_t
 /// @brief Device Local Identifier (LUID) Extension Version(s)
 typedef enum _ze_device_luid_ext_version_t
 {
-    ZE_DEVICE_LUID_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),   ///< version 1.0
-    ZE_DEVICE_LUID_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),   ///< latest known version
+    ZE_DEVICE_LUID_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),               ///< version 1.0
+    ZE_DEVICE_LUID_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),           ///< latest known version
     ZE_DEVICE_LUID_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_luid_ext_version_t;
@@ -8210,7 +8503,7 @@ typedef enum _ze_device_luid_ext_version_t
 /// @brief Device local identifier (LUID)
 typedef struct _ze_device_luid_ext_t
 {
-    uint8_t id[ZE_MAX_DEVICE_LUID_SIZE_EXT];        ///< [out] opaque data representing a device LUID
+    uint8_t id[ZE_MAX_DEVICE_LUID_SIZE_EXT];                                ///< [out] opaque data representing a device LUID
 
 } ze_device_luid_ext_t;
 
@@ -8218,26 +8511,26 @@ typedef struct _ze_device_luid_ext_t
 /// @brief Device LUID properties queried using ::zeDeviceGetProperties
 /// 
 /// @details
-///     - This structure may be returned from ::zeDeviceGetProperties, via
+///     - This structure may be returned from ::zeDeviceGetProperties, via the
 ///       `pNext` member of ::ze_device_properties_t.
 typedef struct _ze_device_luid_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_luid_ext_t luid;                      ///< [out] locally unique identifier (LUID).
-                                                    ///< The returned LUID can be cast to a LUID object and must be equal to
-                                                    ///< the locally
-                                                    ///< unique identifier of an IDXGIAdapter1 object that corresponds to the device.
-    uint32_t nodeMask;                              ///< [out] node mask.
-                                                    ///< The returned node mask must contain exactly one bit.
-                                                    ///< If the device is running on an operating system that supports the
-                                                    ///< Direct3D 12 API
-                                                    ///< and the device corresponds to an individual device in a linked device
-                                                    ///< adapter, the
-                                                    ///< returned node mask identifies the Direct3D 12 node corresponding to
-                                                    ///< the device.
-                                                    ///< Otherwise, the returned node mask must be 1.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_luid_ext_t luid;                                              ///< [out] locally unique identifier (LUID).
+                                                                            ///< The returned LUID can be cast to a LUID object and must be equal to
+                                                                            ///< the locally
+                                                                            ///< unique identifier of an IDXGIAdapter1 object that corresponds to the device.
+    uint32_t nodeMask;                                                      ///< [out] node mask.
+                                                                            ///< The returned node mask must contain exactly one bit.
+                                                                            ///< If the device is running on an operating system that supports the
+                                                                            ///< Direct3D 12 API
+                                                                            ///< and the device corresponds to an individual device in a linked device
+                                                                            ///< adapter, the
+                                                                            ///< returned node mask identifies the Direct3D 12 node corresponding to
+                                                                            ///< the device.
+                                                                            ///< Otherwise, the returned node mask must be 1.
 
 } ze_device_luid_ext_properties_t;
 
@@ -8264,10 +8557,10 @@ typedef struct _ze_device_luid_ext_properties_t
 /// @brief Fabric Vertex types
 typedef enum _ze_fabric_vertex_exp_type_t
 {
-    ZE_FABRIC_VERTEX_EXP_TYPE_UNKNOWN = 0,          ///< Fabric vertex type is unknown
-    ZE_FABRIC_VERTEX_EXP_TYPE_DEVICE = 1,           ///< Fabric vertex represents a device
-    ZE_FABRIC_VERTEX_EXP_TYPE_SUBDEVICE = 2,        ///< Fabric vertex represents a subdevice
-    ZE_FABRIC_VERTEX_EXP_TYPE_SWITCH = 3,           ///< Fabric vertex represents a switch
+    ZE_FABRIC_VERTEX_EXP_TYPE_UNKNOWN = 0,                                  ///< Fabric vertex type is unknown
+    ZE_FABRIC_VERTEX_EXP_TYPE_DEVICE = 1,                                   ///< Fabric vertex represents a device
+    ZE_FABRIC_VERTEX_EXP_TYPE_SUBDEVICE = 2,                                ///< Fabric vertex represents a subdevice
+    ZE_FABRIC_VERTEX_EXP_TYPE_SWITCH = 3,                                   ///< Fabric vertex represents a switch
     ZE_FABRIC_VERTEX_EXP_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } ze_fabric_vertex_exp_type_t;
@@ -8276,11 +8569,11 @@ typedef enum _ze_fabric_vertex_exp_type_t
 /// @brief Fabric edge duplexity
 typedef enum _ze_fabric_edge_exp_duplexity_t
 {
-    ZE_FABRIC_EDGE_EXP_DUPLEXITY_UNKNOWN = 0,       ///< Fabric edge duplexity is unknown
-    ZE_FABRIC_EDGE_EXP_DUPLEXITY_HALF_DUPLEX = 1,   ///< Fabric edge is half duplex, i.e. stated bandwidth is obtained in only
-                                                    ///< one direction at time
-    ZE_FABRIC_EDGE_EXP_DUPLEXITY_FULL_DUPLEX = 2,   ///< Fabric edge is full duplex, i.e. stated bandwidth is supported in both
-                                                    ///< directions simultaneously
+    ZE_FABRIC_EDGE_EXP_DUPLEXITY_UNKNOWN = 0,                               ///< Fabric edge duplexity is unknown
+    ZE_FABRIC_EDGE_EXP_DUPLEXITY_HALF_DUPLEX = 1,                           ///< Fabric edge is half duplex, i.e. stated bandwidth is obtained in only
+                                                                            ///< one direction at time
+    ZE_FABRIC_EDGE_EXP_DUPLEXITY_FULL_DUPLEX = 2,                           ///< Fabric edge is full duplex, i.e. stated bandwidth is supported in both
+                                                                            ///< directions simultaneously
     ZE_FABRIC_EDGE_EXP_DUPLEXITY_FORCE_UINT32 = 0x7fffffff
 
 } ze_fabric_edge_exp_duplexity_t;
@@ -8293,10 +8586,10 @@ typedef enum _ze_fabric_edge_exp_duplexity_t
 ///       is useful for locating the device in the PCI switch fabric.
 typedef struct _ze_fabric_vertex_pci_exp_address_t
 {
-    uint32_t domain;                                ///< [out] PCI domain number
-    uint32_t bus;                                   ///< [out] PCI BDF bus number
-    uint32_t device;                                ///< [out] PCI BDF device number
-    uint32_t function;                              ///< [out] PCI BDF function number
+    uint32_t domain;                                                        ///< [out] PCI domain number
+    uint32_t bus;                                                           ///< [out] PCI BDF bus number
+    uint32_t device;                                                        ///< [out] PCI BDF device number
+    uint32_t function;                                                      ///< [out] PCI BDF function number
 
 } ze_fabric_vertex_pci_exp_address_t;
 
@@ -8304,17 +8597,17 @@ typedef struct _ze_fabric_vertex_pci_exp_address_t
 /// @brief Fabric Vertex properties
 typedef struct _ze_fabric_vertex_exp_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_uuid_t uuid;                                 ///< [out] universal unique identifier. If the vertex is co-located with a
-                                                    ///< device/subdevice, then this uuid will match that of the corresponding
-                                                    ///< device/subdevice
-    ze_fabric_vertex_exp_type_t type;               ///< [out] does the fabric vertex represent a device, subdevice, or switch?
-    ze_bool_t remote;                               ///< [out] does the fabric vertex live on the local node or on a remote
-                                                    ///< node?
-    ze_fabric_vertex_pci_exp_address_t address;     ///< [out] B/D/F address of fabric vertex & associated device/subdevice if
-                                                    ///< available
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_uuid_t uuid;                                                         ///< [out] universal unique identifier. If the vertex is co-located with a
+                                                                            ///< device/subdevice, then this uuid will match that of the corresponding
+                                                                            ///< device/subdevice
+    ze_fabric_vertex_exp_type_t type;                                       ///< [out] does the fabric vertex represent a device, subdevice, or switch?
+    ze_bool_t remote;                                                       ///< [out] does the fabric vertex live on the local node or on a remote
+                                                                            ///< node?
+    ze_fabric_vertex_pci_exp_address_t address;                             ///< [out] B/D/F address of fabric vertex & associated device/subdevice if
+                                                                            ///< available
 
 } ze_fabric_vertex_exp_properties_t;
 
@@ -8322,17 +8615,17 @@ typedef struct _ze_fabric_vertex_exp_properties_t
 /// @brief Fabric Edge properties
 typedef struct _ze_fabric_edge_exp_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_uuid_t uuid;                                 ///< [out] universal unique identifier.
-    char model[ZE_MAX_FABRIC_EDGE_MODEL_EXP_SIZE];  ///< [out] Description of fabric edge technology. Will be set to the string
-                                                    ///< "unkown" if this cannot be determined for this edge
-    uint32_t bandwidth;                             ///< [out] design bandwidth
-    ze_bandwidth_unit_t bandwidthUnit;              ///< [out] bandwidth unit
-    uint32_t latency;                               ///< [out] design latency
-    ze_latency_unit_t latencyUnit;                  ///< [out] latency unit
-    ze_fabric_edge_exp_duplexity_t duplexity;       ///< [out] Duplexity of the fabric edge
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_uuid_t uuid;                                                         ///< [out] universal unique identifier.
+    char model[ZE_MAX_FABRIC_EDGE_MODEL_EXP_SIZE];                          ///< [out] Description of fabric edge technology. Will be set to the string
+                                                                            ///< "unkown" if this cannot be determined for this edge
+    uint32_t bandwidth;                                                     ///< [out] design bandwidth
+    ze_bandwidth_unit_t bandwidthUnit;                                      ///< [out] bandwidth unit
+    uint32_t latency;                                                       ///< [out] design latency
+    ze_latency_unit_t latencyUnit;                                          ///< [out] latency unit
+    ze_fabric_edge_exp_duplexity_t duplexity;                               ///< [out] Duplexity of the fabric edge
 
 } ze_fabric_edge_exp_properties_t;
 
@@ -8357,16 +8650,16 @@ typedef struct _ze_fabric_edge_exp_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFabricVertexGetExp(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of fabric vertices.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of fabric vertices available.
-                                                    ///< if count is greater than the number of fabric vertices available, then
-                                                    ///< the driver shall update the value with the correct number of fabric
-                                                    ///< vertices available.
-    ze_fabric_vertex_handle_t* phVertices           ///< [in,out][optional][range(0, *pCount)] array of handle of fabric vertices.
-                                                    ///< if count is less than the number of fabric vertices available, then
-                                                    ///< driver shall only retrieve that number of fabric vertices.
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of fabric vertices.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of fabric vertices available.
+                                                                            ///< if count is greater than the number of fabric vertices available, then
+                                                                            ///< the driver shall update the value with the correct number of fabric
+                                                                            ///< vertices available.
+    ze_fabric_vertex_handle_t* phVertices                                   ///< [in,out][optional][range(0, *pCount)] array of handle of fabric vertices.
+                                                                            ///< if count is less than the number of fabric vertices available, then
+                                                                            ///< driver shall only retrieve that number of fabric vertices.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -8392,16 +8685,16 @@ zeFabricVertexGetExp(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFabricVertexGetSubVerticesExp(
-    ze_fabric_vertex_handle_t hVertex,              ///< [in] handle of the fabric vertex object
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of sub-vertices.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of sub-vertices available.
-                                                    ///< if count is greater than the number of sub-vertices available, then
-                                                    ///< the driver shall update the value with the correct number of
-                                                    ///< sub-vertices available.
-    ze_fabric_vertex_handle_t* phSubvertices        ///< [in,out][optional][range(0, *pCount)] array of handle of sub-vertices.
-                                                    ///< if count is less than the number of sub-vertices available, then
-                                                    ///< driver shall only retrieve that number of sub-vertices.
+    ze_fabric_vertex_handle_t hVertex,                                      ///< [in] handle of the fabric vertex object
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of sub-vertices.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of sub-vertices available.
+                                                                            ///< if count is greater than the number of sub-vertices available, then
+                                                                            ///< the driver shall update the value with the correct number of
+                                                                            ///< sub-vertices available.
+    ze_fabric_vertex_handle_t* phSubvertices                                ///< [in,out][optional][range(0, *pCount)] array of handle of sub-vertices.
+                                                                            ///< if count is less than the number of sub-vertices available, then
+                                                                            ///< driver shall only retrieve that number of sub-vertices.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -8423,8 +8716,8 @@ zeFabricVertexGetSubVerticesExp(
 ///         + `nullptr == pVertexProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFabricVertexGetPropertiesExp(
-    ze_fabric_vertex_handle_t hVertex,              ///< [in] handle of the fabric vertex
-    ze_fabric_vertex_exp_properties_t* pVertexProperties///< [in,out] query result for fabric vertex properties
+    ze_fabric_vertex_handle_t hVertex,                                      ///< [in] handle of the fabric vertex
+    ze_fabric_vertex_exp_properties_t* pVertexProperties                    ///< [in,out] query result for fabric vertex properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -8450,8 +8743,8 @@ zeFabricVertexGetPropertiesExp(
 ///         + Provided fabric vertex handle corresponds to remote device or subdevice.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFabricVertexGetDeviceExp(
-    ze_fabric_vertex_handle_t hVertex,              ///< [in] handle of the fabric vertex
-    ze_device_handle_t* phDevice                    ///< [out] device handle corresponding to fabric vertex
+    ze_fabric_vertex_handle_t hVertex,                                      ///< [in] handle of the fabric vertex
+    ze_device_handle_t* phDevice                                            ///< [out] device handle corresponding to fabric vertex
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -8475,8 +8768,8 @@ zeFabricVertexGetDeviceExp(
 ///         + Provided device handle does not correspond to a fabric vertex.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeDeviceGetFabricVertexExp(
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device
-    ze_fabric_vertex_handle_t* phVertex             ///< [out] fabric vertex handle corresponding to device
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device
+    ze_fabric_vertex_handle_t* phVertex                                     ///< [out] fabric vertex handle corresponding to device
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -8501,17 +8794,17 @@ zeDeviceGetFabricVertexExp(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFabricEdgeGetExp(
-    ze_fabric_vertex_handle_t hVertexA,             ///< [in] handle of first fabric vertex instance
-    ze_fabric_vertex_handle_t hVertexB,             ///< [in] handle of second fabric vertex instance
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of fabric edges.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of fabric edges available.
-                                                    ///< if count is greater than the number of fabric edges available, then
-                                                    ///< the driver shall update the value with the correct number of fabric
-                                                    ///< edges available.
-    ze_fabric_edge_handle_t* phEdges                ///< [in,out][optional][range(0, *pCount)] array of handle of fabric edges.
-                                                    ///< if count is less than the number of fabric edges available, then
-                                                    ///< driver shall only retrieve that number of fabric edges.
+    ze_fabric_vertex_handle_t hVertexA,                                     ///< [in] handle of first fabric vertex instance
+    ze_fabric_vertex_handle_t hVertexB,                                     ///< [in] handle of second fabric vertex instance
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of fabric edges.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of fabric edges available.
+                                                                            ///< if count is greater than the number of fabric edges available, then
+                                                                            ///< the driver shall update the value with the correct number of fabric
+                                                                            ///< edges available.
+    ze_fabric_edge_handle_t* phEdges                                        ///< [in,out][optional][range(0, *pCount)] array of handle of fabric edges.
+                                                                            ///< if count is less than the number of fabric edges available, then
+                                                                            ///< driver shall only retrieve that number of fabric edges.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -8536,9 +8829,9 @@ zeFabricEdgeGetExp(
 ///         + `nullptr == phVertexB`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFabricEdgeGetVerticesExp(
-    ze_fabric_edge_handle_t hEdge,                  ///< [in] handle of the fabric edge instance
-    ze_fabric_vertex_handle_t* phVertexA,           ///< [out] fabric vertex connected to one end of the given fabric edge.
-    ze_fabric_vertex_handle_t* phVertexB            ///< [out] fabric vertex connected to other end of the given fabric edge.
+    ze_fabric_edge_handle_t hEdge,                                          ///< [in] handle of the fabric edge instance
+    ze_fabric_vertex_handle_t* phVertexA,                                   ///< [out] fabric vertex connected to one end of the given fabric edge.
+    ze_fabric_vertex_handle_t* phVertexB                                    ///< [out] fabric vertex connected to other end of the given fabric edge.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -8560,8 +8853,8 @@ zeFabricEdgeGetVerticesExp(
 ///         + `nullptr == pEdgeProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeFabricEdgeGetPropertiesExp(
-    ze_fabric_edge_handle_t hEdge,                  ///< [in] handle of the fabric edge
-    ze_fabric_edge_exp_properties_t* pEdgeProperties///< [in,out] query result for fabric edge properties
+    ze_fabric_edge_handle_t hEdge,                                          ///< [in] handle of the fabric edge
+    ze_fabric_edge_exp_properties_t* pEdgeProperties                        ///< [in,out] query result for fabric edge properties
     );
 
 #if !defined(__GNUC__)
@@ -8591,28 +8884,28 @@ typedef enum _ze_device_memory_properties_ext_version_t
 /// @brief Memory module types
 typedef enum _ze_device_memory_ext_type_t
 {
-    ZE_DEVICE_MEMORY_EXT_TYPE_HBM = 0,              ///< HBM memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_HBM2 = 1,             ///< HBM2 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_DDR = 2,              ///< DDR memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_DDR2 = 3,             ///< DDR2 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_DDR3 = 4,             ///< DDR3 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_DDR4 = 5,             ///< DDR4 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_DDR5 = 6,             ///< DDR5 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_LPDDR = 7,            ///< LPDDR memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_LPDDR3 = 8,           ///< LPDDR3 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_LPDDR4 = 9,           ///< LPDDR4 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_LPDDR5 = 10,          ///< LPDDR5 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_SRAM = 11,            ///< SRAM memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_L1 = 12,              ///< L1 cache
-    ZE_DEVICE_MEMORY_EXT_TYPE_L3 = 13,              ///< L3 cache
-    ZE_DEVICE_MEMORY_EXT_TYPE_GRF = 14,             ///< Execution unit register file
-    ZE_DEVICE_MEMORY_EXT_TYPE_SLM = 15,             ///< Execution unit shared local memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR4 = 16,           ///< GDDR4 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR5 = 17,           ///< GDDR5 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR5X = 18,          ///< GDDR5X memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR6 = 19,           ///< GDDR6 memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR6X = 20,          ///< GDDR6X memory
-    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR7 = 21,           ///< GDDR7 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_HBM = 0,                                      ///< HBM memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_HBM2 = 1,                                     ///< HBM2 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_DDR = 2,                                      ///< DDR memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_DDR2 = 3,                                     ///< DDR2 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_DDR3 = 4,                                     ///< DDR3 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_DDR4 = 5,                                     ///< DDR4 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_DDR5 = 6,                                     ///< DDR5 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_LPDDR = 7,                                    ///< LPDDR memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_LPDDR3 = 8,                                   ///< LPDDR3 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_LPDDR4 = 9,                                   ///< LPDDR4 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_LPDDR5 = 10,                                  ///< LPDDR5 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_SRAM = 11,                                    ///< SRAM memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_L1 = 12,                                      ///< L1 cache
+    ZE_DEVICE_MEMORY_EXT_TYPE_L3 = 13,                                      ///< L3 cache
+    ZE_DEVICE_MEMORY_EXT_TYPE_GRF = 14,                                     ///< Execution unit register file
+    ZE_DEVICE_MEMORY_EXT_TYPE_SLM = 15,                                     ///< Execution unit shared local memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR4 = 16,                                   ///< GDDR4 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR5 = 17,                                   ///< GDDR5 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR5X = 18,                                  ///< GDDR5X memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR6 = 19,                                   ///< GDDR6 memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR6X = 20,                                  ///< GDDR6X memory
+    ZE_DEVICE_MEMORY_EXT_TYPE_GDDR7 = 21,                                   ///< GDDR7 memory
     ZE_DEVICE_MEMORY_EXT_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_memory_ext_type_t;
@@ -8625,16 +8918,16 @@ typedef enum _ze_device_memory_ext_type_t
 ///       the `pNext` member of ::ze_device_memory_properties_t
 typedef struct _ze_device_memory_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_memory_ext_type_t type;               ///< [out] The memory type
-    uint64_t physicalSize;                          ///< [out] Physical memory size in bytes. A value of 0 indicates that this
-                                                    ///< property is not known. However, a call to $sMemoryGetState() will
-                                                    ///< correctly return the total size of usable memory.
-    uint32_t readBandwidth;                         ///< [out] Design bandwidth for reads
-    uint32_t writeBandwidth;                        ///< [out] Design bandwidth for writes
-    ze_bandwidth_unit_t bandwidthUnit;              ///< [out] bandwidth unit
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_memory_ext_type_t type;                                       ///< [out] The memory type
+    uint64_t physicalSize;                                                  ///< [out] Physical memory size in bytes. A value of 0 indicates that this
+                                                                            ///< property is not known. However, a call to ::zesMemoryGetState() will
+                                                                            ///< correctly return the total size of usable memory.
+    uint32_t readBandwidth;                                                 ///< [out] Design bandwidth for reads
+    uint32_t writeBandwidth;                                                ///< [out] Design bandwidth for writes
+    ze_bandwidth_unit_t bandwidthUnit;                                      ///< [out] bandwidth unit
 
 } ze_device_memory_ext_properties_t;
 
@@ -8655,7 +8948,7 @@ typedef struct _ze_device_memory_ext_properties_t
 /// @brief Bfloat16 Conversions Extension Version(s)
 typedef enum _ze_bfloat16_conversions_ext_version_t
 {
-    ZE_BFLOAT16_CONVERSIONS_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
+    ZE_BFLOAT16_CONVERSIONS_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),      ///< version 1.0
     ZE_BFLOAT16_CONVERSIONS_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
     ZE_BFLOAT16_CONVERSIONS_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
@@ -8678,8 +8971,8 @@ typedef enum _ze_bfloat16_conversions_ext_version_t
 /// @brief Device IP Version Extension Version(s)
 typedef enum _ze_device_ip_version_version_t
 {
-    ZE_DEVICE_IP_VERSION_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ), ///< version 1.0
-    ZE_DEVICE_IP_VERSION_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ), ///< latest known version
+    ZE_DEVICE_IP_VERSION_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),             ///< version 1.0
+    ZE_DEVICE_IP_VERSION_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),         ///< latest known version
     ZE_DEVICE_IP_VERSION_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_device_ip_version_version_t;
@@ -8688,16 +8981,16 @@ typedef enum _ze_device_ip_version_version_t
 /// @brief Device IP version queried using ::zeDeviceGetProperties
 /// 
 /// @details
-///     - This structure may be returned from ::zeDeviceGetProperties via
+///     - This structure may be returned from ::zeDeviceGetProperties via the
 ///       `pNext` member of ::ze_device_properties_t
 typedef struct _ze_device_ip_version_ext_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t ipVersion;                             ///< [out] Device IP version. The meaning of the device IP version is
-                                                    ///< implementation-defined, but newer devices should have a higher
-                                                    ///< version than older devices.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t ipVersion;                                                     ///< [out] Device IP version. The meaning of the device IP version is
+                                                                            ///< implementation-defined, but newer devices should have a higher
+                                                                            ///< version than older devices.
 
 } ze_device_ip_version_ext_t;
 
@@ -8733,15 +9026,19 @@ typedef enum _ze_kernel_max_group_size_properties_ext_version_t
 ///       max group size properties.
 typedef struct _ze_kernel_max_group_size_properties_ext_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t maxGroupSize;                          ///< [out] maximum group size that can be used to execute the kernel. This
-                                                    ///< value may be less than or equal to the `maxTotalGroupSize` member of
-                                                    ///< ::ze_device_compute_properties_t.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t maxGroupSize;                                                  ///< [out] maximum group size that can be used to execute the kernel. This
+                                                                            ///< value may be less than or equal to the `maxTotalGroupSize` member of
+                                                                            ///< ::ze_device_compute_properties_t.
 
 } ze_kernel_max_group_size_properties_ext_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief compiler-independent type
+typedef ze_kernel_max_group_size_properties_ext_t ze_kernel_max_group_size_ext_properties_t;
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
@@ -8759,8 +9056,8 @@ typedef struct _ze_kernel_max_group_size_properties_ext_t
 /// @brief Sub-Allocations Properties Extension Version(s)
 typedef enum _ze_sub_allocations_exp_version_t
 {
-    ZE_SUB_ALLOCATIONS_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),   ///< version 1.0
-    ZE_SUB_ALLOCATIONS_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),   ///< latest known version
+    ZE_SUB_ALLOCATIONS_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),           ///< version 1.0
+    ZE_SUB_ALLOCATIONS_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),       ///< latest known version
     ZE_SUB_ALLOCATIONS_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } ze_sub_allocations_exp_version_t;
@@ -8769,8 +9066,8 @@ typedef enum _ze_sub_allocations_exp_version_t
 /// @brief Properties returned for a sub-allocation
 typedef struct _ze_sub_allocation_t
 {
-    void* base;                                     ///< [in,out][optional] base address of the sub-allocation
-    size_t size;                                    ///< [in,out][optional] size of the allocation
+    void* base;                                                             ///< [in,out][optional] base address of the sub-allocation
+    size_t size;                                                            ///< [in,out][optional] size of the allocation
 
 } ze_sub_allocation_t;
 
@@ -8778,21 +9075,21 @@ typedef struct _ze_sub_allocation_t
 /// @brief Sub-Allocations Properties
 /// 
 /// @details
-///     - This structure may be passed to ::zeMemGetAllocProperties, via `pNext`
-///       member of ::ze_memory_allocation_properties_t.
+///     - This structure may be passed to ::zeMemGetAllocProperties, via the
+///       `pNext` member of ::ze_memory_allocation_properties_t.
 typedef struct _ze_memory_sub_allocations_exp_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t* pCount;                               ///< [in,out] pointer to the number of sub-allocations.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of sub-allocations on which the allocation has been divided.
-                                                    ///< if count is greater than the number of sub-allocations, then the
-                                                    ///< driver shall update the value with the correct number of sub-allocations.
-    ze_sub_allocation_t* pSubAllocations;           ///< [in,out][optional][range(0, *pCount)] array of properties for sub-allocations.
-                                                    ///< if count is less than the number of sub-allocations available, then
-                                                    ///< driver shall only retrieve properties for that number of sub-allocations.
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t* pCount;                                                       ///< [in,out] pointer to the number of sub-allocations.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of sub-allocations on which the allocation has been divided.
+                                                                            ///< if count is greater than the number of sub-allocations, then the
+                                                                            ///< driver shall update the value with the correct number of sub-allocations.
+    ze_sub_allocation_t* pSubAllocations;                                   ///< [in,out][optional][range(0, *pCount)] array of properties for sub-allocations.
+                                                                            ///< if count is less than the number of sub-allocations available, then
+                                                                            ///< driver shall only retrieve properties for that number of sub-allocations.
 
 } ze_memory_sub_allocations_exp_properties_t;
 
@@ -8824,8 +9121,8 @@ typedef enum _ze_event_query_kernel_timestamps_ext_version_t
 typedef uint32_t ze_event_query_kernel_timestamps_ext_flags_t;
 typedef enum _ze_event_query_kernel_timestamps_ext_flag_t
 {
-    ZE_EVENT_QUERY_KERNEL_TIMESTAMPS_EXT_FLAG_KERNEL = ZE_BIT(0),   ///< Kernel timestamp results
-    ZE_EVENT_QUERY_KERNEL_TIMESTAMPS_EXT_FLAG_SYNCHRONIZED = ZE_BIT(1), ///< Device event timestamps synchronized to the host time domain
+    ZE_EVENT_QUERY_KERNEL_TIMESTAMPS_EXT_FLAG_KERNEL = ZE_BIT(0),           ///< Kernel timestamp results
+    ZE_EVENT_QUERY_KERNEL_TIMESTAMPS_EXT_FLAG_SYNCHRONIZED = ZE_BIT(1),     ///< Device event timestamps synchronized to the host time domain
     ZE_EVENT_QUERY_KERNEL_TIMESTAMPS_EXT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } ze_event_query_kernel_timestamps_ext_flag_t;
@@ -8834,15 +9131,15 @@ typedef enum _ze_event_query_kernel_timestamps_ext_flag_t
 /// @brief Event query kernel timestamps properties
 /// 
 /// @details
-///     - This structure may be returned from ::zeDeviceGetProperties, via
+///     - This structure may be returned from ::zeDeviceGetProperties, via the
 ///       `pNext` member of ::ze_device_properties_t.
 typedef struct _ze_event_query_kernel_timestamps_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_event_query_kernel_timestamps_ext_flags_t flags; ///< [out] 0 or some combination of
-                                                    ///< ::ze_event_query_kernel_timestamps_ext_flag_t flags
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_event_query_kernel_timestamps_ext_flags_t flags;                     ///< [out] 0 or some combination of
+                                                                            ///< ::ze_event_query_kernel_timestamps_ext_flag_t flags
 
 } ze_event_query_kernel_timestamps_ext_properties_t;
 
@@ -8850,8 +9147,8 @@ typedef struct _ze_event_query_kernel_timestamps_ext_properties_t
 /// @brief Kernel timestamp clock data synchronized to the host time domain
 typedef struct _ze_synchronized_timestamp_data_ext_t
 {
-    uint64_t kernelStart;                           ///< [out] synchronized clock at start of kernel execution
-    uint64_t kernelEnd;                             ///< [out] synchronized clock at end of kernel execution
+    uint64_t kernelStart;                                                   ///< [out] synchronized clock at start of kernel execution
+    uint64_t kernelEnd;                                                     ///< [out] synchronized clock at end of kernel execution
 
 } ze_synchronized_timestamp_data_ext_t;
 
@@ -8859,9 +9156,9 @@ typedef struct _ze_synchronized_timestamp_data_ext_t
 /// @brief Synchronized kernel timestamp result
 typedef struct _ze_synchronized_timestamp_result_ext_t
 {
-    ze_synchronized_timestamp_data_ext_t global;    ///< [out] wall-clock data
-    ze_synchronized_timestamp_data_ext_t context;   ///< [out] context-active data; only includes clocks while device context
-                                                    ///< was actively executing.
+    ze_synchronized_timestamp_data_ext_t global;                            ///< [out] wall-clock data
+    ze_synchronized_timestamp_data_ext_t context;                           ///< [out] context-active data; only includes clocks while device context
+                                                                            ///< was actively executing.
 
 } ze_synchronized_timestamp_result_ext_t;
 
@@ -8869,13 +9166,13 @@ typedef struct _ze_synchronized_timestamp_result_ext_t
 /// @brief Event query kernel timestamps results properties
 typedef struct _ze_event_query_kernel_timestamps_results_ext_properties_t
 {
-    ze_structure_type_t stype;                      ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_kernel_timestamp_result_t* pKernelTimestampsBuffer;  ///< [in,out][optional][range(0, *pCount)] pointer to destination buffer of
-                                                    ///< kernel timestamp results
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_kernel_timestamp_result_t* pKernelTimestampsBuffer;                  ///< [in,out][optional][range(0, *pCount)] pointer to destination buffer of
+                                                                            ///< kernel timestamp results
     ze_synchronized_timestamp_result_ext_t* pSynchronizedTimestampsBuffer;  ///< [in,out][optional][range(0, *pCount)] pointer to destination buffer of
-                                                    ///< synchronized timestamp results
+                                                                            ///< synchronized timestamp results
 
 } ze_event_query_kernel_timestamps_results_ext_properties_t;
 
@@ -8917,24 +9214,958 @@ typedef struct _ze_event_query_kernel_timestamps_results_ext_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zeEventQueryKernelTimestampsExt(
-    ze_event_handle_t hEvent,                       ///< [in] handle of the event
-    ze_device_handle_t hDevice,                     ///< [in] handle of the device to query
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of event packets available.
-                                                    ///<    - This value is implementation specific.
-                                                    ///<    - if `*pCount` is zero, then the driver shall update the value with
-                                                    ///< the total number of event packets available.
-                                                    ///<    - if `*pCount` is greater than the number of event packets
-                                                    ///< available, the driver shall update the value with the correct value.
-                                                    ///<    - Buffer(s) for query results must be sized by the application to
-                                                    ///< accommodate a minimum of `*pCount` elements.
-    ze_event_query_kernel_timestamps_results_ext_properties_t* pResults ///< [in][optional] pointer to event query properties structure(s).
-                                                    ///<    - This parameter may be null when `*pCount` is zero.
-                                                    ///<    - if `*pCount` is less than the number of event packets available,
-                                                    ///< the driver may only update `*pCount` elements, starting at element zero.
-                                                    ///<    - if `*pCount` is greater than the number of event packets
-                                                    ///< available, the driver may only update the valid elements.
+    ze_event_handle_t hEvent,                                               ///< [in] handle of the event
+    ze_device_handle_t hDevice,                                             ///< [in] handle of the device to query
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of event packets available.
+                                                                            ///<    - This value is implementation specific.
+                                                                            ///<    - if `*pCount` is zero, then the driver shall update the value with
+                                                                            ///< the total number of event packets available.
+                                                                            ///<    - if `*pCount` is greater than the number of event packets
+                                                                            ///< available, the driver shall update the value with the correct value.
+                                                                            ///<    - Buffer(s) for query results must be sized by the application to
+                                                                            ///< accommodate a minimum of `*pCount` elements.
+    ze_event_query_kernel_timestamps_results_ext_properties_t* pResults     ///< [in,out][optional][range(0, *pCount)] pointer to event query
+                                                                            ///< properties structure(s).
+                                                                            ///<    - This parameter may be null when `*pCount` is zero.
+                                                                            ///<    - if `*pCount` is less than the number of event packets available,
+                                                                            ///< the driver may only update `*pCount` elements, starting at element zero.
+                                                                            ///<    - if `*pCount` is greater than the number of event packets
+                                                                            ///< available, the driver may only update the valid elements.
     );
 
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Level-Zero Extension for supporting ray tracing acceleration structure builder.
+#if !defined(__GNUC__)
+#pragma region RTASBuilder
+#endif
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZE_RTAS_BUILDER_EXP_NAME
+/// @brief Ray Tracing Acceleration Structure Builder Extension Name
+#define ZE_RTAS_BUILDER_EXP_NAME  "ZE_experimental_rtas_builder"
+#endif // ZE_RTAS_BUILDER_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray Tracing Acceleration Structure Builder Extension Version(s)
+typedef enum _ze_rtas_builder_exp_version_t
+{
+    ZE_RTAS_BUILDER_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),              ///< version 1.0
+    ZE_RTAS_BUILDER_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),          ///< latest known version
+    ZE_RTAS_BUILDER_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_builder_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure device flags
+typedef uint32_t ze_rtas_device_exp_flags_t;
+typedef enum _ze_rtas_device_exp_flag_t
+{
+    ZE_RTAS_DEVICE_EXP_FLAG_RESERVED = ZE_BIT(0),                           ///< reserved for future use
+    ZE_RTAS_DEVICE_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_device_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure format
+/// 
+/// @details
+///     - This is an opaque ray tracing acceleration structure format
+///       identifier.
+typedef enum _ze_rtas_format_exp_t
+{
+    ZE_RTAS_FORMAT_EXP_INVALID = 0,                                         ///< Invalid acceleration structure format
+    ZE_RTAS_FORMAT_EXP_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_format_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder flags
+typedef uint32_t ze_rtas_builder_exp_flags_t;
+typedef enum _ze_rtas_builder_exp_flag_t
+{
+    ZE_RTAS_BUILDER_EXP_FLAG_RESERVED = ZE_BIT(0),                          ///< Reserved for future use
+    ZE_RTAS_BUILDER_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_builder_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder parallel operation flags
+typedef uint32_t ze_rtas_parallel_operation_exp_flags_t;
+typedef enum _ze_rtas_parallel_operation_exp_flag_t
+{
+    ZE_RTAS_PARALLEL_OPERATION_EXP_FLAG_RESERVED = ZE_BIT(0),               ///< Reserved for future use
+    ZE_RTAS_PARALLEL_OPERATION_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_parallel_operation_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder geometry flags
+typedef uint32_t ze_rtas_builder_geometry_exp_flags_t;
+typedef enum _ze_rtas_builder_geometry_exp_flag_t
+{
+    ZE_RTAS_BUILDER_GEOMETRY_EXP_FLAG_NON_OPAQUE = ZE_BIT(0),               ///< non-opaque geometries invoke an any-hit shader
+    ZE_RTAS_BUILDER_GEOMETRY_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_builder_geometry_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Packed ray tracing acceleration structure builder geometry flags (see
+///        ::ze_rtas_builder_geometry_exp_flags_t)
+typedef uint8_t ze_rtas_builder_packed_geometry_exp_flags_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder instance flags
+typedef uint32_t ze_rtas_builder_instance_exp_flags_t;
+typedef enum _ze_rtas_builder_instance_exp_flag_t
+{
+    ZE_RTAS_BUILDER_INSTANCE_EXP_FLAG_TRIANGLE_CULL_DISABLE = ZE_BIT(0),    ///< disables culling of front-facing and back-facing triangles
+    ZE_RTAS_BUILDER_INSTANCE_EXP_FLAG_TRIANGLE_FRONT_COUNTERCLOCKWISE = ZE_BIT(1),  ///< reverses front and back face of triangles
+    ZE_RTAS_BUILDER_INSTANCE_EXP_FLAG_TRIANGLE_FORCE_OPAQUE = ZE_BIT(2),    ///< forces instanced geometry to be opaque, unless ray flag forces it to
+                                                                            ///< be non-opaque
+    ZE_RTAS_BUILDER_INSTANCE_EXP_FLAG_TRIANGLE_FORCE_NON_OPAQUE = ZE_BIT(3),///< forces instanced geometry to be non-opaque, unless ray flag forces it
+                                                                            ///< to be opaque
+    ZE_RTAS_BUILDER_INSTANCE_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_builder_instance_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Packed ray tracing acceleration structure builder instance flags (see
+///        ::ze_rtas_builder_instance_exp_flags_t)
+typedef uint8_t ze_rtas_builder_packed_instance_exp_flags_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder build operation flags
+/// 
+/// @details
+///     - These flags allow the application to tune the acceleration structure
+///       build operation.
+///     - The acceleration structure builder implementation might choose to use
+///       spatial splitting to split large or long primitives into smaller
+///       pieces. This may result in any-hit shaders being invoked multiple
+///       times for non-opaque primitives, unless
+///       ::ZE_RTAS_BUILDER_BUILD_OP_EXP_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION is specified.
+///     - Usage of any of these flags may reduce ray tracing performance.
+typedef uint32_t ze_rtas_builder_build_op_exp_flags_t;
+typedef enum _ze_rtas_builder_build_op_exp_flag_t
+{
+    ZE_RTAS_BUILDER_BUILD_OP_EXP_FLAG_COMPACT = ZE_BIT(0),                  ///< build more compact acceleration structure
+    ZE_RTAS_BUILDER_BUILD_OP_EXP_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION = ZE_BIT(1),   ///< guarantees single any-hit shader invocation per primitive
+    ZE_RTAS_BUILDER_BUILD_OP_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_builder_build_op_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder build quality hint
+/// 
+/// @details
+///     - Depending on use case different quality modes for acceleration
+///       structure build are supported.
+///     - A low-quality build builds an acceleration structure fast, but at the
+///       cost of some reduction in ray tracing performance. This mode is
+///       recommended for dynamic content, such as animated characters.
+///     - A medium-quality build uses a compromise between build quality and ray
+///       tracing performance. This mode should be used by default.
+///     - Higher ray tracing performance can be achieved by using a high-quality
+///       build, but acceleration structure build performance might be
+///       significantly reduced.
+typedef enum _ze_rtas_builder_build_quality_hint_exp_t
+{
+    ZE_RTAS_BUILDER_BUILD_QUALITY_HINT_EXP_LOW = 0,                         ///< build low-quality acceleration structure (fast)
+    ZE_RTAS_BUILDER_BUILD_QUALITY_HINT_EXP_MEDIUM = 1,                      ///< build medium-quality acceleration structure (slower)
+    ZE_RTAS_BUILDER_BUILD_QUALITY_HINT_EXP_HIGH = 2,                        ///< build high-quality acceleration structure (slow)
+    ZE_RTAS_BUILDER_BUILD_QUALITY_HINT_EXP_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_builder_build_quality_hint_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder geometry type
+typedef enum _ze_rtas_builder_geometry_type_exp_t
+{
+    ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_TRIANGLES = 0,                        ///< triangle mesh geometry type
+    ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_QUADS = 1,                            ///< quad mesh geometry type
+    ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_PROCEDURAL = 2,                       ///< procedural geometry type
+    ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_INSTANCE = 3,                         ///< instance geometry type
+    ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_builder_geometry_type_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Packed ray tracing acceleration structure builder geometry type (see
+///        ::ze_rtas_builder_geometry_type_exp_t)
+typedef uint8_t ze_rtas_builder_packed_geometry_type_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure data buffer element format
+/// 
+/// @details
+///     - Specifies the format of data buffer elements.
+///     - Data buffers may contain instancing transform matrices, triangle/quad
+///       vertex indices, etc...
+typedef enum _ze_rtas_builder_input_data_format_exp_t
+{
+    ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3 = 0,                       ///< 3-component float vector (see ::ze_rtas_float3_exp_t)
+    ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_COLUMN_MAJOR = 1,        ///< 3x4 affine transformation in column-major format (see
+                                                                            ///< ::ze_rtas_transform_float3x4_column_major_exp_t)
+    ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ALIGNED_COLUMN_MAJOR = 2,///< 3x4 affine transformation in column-major format (see
+                                                                            ///< ::ze_rtas_transform_float3x4_aligned_column_major_exp_t)
+    ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ROW_MAJOR = 3,           ///< 3x4 affine transformation in row-major format (see
+                                                                            ///< ::ze_rtas_transform_float3x4_row_major_exp_t)
+    ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_AABB = 4,                         ///< 3-dimensional axis-aligned bounding-box (see ::ze_rtas_aabb_exp_t)
+    ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_TRIANGLE_INDICES_UINT32 = 5,      ///< Unsigned 32-bit triangle indices (see
+                                                                            ///< ::ze_rtas_triangle_indices_uint32_exp_t)
+    ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_QUAD_INDICES_UINT32 = 6,          ///< Unsigned 32-bit quad indices (see ::ze_rtas_quad_indices_uint32_exp_t)
+    ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FORCE_UINT32 = 0x7fffffff
+
+} ze_rtas_builder_input_data_format_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Packed ray tracing acceleration structure data buffer element format
+///        (see ::ze_rtas_builder_input_data_format_exp_t)
+typedef uint8_t ze_rtas_builder_packed_input_data_format_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Handle of ray tracing acceleration structure builder object
+typedef struct _ze_rtas_builder_exp_handle_t *ze_rtas_builder_exp_handle_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Handle of ray tracing acceleration structure builder parallel
+///        operation object
+typedef struct _ze_rtas_parallel_operation_exp_handle_t *ze_rtas_parallel_operation_exp_handle_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder descriptor
+typedef struct _ze_rtas_builder_exp_desc_t
+{
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_rtas_builder_exp_version_t builderVersion;                           ///< [in] ray tracing acceleration structure builder version
+
+} ze_rtas_builder_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder properties
+typedef struct _ze_rtas_builder_exp_properties_t
+{
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_rtas_builder_exp_flags_t flags;                                      ///< [out] ray tracing acceleration structure builder flags
+    size_t rtasBufferSizeBytesExpected;                                     ///< [out] expected size (in bytes) required for acceleration structure buffer
+                                                                            ///<    - When using an acceleration structure buffer of this size, the
+                                                                            ///< build is expected to succeed; however, it is possible that the build
+                                                                            ///< may fail with ::ZE_RESULT_EXP_RTAS_BUILD_RETRY
+    size_t rtasBufferSizeBytesMaxRequired;                                  ///< [out] worst-case size (in bytes) required for acceleration structure buffer
+                                                                            ///<    - When using an acceleration structure buffer of this size, the
+                                                                            ///< build is guaranteed to not run out of memory.
+    size_t scratchBufferSizeBytes;                                          ///< [out] scratch buffer size (in bytes) required for acceleration
+                                                                            ///< structure build.
+
+} ze_rtas_builder_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder parallel operation
+///        properties
+typedef struct _ze_rtas_parallel_operation_exp_properties_t
+{
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_rtas_parallel_operation_exp_flags_t flags;                           ///< [out] ray tracing acceleration structure builder parallel operation
+                                                                            ///< flags
+    uint32_t maxConcurrency;                                                ///< [out] maximum number of threads that may join the parallel operation
+
+} ze_rtas_parallel_operation_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure device properties
+/// 
+/// @details
+///     - This structure may be passed to ::zeDeviceGetProperties, via `pNext`
+///       member of ::ze_device_properties_t.
+///     - The implementation shall populate `format` with a value other than
+///       ::ZE_RTAS_FORMAT_EXP_INVALID when the device supports ray tracing.
+typedef struct _ze_rtas_device_exp_properties_t
+{
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_rtas_device_exp_flags_t flags;                                       ///< [out] ray tracing acceleration structure device flags
+    ze_rtas_format_exp_t rtasFormat;                                        ///< [out] ray tracing acceleration structure format
+    uint32_t rtasBufferAlignment;                                           ///< [out] required alignment of acceleration structure buffer
+
+} ze_rtas_device_exp_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief A 3-component vector type
+typedef struct _ze_rtas_float3_exp_t
+{
+    float x;                                                                ///< [in] x-coordinate of float3 vector
+    float y;                                                                ///< [in] y-coordinate of float3 vector
+    float z;                                                                ///< [in] z-coordinate of float3 vector
+
+} ze_rtas_float3_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief 3x4 affine transformation in column-major layout
+/// 
+/// @details
+///     - A 3x4 affine transformation in column major layout, consisting of vectors
+///          - vx=(vx_x, vx_y, vx_z),
+///          - vy=(vy_x, vy_y, vy_z),
+///          - vz=(vz_x, vz_y, vz_z), and
+///          - p=(p_x, p_y, p_z)
+///     - The transformation transforms a point (x, y, z) to: `x*vx + y*vy +
+///       z*vz + p`.
+typedef struct _ze_rtas_transform_float3x4_column_major_exp_t
+{
+    float vx_x;                                                             ///< [in] element 0 of column 0 of 3x4 matrix
+    float vx_y;                                                             ///< [in] element 1 of column 0 of 3x4 matrix
+    float vx_z;                                                             ///< [in] element 2 of column 0 of 3x4 matrix
+    float vy_x;                                                             ///< [in] element 0 of column 1 of 3x4 matrix
+    float vy_y;                                                             ///< [in] element 1 of column 1 of 3x4 matrix
+    float vy_z;                                                             ///< [in] element 2 of column 1 of 3x4 matrix
+    float vz_x;                                                             ///< [in] element 0 of column 2 of 3x4 matrix
+    float vz_y;                                                             ///< [in] element 1 of column 2 of 3x4 matrix
+    float vz_z;                                                             ///< [in] element 2 of column 2 of 3x4 matrix
+    float p_x;                                                              ///< [in] element 0 of column 3 of 3x4 matrix
+    float p_y;                                                              ///< [in] element 1 of column 3 of 3x4 matrix
+    float p_z;                                                              ///< [in] element 2 of column 3 of 3x4 matrix
+
+} ze_rtas_transform_float3x4_column_major_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief 3x4 affine transformation in column-major layout with aligned column
+///        vectors
+/// 
+/// @details
+///     - A 3x4 affine transformation in column major layout, consisting of vectors
+///        - vx=(vx_x, vx_y, vx_z),
+///        - vy=(vy_x, vy_y, vy_z),
+///        - vz=(vz_x, vz_y, vz_z), and
+///        - p=(p_x, p_y, p_z)
+///     - The transformation transforms a point (x, y, z) to: `x*vx + y*vy +
+///       z*vz + p`.
+///     - The column vectors are aligned to 16-bytes and pad members are
+///       ignored.
+typedef struct _ze_rtas_transform_float3x4_aligned_column_major_exp_t
+{
+    float vx_x;                                                             ///< [in] element 0 of column 0 of 3x4 matrix
+    float vx_y;                                                             ///< [in] element 1 of column 0 of 3x4 matrix
+    float vx_z;                                                             ///< [in] element 2 of column 0 of 3x4 matrix
+    float pad0;                                                             ///< [in] ignored padding (keeps each column 16-byte aligned)
+    float vy_x;                                                             ///< [in] element 0 of column 1 of 3x4 matrix
+    float vy_y;                                                             ///< [in] element 1 of column 1 of 3x4 matrix
+    float vy_z;                                                             ///< [in] element 2 of column 1 of 3x4 matrix
+    float pad1;                                                             ///< [in] ignored padding (keeps each column 16-byte aligned)
+    float vz_x;                                                             ///< [in] element 0 of column 2 of 3x4 matrix
+    float vz_y;                                                             ///< [in] element 1 of column 2 of 3x4 matrix
+    float vz_z;                                                             ///< [in] element 2 of column 2 of 3x4 matrix
+    float pad2;                                                             ///< [in] ignored padding (keeps each column 16-byte aligned)
+    float p_x;                                                              ///< [in] element 0 of column 3 of 3x4 matrix
+    float p_y;                                                              ///< [in] element 1 of column 3 of 3x4 matrix
+    float p_z;                                                              ///< [in] element 2 of column 3 of 3x4 matrix
+    float pad3;                                                             ///< [in] ignored padding (keeps each column 16-byte aligned)
+
+} ze_rtas_transform_float3x4_aligned_column_major_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief 3x4 affine transformation in row-major layout
+///
+/// @details
+///     - A 3x4 affine transformation in row-major layout, consisting of vectors
+///          - vx=(vx_x, vx_y, vx_z),
+///          - vy=(vy_x, vy_y, vy_z),
+///          - vz=(vz_x, vz_y, vz_z), and
+///          - p=(p_x, p_y, p_z)
+///     - The transformation transforms a point (x, y, z) to: `x*vx + y*vy +
+///       z*vz + p`.
+typedef struct _ze_rtas_transform_float3x4_row_major_exp_t
+{
+    float vx_x;                                                             ///< [in] element 0 of row 0 of 3x4 matrix
+    float vy_x;                                                             ///< [in] element 1 of row 0 of 3x4 matrix
+    float vz_x;                                                             ///< [in] element 2 of row 0 of 3x4 matrix
+    float p_x;                                                              ///< [in] element 3 of row 0 of 3x4 matrix
+    float vx_y;                                                             ///< [in] element 0 of row 1 of 3x4 matrix
+    float vy_y;                                                             ///< [in] element 1 of row 1 of 3x4 matrix
+    float vz_y;                                                             ///< [in] element 2 of row 1 of 3x4 matrix
+    float p_y;                                                              ///< [in] element 3 of row 1 of 3x4 matrix
+    float vx_z;                                                             ///< [in] element 0 of row 2 of 3x4 matrix
+    float vy_z;                                                             ///< [in] element 1 of row 2 of 3x4 matrix
+    float vz_z;                                                             ///< [in] element 2 of row 2 of 3x4 matrix
+    float p_z;                                                              ///< [in] element 3 of row 2 of 3x4 matrix
+
+} ze_rtas_transform_float3x4_row_major_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief A 3-dimensional axis-aligned bounding-box with lower and upper bounds
+///        in each dimension
+typedef struct _ze_rtas_aabb_exp_t
+{
+    ze_rtas_float3_exp_t lower;                                             ///< [in] lower bounds of AABB
+    ze_rtas_float3_exp_t upper;                                             ///< [in] upper bounds of AABB
+
+} ze_rtas_aabb_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Triangle represented using 3 vertex indices
+///
+/// @details
+///     - Represents a triangle using 3 vertex indices that index into a vertex
+///       array that needs to be provided together with the index array.
+///     - The linear barycentric u/v parametrization of the triangle is defined as:
+///          - (u=0, v=0) at v0,
+///          - (u=1, v=0) at v1, and
+///          - (u=0, v=1) at v2
+typedef struct _ze_rtas_triangle_indices_uint32_exp_t
+{
+    uint32_t v0;                                                            ///< [in] first index pointing to the first triangle vertex in vertex array
+    uint32_t v1;                                                            ///< [in] second index pointing to the second triangle vertex in vertex
+                                                                            ///< array
+    uint32_t v2;                                                            ///< [in] third index pointing to the third triangle vertex in vertex array
+
+} ze_rtas_triangle_indices_uint32_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Quad represented using 4 vertex indices
+///
+/// @details
+///     - Represents a quad composed of 4 indices that index into a vertex array
+///       that needs to be provided together with the index array.
+///     - A quad is a triangle pair represented using 4 vertex indices v0, v1,
+///       v2, v3.
+///       The first triangle is made out of indices v0, v1, v3 and the second triangle
+///       from indices v2, v3, v1. The piecewise linear barycentric u/v parametrization
+///       of the quad is defined as:
+///          - (u=0, v=0) at v0,
+///          - (u=1, v=0) at v1,
+///          - (u=0, v=1) at v3, and
+///          - (u=1, v=1) at v2
+///       This is achieved by correcting the u'/v' coordinates of the second
+///       triangle by
+///       *u = 1-u'* and *v = 1-v'*, yielding a piecewise linear parametrization.
+typedef struct _ze_rtas_quad_indices_uint32_exp_t
+{
+    uint32_t v0;                                                            ///< [in] first index pointing to the first quad vertex in vertex array
+    uint32_t v1;                                                            ///< [in] second index pointing to the second quad vertex in vertex array
+    uint32_t v2;                                                            ///< [in] third index pointing to the third quad vertex in vertex array
+    uint32_t v3;                                                            ///< [in] fourth index pointing to the fourth quad vertex in vertex array
+
+} ze_rtas_quad_indices_uint32_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder geometry info
+typedef struct _ze_rtas_builder_geometry_info_exp_t
+{
+    ze_rtas_builder_packed_geometry_type_exp_t geometryType;                ///< [in] geometry type; common first member shared by all geometry info
+                                                                            ///< structures, identifying which full geometry info struct this is
+
+} ze_rtas_builder_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder triangle mesh geometry info
+///
+/// @details
+///     - The linear barycentric u/v parametrization of the triangle is defined as:
+///          - (u=0, v=0) at v0,
+///          - (u=1, v=0) at v1, and
+///          - (u=0, v=1) at v2
+typedef struct _ze_rtas_builder_triangles_geometry_info_exp_t
+{
+    ze_rtas_builder_packed_geometry_type_exp_t geometryType;                ///< [in] geometry type, must be
+                                                                            ///< ::ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_TRIANGLES
+    ze_rtas_builder_packed_geometry_exp_flags_t geometryFlags;              ///< [in] 0 or some combination of ::ze_rtas_builder_geometry_exp_flag_t
+                                                                            ///< bits representing the geometry flags for all primitives of this
+                                                                            ///< geometry
+    uint8_t geometryMask;                                                   ///< [in] 8-bit geometry mask for ray masking
+    ze_rtas_builder_packed_input_data_format_exp_t triangleFormat;          ///< [in] format of triangle buffer data, must be
+                                                                            ///< ::ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_TRIANGLE_INDICES_UINT32
+    ze_rtas_builder_packed_input_data_format_exp_t vertexFormat;            ///< [in] format of vertex buffer data, must be
+                                                                            ///< ::ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3
+    uint32_t triangleCount;                                                 ///< [in] number of triangles in triangle buffer
+    uint32_t vertexCount;                                                   ///< [in] number of vertices in vertex buffer
+    uint32_t triangleStride;                                                ///< [in] stride (in bytes) of triangles in triangle buffer
+    uint32_t vertexStride;                                                  ///< [in] stride (in bytes) of vertices in vertex buffer
+    void* pTriangleBuffer;                                                  ///< [in] pointer to array of triangle indices in specified format
+    void* pVertexBuffer;                                                    ///< [in] pointer to array of triangle vertices in specified format
+
+} ze_rtas_builder_triangles_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder quad mesh geometry info
+///
+/// @details
+///     - A quad is a triangle pair represented using 4 vertex indices v0, v1,
+///       v2, v3.
+///       The first triangle is made out of indices v0, v1, v3 and the second triangle
+///       from indices v2, v3, v1. The piecewise linear barycentric u/v parametrization
+///       of the quad is defined as:
+///          - (u=0, v=0) at v0,
+///          - (u=1, v=0) at v1,
+///          - (u=0, v=1) at v3, and
+///          - (u=1, v=1) at v2
+///       This is achieved by correcting the u'/v' coordinates of the second
+///       triangle by
+///       *u = 1-u'* and *v = 1-v'*, yielding a piecewise linear parametrization.
+typedef struct _ze_rtas_builder_quads_geometry_info_exp_t
+{
+    ze_rtas_builder_packed_geometry_type_exp_t geometryType;                ///< [in] geometry type, must be ::ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_QUADS
+    ze_rtas_builder_packed_geometry_exp_flags_t geometryFlags;              ///< [in] 0 or some combination of ::ze_rtas_builder_geometry_exp_flag_t
+                                                                            ///< bits representing the geometry flags for all primitives of this
+                                                                            ///< geometry
+    uint8_t geometryMask;                                                   ///< [in] 8-bit geometry mask for ray masking
+    ze_rtas_builder_packed_input_data_format_exp_t quadFormat;              ///< [in] format of quad buffer data, must be
+                                                                            ///< ::ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_QUAD_INDICES_UINT32
+    ze_rtas_builder_packed_input_data_format_exp_t vertexFormat;            ///< [in] format of vertex buffer data, must be
+                                                                            ///< ::ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3
+    uint32_t quadCount;                                                     ///< [in] number of quads in quad buffer
+    uint32_t vertexCount;                                                   ///< [in] number of vertices in vertex buffer
+    uint32_t quadStride;                                                    ///< [in] stride (in bytes) of quads in quad buffer
+    uint32_t vertexStride;                                                  ///< [in] stride (in bytes) of vertices in vertex buffer
+    void* pQuadBuffer;                                                      ///< [in] pointer to array of quad indices in specified format
+    void* pVertexBuffer;                                                    ///< [in] pointer to array of quad vertices in specified format
+
+} ze_rtas_builder_quads_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief AABB callback function parameters
+typedef struct _ze_rtas_geometry_aabbs_exp_cb_params_t
+{
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t primID;                                                        ///< [in] index of the first primitive to return bounds for
+    uint32_t primIDCount;                                                   ///< [in] number of primitives to return bounds for
+    void* pGeomUserPtr;                                                     ///< [in] pointer provided through geometry descriptor
+    void* pBuildUserPtr;                                                    ///< [in] pointer provided through ::zeRTASBuilderBuildExp function
+    ze_rtas_aabb_exp_t* pBoundsOut;                                         ///< [out] destination buffer to write AABB bounds to
+
+} ze_rtas_geometry_aabbs_exp_cb_params_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Callback function pointer type to return AABBs for a range of
+///        procedural primitives
+typedef void (*ze_rtas_geometry_aabbs_cb_exp_t)(
+        ze_rtas_geometry_aabbs_exp_cb_params_t* params                          ///< [in] callback function parameters structure
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder procedural primitives
+///        geometry info
+///
+/// @details
+///     - A host-side bounds callback function is invoked by the acceleration
+///       structure builder to query the bounds of procedural primitives on
+///       demand. The callback is passed some `pGeomUserPtr` that can point to
+///       an application-side representation of the procedural primitives.
+///       Further, a second `pBuildUserPtr`, which is set by a parameter to
+///       ::zeRTASBuilderBuildExp, is passed to the callback. This allows the
+///       build to change the bounds of the procedural geometry, for example, to
+///       build a BVH only over a short time range to implement multi-segment
+///       motion blur.
+typedef struct _ze_rtas_builder_procedural_geometry_info_exp_t
+{
+    ze_rtas_builder_packed_geometry_type_exp_t geometryType;                ///< [in] geometry type, must be
+                                                                            ///< ::ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_PROCEDURAL
+    ze_rtas_builder_packed_geometry_exp_flags_t geometryFlags;              ///< [in] 0 or some combination of ::ze_rtas_builder_geometry_exp_flag_t
+                                                                            ///< bits representing the geometry flags for all primitives of this
+                                                                            ///< geometry
+    uint8_t geometryMask;                                                   ///< [in] 8-bit geometry mask for ray masking
+    uint8_t reserved;                                                       ///< [in] reserved for future use
+    uint32_t primCount;                                                     ///< [in] number of primitives in geometry
+    ze_rtas_geometry_aabbs_cb_exp_t pfnGetBoundsCb;                         ///< [in] pointer to callback function to get the axis-aligned bounding-box
+                                                                            ///< for a range of primitives
+    void* pGeomUserPtr;                                                     ///< [in] user data pointer passed to callback
+
+} ze_rtas_builder_procedural_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder instance geometry info
+typedef struct _ze_rtas_builder_instance_geometry_info_exp_t
+{
+    ze_rtas_builder_packed_geometry_type_exp_t geometryType;                ///< [in] geometry type, must be
+                                                                            ///< ::ZE_RTAS_BUILDER_GEOMETRY_TYPE_EXP_INSTANCE
+    ze_rtas_builder_packed_instance_exp_flags_t instanceFlags;              ///< [in] 0 or some combination of ::ze_rtas_builder_instance_exp_flag_t
+                                                                            ///< bits representing the instance flags of this instanced acceleration
+                                                                            ///< structure
+    uint8_t geometryMask;                                                   ///< [in] 8-bit geometry mask for ray masking
+    ze_rtas_builder_packed_input_data_format_exp_t transformFormat;         ///< [in] format of the specified transformation
+    uint32_t instanceUserID;                                                ///< [in] user-specified identifier for the instance
+    void* pTransform;                                                       ///< [in] object-to-world instance transformation in specified format
+    ze_rtas_aabb_exp_t* pBounds;                                            ///< [in] object-space axis-aligned bounding-box of the instanced
+                                                                            ///< acceleration structure
+    void* pAccelerationStructure;                                           ///< [in] pointer to acceleration structure to instantiate
+
+} ze_rtas_builder_instance_geometry_info_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ray tracing acceleration structure builder build operation descriptor
+typedef struct _ze_rtas_builder_build_op_exp_desc_t
+{
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_rtas_format_exp_t rtasFormat;                                        ///< [in] ray tracing acceleration structure format
+    ze_rtas_builder_build_quality_hint_exp_t buildQuality;                  ///< [in] acceleration structure build quality hint
+    ze_rtas_builder_build_op_exp_flags_t buildFlags;                        ///< [in] 0 or some combination of ::ze_rtas_builder_build_op_exp_flag_t
+                                                                            ///< flags
+    const ze_rtas_builder_geometry_info_exp_t** ppGeometries;               ///< [in][optional][range(0, `numGeometries`)] NULL or a valid array of
+                                                                            ///< pointers to geometry infos
+    uint32_t numGeometries;                                                 ///< [in] number of geometries in geometry infos array, can be zero when
+                                                                            ///< `ppGeometries` is NULL
+
+} ze_rtas_builder_build_op_exp_desc_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Creates a ray tracing acceleration structure builder object
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function must be thread-safe.
+///     - The implementation must support ::ZE_experimental_rtas_builder
+///       extension.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDriver`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pDescriptor`
+///         + `nullptr == phBuilder`
+///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
+///         + `::ZE_RTAS_BUILDER_EXP_VERSION_CURRENT < pDescriptor->builderVersion`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeRTASBuilderCreateExp(
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of driver object
+    const ze_rtas_builder_exp_desc_t* pDescriptor,                          ///< [in] pointer to builder descriptor
+    ze_rtas_builder_exp_handle_t* phBuilder                                 ///< [out] handle of builder object
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Retrieves ray tracing acceleration structure builder properties
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function must be thread-safe.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hBuilder`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pBuildOpDescriptor`
+///         + `nullptr == pProperties`
+///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
+///         + `::ZE_RTAS_FORMAT_EXP_INVALID < pBuildOpDescriptor->rtasFormat`
+///         + `::ZE_RTAS_BUILDER_BUILD_QUALITY_HINT_EXP_HIGH < pBuildOpDescriptor->buildQuality`
+///         + `0x3 < pBuildOpDescriptor->buildFlags`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeRTASBuilderGetBuildPropertiesExp(
+    ze_rtas_builder_exp_handle_t hBuilder,                                  ///< [in] handle of builder object
+    const ze_rtas_builder_build_op_exp_desc_t* pBuildOpDescriptor,          ///< [in] pointer to build operation descriptor
+    ze_rtas_builder_exp_properties_t* pProperties                           ///< [in,out] query result for builder properties
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Checks ray tracing acceleration structure format compatibility
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function must be thread-safe.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDriver`
+///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
+///         + `::ZE_RTAS_FORMAT_EXP_INVALID < rtasFormatA`
+///         + `::ZE_RTAS_FORMAT_EXP_INVALID < rtasFormatB`
+///     - ::ZE_RESULT_SUCCESS
+///         + An acceleration structure built with `rtasFormatA` is compatible with devices that report `rtasFormatB`.
+///     - ::ZE_RESULT_EXP_ERROR_OPERANDS_INCOMPATIBLE
+///         + An acceleration structure built with `rtasFormatA` is **not** compatible with devices that report `rtasFormatB`.
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeDriverRTASFormatCompatibilityCheckExp(
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of driver object
+    ze_rtas_format_exp_t rtasFormatA,                                       ///< [in] operand A
+    ze_rtas_format_exp_t rtasFormatB                                        ///< [in] operand B
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Build ray tracing acceleration structure
+///
+/// @details
+///     - This function builds an acceleration structure of the scene consisting
+///       of the specified geometry information and writes the acceleration
+///       structure to the provided destination buffer. All types of geometries
+///       can get freely mixed inside a scene.
+///     - It is the user's responsibility to manage the acceleration structure
+///       buffer allocation, de-allocation, and potential prefetching to the
+///       device memory. The required size of the acceleration structure buffer
+///       can be queried with the ::zeRTASBuilderGetBuildPropertiesExp function.
+///       The acceleration structure buffer must be a shared USM allocation and
+///       should be present on the host at build time. The referenced scene data
+///       (index- and vertex- buffers) can be standard host allocations, and
+///       will not be referenced into by the build acceleration structure.
+///     - Before an acceleration structure can be built, the user must allocate
+///       the memory for the acceleration structure buffer and scratch buffer
+///       using sizes based on a query for the estimated size properties.
+///     - When using the "worst-case" size for the acceleration structure
+///       buffer, the acceleration structure construction will never fail with ::ZE_RESULT_EXP_RTAS_BUILD_RETRY.
+///     - When using the "expected" size for the acceleration structure buffer,
+///       the acceleration structure construction may fail with
+///       ::ZE_RESULT_EXP_RTAS_BUILD_RETRY. If this happens, the user may resize
+///       their acceleration structure buffer using the returned
+///       `*pRtasBufferSizeBytes` value, which will be updated with an improved
+///       size estimate that will likely result in a successful build.
+///     - The acceleration structure construction is run on the host and is
+///       synchronous, thus after the function returns with a successful result,
+///       the acceleration structure may be used.
+///     - All provided data buffers must be host-accessible.
+///     - The acceleration structure buffer must be a USM allocation.
+///     - A successfully constructed acceleration structure is entirely
+///       self-contained. There is no requirement for input data to persist
+///       beyond build completion.
+///     - A successfully constructed acceleration structure is non-copyable.
+///     - Acceleration structure construction may be parallelized by passing a
+///       valid handle to a parallel operation object and joining that parallel
+///       operation using ::zeRTASParallelOperationJoinExp with user-provided
+///       worker threads.
+///     - **Additional Notes**
+///        - The geometry infos array, geometry infos, and scratch buffer must
+///       all be standard host memory allocations.
+///        - A pointer to a geometry info can be a null pointer, in which case
+///       the geometry is treated as empty.
+///        - If no parallel operation handle is provided, the build is run
+///       sequentially on the current thread.
+///        - A parallel operation object may only be associated with a single
+///       acceleration structure build at a time.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hBuilder`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pBuildOpDescriptor`
+///         + `nullptr == pScratchBuffer`
+///         + `nullptr == pRtasBuffer`
+///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
+///         + `::ZE_RTAS_FORMAT_EXP_INVALID < pBuildOpDescriptor->rtasFormat`
+///         + `::ZE_RTAS_BUILDER_BUILD_QUALITY_HINT_EXP_HIGH < pBuildOpDescriptor->buildQuality`
+///         + `0x3 < pBuildOpDescriptor->buildFlags`
+///     - ::ZE_RESULT_EXP_RTAS_BUILD_DEFERRED
+///         + Acceleration structure build completion is deferred to parallel operation join.
+///     - ::ZE_RESULT_EXP_RTAS_BUILD_RETRY
+///         + Acceleration structure build failed due to insufficient resources, retry the build operation with a larger acceleration structure buffer allocation.
+///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
+///         + Acceleration structure build failed due to parallel operation object participation in another build operation.
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeRTASBuilderBuildExp(
+    ze_rtas_builder_exp_handle_t hBuilder,                                  ///< [in] handle of builder object
+    const ze_rtas_builder_build_op_exp_desc_t* pBuildOpDescriptor,          ///< [in] pointer to build operation descriptor
+    void* pScratchBuffer,                                                   ///< [in][range(0, `scratchBufferSizeBytes`)] scratch buffer to be used
+                                                                            ///< during acceleration structure construction
+    size_t scratchBufferSizeBytes,                                          ///< [in] size of scratch buffer, in bytes
+    void* pRtasBuffer,                                                      ///< [in] pointer to destination buffer
+    size_t rtasBufferSizeBytes,                                             ///< [in] destination buffer size, in bytes
+    ze_rtas_parallel_operation_exp_handle_t hParallelOperation,             ///< [in][optional] handle to parallel operation object
+    void* pBuildUserPtr,                                                    ///< [in][optional] pointer passed to callbacks
+    ze_rtas_aabb_exp_t* pBounds,                                            ///< [in,out][optional] pointer to destination address for acceleration
+                                                                            ///< structure bounds
+    size_t* pRtasBufferSizeBytes                                            ///< [out][optional] updated acceleration structure size requirement, in
+                                                                            ///< bytes
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Destroys a ray tracing acceleration structure builder object
+///
+/// @details
+///     - The implementation of this function may immediately release any
+///       internal Host and Device resources associated with this builder.
+///     - The application must **not** call this function from simultaneous
+///       threads with the same builder handle.
+///     - The implementation of this function must be thread-safe.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hBuilder`
+///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeRTASBuilderDestroyExp(
+    ze_rtas_builder_exp_handle_t hBuilder                                   ///< [in][release] handle of builder object to destroy
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Creates a ray tracing acceleration structure builder parallel
+///        operation object
+///
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function must be thread-safe.
+///     - The implementation must support ::ZE_experimental_rtas_builder
+///       extension.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDriver`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == phParallelOperation`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeRTASParallelOperationCreateExp(
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of driver object
+    ze_rtas_parallel_operation_exp_handle_t* phParallelOperation            ///< [out] handle of parallel operation object
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Retrieves ray tracing acceleration structure builder parallel
+///        operation properties
+///
+/// @details
+///     - The application must first bind the parallel operation object to a
+///       build operation before it may query the parallel operation properties.
+///       In other words, the application must first call
+///       ::zeRTASBuilderBuildExp with **hParallelOperation** before calling
+///       this function.
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function must be thread-safe.
+///
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hParallelOperation`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pProperties`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeRTASParallelOperationGetPropertiesExp(
+    ze_rtas_parallel_operation_exp_handle_t hParallelOperation,             ///< [in] handle of parallel operation object
+    ze_rtas_parallel_operation_exp_properties_t* pProperties                ///< [in,out] query result for parallel operation properties
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Joins a parallel build operation
+/// 
+/// @details
+///     - All worker threads return the same error code for the parallel build
+///       operation upon build completion
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hParallelOperation`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeRTASParallelOperationJoinExp(
+    ze_rtas_parallel_operation_exp_handle_t hParallelOperation              ///< [in] handle of parallel operation object
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Destroys a ray tracing acceleration structure builder parallel
+///        operation object
+/// 
+/// @details
+///     - The implementation of this function may immediately release any
+///       internal Host and Device resources associated with this parallel
+///       operation.
+///     - The application must **not** call this function from simultaneous
+///       threads with the same parallel operation handle.
+///     - The implementation of this function must be thread-safe.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hParallelOperation`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zeRTASParallelOperationDestroyExp(
+    ze_rtas_parallel_operation_exp_handle_t hParallelOperation              ///< [in][release] handle of parallel operation object to destroy
+    );
+
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Level-Zero Extension APIs for Counter-based Event Pools
+#if !defined(__GNUC__)
+#pragma region counterbasedeventpool
+#endif
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZE_EVENT_POOL_COUNTER_BASED_EXP_NAME
+/// @brief Counter-based Event Pools Extension Name
+#define ZE_EVENT_POOL_COUNTER_BASED_EXP_NAME  "ZE_experimental_event_pool_counter_based"
+#endif // ZE_EVENT_POOL_COUNTER_BASED_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Counter-based Event Pools Extension Version(s)
+typedef enum _ze_event_pool_counter_based_exp_version_t
+{
+    ZE_EVENT_POOL_COUNTER_BASED_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
+    ZE_EVENT_POOL_COUNTER_BASED_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
+    ZE_EVENT_POOL_COUNTER_BASED_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} ze_event_pool_counter_based_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported event flags for defining counter-based event pools.
+typedef uint32_t ze_event_pool_counter_based_exp_flags_t;
+typedef enum _ze_event_pool_counter_based_exp_flag_t
+{
+    ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_IMMEDIATE = ZE_BIT(0),             ///< Counter-based event pool is used for immediate command lists (default)
+    ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE = ZE_BIT(1),         ///< Counter-based event pool is for non-immediate command lists
+    ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} ze_event_pool_counter_based_exp_flag_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Event pool descriptor for counter-based events. This structure may be
+///        passed to ::zeEventPoolCreate as pNext member of
+///        ::ze_event_pool_desc_t.
+typedef struct _ze_event_pool_counter_based_exp_desc_t
+{
+    ze_structure_type_t stype;                                              ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_event_pool_counter_based_exp_flags_t flags;                          ///< [in] mode flags.
+                                                                            ///< must be 0 (default) or a valid value of ::ze_event_pool_counter_based_exp_flag_t
+                                                                            ///< default behavior is counter-based event pool is only used for
+                                                                            ///< immediate command lists.
+
+} ze_event_pool_counter_based_exp_desc_t;
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
diff --git a/deps/level_zero/include/zes_api.h b/deps/level_zero/include/zes_api.h
index 24462237a..f7a8e9a0e 100644
--- a/deps/level_zero/include/zes_api.h
+++ b/deps/level_zero/include/zes_api.h
@@ -5,7 +5,7 @@
  * SPDX-License-Identifier: MIT
  *
  * @file zes_api.h
- * @version v1.6-r1.6.3
+ * @version v1.8-r1.8.0
  *
  */
 #ifndef _ZES_API_H
@@ -101,47 +101,55 @@ typedef struct _zes_overclock_handle_t *zes_overclock_handle_t;
 /// @brief Defines structure types
 typedef enum _zes_structure_type_t
 {
-    ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,     ///< ::zes_device_properties_t
-    ZES_STRUCTURE_TYPE_PCI_PROPERTIES = 0x2,        ///< ::zes_pci_properties_t
-    ZES_STRUCTURE_TYPE_PCI_BAR_PROPERTIES = 0x3,    ///< ::zes_pci_bar_properties_t
-    ZES_STRUCTURE_TYPE_DIAG_PROPERTIES = 0x4,       ///< ::zes_diag_properties_t
-    ZES_STRUCTURE_TYPE_ENGINE_PROPERTIES = 0x5,     ///< ::zes_engine_properties_t
-    ZES_STRUCTURE_TYPE_FABRIC_PORT_PROPERTIES = 0x6,///< ::zes_fabric_port_properties_t
-    ZES_STRUCTURE_TYPE_FAN_PROPERTIES = 0x7,        ///< ::zes_fan_properties_t
-    ZES_STRUCTURE_TYPE_FIRMWARE_PROPERTIES = 0x8,   ///< ::zes_firmware_properties_t
-    ZES_STRUCTURE_TYPE_FREQ_PROPERTIES = 0x9,       ///< ::zes_freq_properties_t
-    ZES_STRUCTURE_TYPE_LED_PROPERTIES = 0xa,        ///< ::zes_led_properties_t
-    ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,        ///< ::zes_mem_properties_t
-    ZES_STRUCTURE_TYPE_PERF_PROPERTIES = 0xc,       ///< ::zes_perf_properties_t
-    ZES_STRUCTURE_TYPE_POWER_PROPERTIES = 0xd,      ///< ::zes_power_properties_t
-    ZES_STRUCTURE_TYPE_PSU_PROPERTIES = 0xe,        ///< ::zes_psu_properties_t
-    ZES_STRUCTURE_TYPE_RAS_PROPERTIES = 0xf,        ///< ::zes_ras_properties_t
-    ZES_STRUCTURE_TYPE_SCHED_PROPERTIES = 0x10,     ///< ::zes_sched_properties_t
-    ZES_STRUCTURE_TYPE_SCHED_TIMEOUT_PROPERTIES = 0x11, ///< ::zes_sched_timeout_properties_t
-    ZES_STRUCTURE_TYPE_SCHED_TIMESLICE_PROPERTIES = 0x12,   ///< ::zes_sched_timeslice_properties_t
-    ZES_STRUCTURE_TYPE_STANDBY_PROPERTIES = 0x13,   ///< ::zes_standby_properties_t
-    ZES_STRUCTURE_TYPE_TEMP_PROPERTIES = 0x14,      ///< ::zes_temp_properties_t
-    ZES_STRUCTURE_TYPE_DEVICE_STATE = 0x15,         ///< ::zes_device_state_t
-    ZES_STRUCTURE_TYPE_PROCESS_STATE = 0x16,        ///< ::zes_process_state_t
-    ZES_STRUCTURE_TYPE_PCI_STATE = 0x17,            ///< ::zes_pci_state_t
-    ZES_STRUCTURE_TYPE_FABRIC_PORT_CONFIG = 0x18,   ///< ::zes_fabric_port_config_t
-    ZES_STRUCTURE_TYPE_FABRIC_PORT_STATE = 0x19,    ///< ::zes_fabric_port_state_t
-    ZES_STRUCTURE_TYPE_FAN_CONFIG = 0x1a,           ///< ::zes_fan_config_t
-    ZES_STRUCTURE_TYPE_FREQ_STATE = 0x1b,           ///< ::zes_freq_state_t
-    ZES_STRUCTURE_TYPE_OC_CAPABILITIES = 0x1c,      ///< ::zes_oc_capabilities_t
-    ZES_STRUCTURE_TYPE_LED_STATE = 0x1d,            ///< ::zes_led_state_t
-    ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,            ///< ::zes_mem_state_t
-    ZES_STRUCTURE_TYPE_PSU_STATE = 0x1f,            ///< ::zes_psu_state_t
-    ZES_STRUCTURE_TYPE_BASE_STATE = 0x20,           ///< ::zes_base_state_t
-    ZES_STRUCTURE_TYPE_RAS_CONFIG = 0x21,           ///< ::zes_ras_config_t
-    ZES_STRUCTURE_TYPE_RAS_STATE = 0x22,            ///< ::zes_ras_state_t
-    ZES_STRUCTURE_TYPE_TEMP_CONFIG = 0x23,          ///< ::zes_temp_config_t
-    ZES_STRUCTURE_TYPE_PCI_BAR_PROPERTIES_1_2 = 0x24,   ///< ::zes_pci_bar_properties_1_2_t
-    ZES_STRUCTURE_TYPE_DEVICE_ECC_DESC = 0x25,      ///< ::zes_device_ecc_desc_t
-    ZES_STRUCTURE_TYPE_DEVICE_ECC_PROPERTIES = 0x26,///< ::zes_device_ecc_properties_t
-    ZES_STRUCTURE_TYPE_POWER_LIMIT_EXT_DESC = 0x27, ///< ::zes_power_limit_ext_desc_t
-    ZES_STRUCTURE_TYPE_POWER_EXT_PROPERTIES = 0x28, ///< ::zes_power_ext_properties_t
-    ZES_STRUCTURE_TYPE_OVERCLOCK_PROPERTIES = 0x29, ///< ::zes_overclock_properties_t
+    ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,                             ///< ::zes_device_properties_t
+    ZES_STRUCTURE_TYPE_PCI_PROPERTIES = 0x2,                                ///< ::zes_pci_properties_t
+    ZES_STRUCTURE_TYPE_PCI_BAR_PROPERTIES = 0x3,                            ///< ::zes_pci_bar_properties_t
+    ZES_STRUCTURE_TYPE_DIAG_PROPERTIES = 0x4,                               ///< ::zes_diag_properties_t
+    ZES_STRUCTURE_TYPE_ENGINE_PROPERTIES = 0x5,                             ///< ::zes_engine_properties_t
+    ZES_STRUCTURE_TYPE_FABRIC_PORT_PROPERTIES = 0x6,                        ///< ::zes_fabric_port_properties_t
+    ZES_STRUCTURE_TYPE_FAN_PROPERTIES = 0x7,                                ///< ::zes_fan_properties_t
+    ZES_STRUCTURE_TYPE_FIRMWARE_PROPERTIES = 0x8,                           ///< ::zes_firmware_properties_t
+    ZES_STRUCTURE_TYPE_FREQ_PROPERTIES = 0x9,                               ///< ::zes_freq_properties_t
+    ZES_STRUCTURE_TYPE_LED_PROPERTIES = 0xa,                                ///< ::zes_led_properties_t
+    ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,                                ///< ::zes_mem_properties_t
+    ZES_STRUCTURE_TYPE_PERF_PROPERTIES = 0xc,                               ///< ::zes_perf_properties_t
+    ZES_STRUCTURE_TYPE_POWER_PROPERTIES = 0xd,                              ///< ::zes_power_properties_t
+    ZES_STRUCTURE_TYPE_PSU_PROPERTIES = 0xe,                                ///< ::zes_psu_properties_t
+    ZES_STRUCTURE_TYPE_RAS_PROPERTIES = 0xf,                                ///< ::zes_ras_properties_t
+    ZES_STRUCTURE_TYPE_SCHED_PROPERTIES = 0x10,                             ///< ::zes_sched_properties_t
+    ZES_STRUCTURE_TYPE_SCHED_TIMEOUT_PROPERTIES = 0x11,                     ///< ::zes_sched_timeout_properties_t
+    ZES_STRUCTURE_TYPE_SCHED_TIMESLICE_PROPERTIES = 0x12,                   ///< ::zes_sched_timeslice_properties_t
+    ZES_STRUCTURE_TYPE_STANDBY_PROPERTIES = 0x13,                           ///< ::zes_standby_properties_t
+    ZES_STRUCTURE_TYPE_TEMP_PROPERTIES = 0x14,                              ///< ::zes_temp_properties_t
+    ZES_STRUCTURE_TYPE_DEVICE_STATE = 0x15,                                 ///< ::zes_device_state_t
+    ZES_STRUCTURE_TYPE_PROCESS_STATE = 0x16,                                ///< ::zes_process_state_t
+    ZES_STRUCTURE_TYPE_PCI_STATE = 0x17,                                    ///< ::zes_pci_state_t
+    ZES_STRUCTURE_TYPE_FABRIC_PORT_CONFIG = 0x18,                           ///< ::zes_fabric_port_config_t
+    ZES_STRUCTURE_TYPE_FABRIC_PORT_STATE = 0x19,                            ///< ::zes_fabric_port_state_t
+    ZES_STRUCTURE_TYPE_FAN_CONFIG = 0x1a,                                   ///< ::zes_fan_config_t
+    ZES_STRUCTURE_TYPE_FREQ_STATE = 0x1b,                                   ///< ::zes_freq_state_t
+    ZES_STRUCTURE_TYPE_OC_CAPABILITIES = 0x1c,                              ///< ::zes_oc_capabilities_t
+    ZES_STRUCTURE_TYPE_LED_STATE = 0x1d,                                    ///< ::zes_led_state_t
+    ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,                                    ///< ::zes_mem_state_t
+    ZES_STRUCTURE_TYPE_PSU_STATE = 0x1f,                                    ///< ::zes_psu_state_t
+    ZES_STRUCTURE_TYPE_BASE_STATE = 0x20,                                   ///< ::zes_base_state_t
+    ZES_STRUCTURE_TYPE_RAS_CONFIG = 0x21,                                   ///< ::zes_ras_config_t
+    ZES_STRUCTURE_TYPE_RAS_STATE = 0x22,                                    ///< ::zes_ras_state_t
+    ZES_STRUCTURE_TYPE_TEMP_CONFIG = 0x23,                                  ///< ::zes_temp_config_t
+    ZES_STRUCTURE_TYPE_PCI_BAR_PROPERTIES_1_2 = 0x24,                       ///< ::zes_pci_bar_properties_1_2_t
+    ZES_STRUCTURE_TYPE_DEVICE_ECC_DESC = 0x25,                              ///< ::zes_device_ecc_desc_t
+    ZES_STRUCTURE_TYPE_DEVICE_ECC_PROPERTIES = 0x26,                        ///< ::zes_device_ecc_properties_t
+    ZES_STRUCTURE_TYPE_POWER_LIMIT_EXT_DESC = 0x27,                         ///< ::zes_power_limit_ext_desc_t
+    ZES_STRUCTURE_TYPE_POWER_EXT_PROPERTIES = 0x28,                         ///< ::zes_power_ext_properties_t
+    ZES_STRUCTURE_TYPE_OVERCLOCK_PROPERTIES = 0x29,                         ///< ::zes_overclock_properties_t
+    ZES_STRUCTURE_TYPE_FABRIC_PORT_ERROR_COUNTERS = 0x2a,                   ///< ::zes_fabric_port_error_counters_t
+    ZES_STRUCTURE_TYPE_ENGINE_EXT_PROPERTIES = 0x2b,                        ///< ::zes_engine_ext_properties_t
+    ZES_STRUCTURE_TYPE_RESET_PROPERTIES = 0x2c,                             ///< ::zes_reset_properties_t
+    ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,                        ///< ::zes_device_ext_properties_t
+    ZES_STRUCTURE_TYPE_DEVICE_UUID = 0x2e,                                  ///< ::zes_uuid_t
+    ZES_STRUCTURE_TYPE_POWER_DOMAIN_EXP_PROPERTIES = 0x00020001,            ///< ::zes_power_domain_exp_properties_t
+    ZES_STRUCTURE_TYPE_MEM_TIMESTAMP_BITS_EXP = 0x00020002,                 ///< ::zes_mem_timestamp_bits_exp_t
+    ZES_STRUCTURE_TYPE_MEMORY_PAGE_OFFLINE_STATE_EXP = 0x00020003,          ///< ::zes_mem_page_offline_state_exp_t
     ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } zes_structure_type_t;
@@ -150,9 +158,9 @@ typedef enum _zes_structure_type_t
 /// @brief Base for all properties types
 typedef struct _zes_base_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
 
 } zes_base_properties_t;
 
@@ -160,9 +168,9 @@ typedef struct _zes_base_properties_t
 /// @brief Base for all descriptor types
 typedef struct _zes_base_desc_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
 
 } zes_base_desc_t;
 
@@ -170,9 +178,9 @@ typedef struct _zes_base_desc_t
 /// @brief Base for all state types
 typedef struct _zes_base_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
 
 } zes_base_state_t;
 
@@ -180,9 +188,9 @@ typedef struct _zes_base_state_t
 /// @brief Base for all config types
 typedef struct _zes_base_config_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
 
 } zes_base_config_t;
 
@@ -190,9 +198,9 @@ typedef struct _zes_base_config_t
 /// @brief Base for all capability types
 typedef struct _zes_base_capability_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
 
 } zes_base_capability_t;
 
@@ -216,14 +224,30 @@ typedef struct _zes_base_config_t zes_base_config_t;
 /// @brief Forward-declare zes_base_capability_t
 typedef struct _zes_base_capability_t zes_base_capability_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_driver_extension_properties_t
+typedef struct _zes_driver_extension_properties_t zes_driver_extension_properties_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Forward-declare zes_device_state_t
 typedef struct _zes_device_state_t zes_device_state_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_reset_properties_t
+typedef struct _zes_reset_properties_t zes_reset_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_uuid_t
+typedef struct _zes_uuid_t zes_uuid_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Forward-declare zes_device_properties_t
 typedef struct _zes_device_properties_t zes_device_properties_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_device_ext_properties_t
+typedef struct _zes_device_ext_properties_t zes_device_ext_properties_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Forward-declare zes_process_state_t
 typedef struct _zes_process_state_t zes_process_state_t;
@@ -320,6 +344,10 @@ typedef struct _zes_fabric_port_state_t zes_fabric_port_state_t;
 /// @brief Forward-declare zes_fabric_port_throughput_t
 typedef struct _zes_fabric_port_throughput_t zes_fabric_port_throughput_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_fabric_port_error_counters_t
+typedef struct _zes_fabric_port_error_counters_t zes_fabric_port_error_counters_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Forward-declare zes_fan_speed_t
 typedef struct _zes_fan_speed_t zes_fan_speed_t;
@@ -388,6 +416,10 @@ typedef struct _zes_mem_state_t zes_mem_state_t;
 /// @brief Forward-declare zes_mem_bandwidth_t
 typedef struct _zes_mem_bandwidth_t zes_mem_bandwidth_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_mem_ext_bandwidth_t
+typedef struct _zes_mem_ext_bandwidth_t zes_mem_ext_bandwidth_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Forward-declare zes_perf_properties_t
 typedef struct _zes_perf_properties_t zes_perf_properties_t;
@@ -472,6 +504,26 @@ typedef struct _zes_power_limit_ext_desc_t zes_power_limit_ext_desc_t;
 /// @brief Forward-declare zes_power_ext_properties_t
 typedef struct _zes_power_ext_properties_t zes_power_ext_properties_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_engine_ext_properties_t
+typedef struct _zes_engine_ext_properties_t zes_engine_ext_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_ras_state_exp_t
+typedef struct _zes_ras_state_exp_t zes_ras_state_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_mem_page_offline_state_exp_t
+typedef struct _zes_mem_page_offline_state_exp_t zes_mem_page_offline_state_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_mem_timestamp_bits_exp_t
+typedef struct _zes_mem_timestamp_bits_exp_t zes_mem_timestamp_bits_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Forward-declare zes_power_domain_exp_properties_t
+typedef struct _zes_power_domain_exp_properties_t zes_power_domain_exp_properties_t;
+
 
 #if !defined(__GNUC__)
 #pragma endregion
@@ -485,7 +537,7 @@ typedef struct _zes_power_ext_properties_t zes_power_ext_properties_t;
 typedef uint32_t zes_init_flags_t;
 typedef enum _zes_init_flag_t
 {
-    ZES_INIT_FLAG_PLACEHOLDER = ZE_BIT(0),          ///< placeholder for future use
+    ZES_INIT_FLAG_PLACEHOLDER = ZE_BIT(0),                                  ///< placeholder for future use
     ZES_INIT_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_init_flag_t;
@@ -521,8 +573,8 @@ typedef enum _zes_init_flag_t
 ///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesInit(
-    zes_init_flags_t flags                          ///< [in] initialization flags.
-                                                    ///< currently unused, must be 0 (default).
+    zes_init_flags_t flags                                                  ///< [in] initialization flags.
+                                                                            ///< currently unused, must be 0 (default).
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -547,15 +599,88 @@ zesInit(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDriverGet(
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of sysman driver instances.
-                                                    ///< if count is zero, then the loader shall update the value with the
-                                                    ///< total number of sysman drivers available.
-                                                    ///< if count is greater than the number of sysman drivers available, then
-                                                    ///< the loader shall update the value with the correct number of sysman
-                                                    ///< drivers available.
-    zes_driver_handle_t* phDrivers                  ///< [in,out][optional][range(0, *pCount)] array of sysman driver instance handles.
-                                                    ///< if count is less than the number of sysman drivers available, then the
-                                                    ///< loader shall only retrieve that number of sysman drivers.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of sysman driver instances.
+                                                                            ///< if count is zero, then the loader shall update the value with the
+                                                                            ///< total number of sysman drivers available.
+                                                                            ///< if count is greater than the number of sysman drivers available, then
+                                                                            ///< the loader shall update the value with the correct number of sysman
+                                                                            ///< drivers available.
+    zes_driver_handle_t* phDrivers                                          ///< [in,out][optional][range(0, *pCount)] array of sysman driver instance handles.
+                                                                            ///< if count is less than the number of sysman drivers available, then the
+                                                                            ///< loader shall only retrieve that number of sysman drivers.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZES_MAX_EXTENSION_NAME
+/// @brief Maximum extension name string size
+#define ZES_MAX_EXTENSION_NAME  256
+#endif // ZES_MAX_EXTENSION_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extension properties queried using ::zesDriverGetExtensionProperties
+typedef struct _zes_driver_extension_properties_t
+{
+    char name[ZES_MAX_EXTENSION_NAME];                                      ///< [out] extension name
+    uint32_t version;                                                       ///< [out] extension version using ::ZE_MAKE_VERSION
+
+} zes_driver_extension_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Retrieves extension properties
+/// 
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDriver`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pCount`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesDriverGetExtensionProperties(
+    zes_driver_handle_t hDriver,                                            ///< [in] handle of the driver instance
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of extension properties.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of extension properties available.
+                                                                            ///< if count is greater than the number of extension properties available,
+                                                                            ///< then the driver shall update the value with the correct number of
+                                                                            ///< extension properties available.
+    zes_driver_extension_properties_t* pExtensionProperties                 ///< [in,out][optional][range(0, *pCount)] array of query results for
+                                                                            ///< extension properties.
+                                                                            ///< if count is less than the number of extension properties available,
+                                                                            ///< then driver shall only retrieve that number of extension properties.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Retrieves function pointer for vendor-specific or experimental
+///        extensions
+/// 
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDriver`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == name`
+///         + `nullptr == ppFunctionAddress`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesDriverGetExtensionFunctionAddress(
+    zes_driver_handle_t hDriver,                                            ///< [in] handle of the driver instance
+    const char* name,                                                       ///< [in] extension function name
+    void** ppFunctionAddress                                                ///< [out] pointer to function pointer
     );
 
 #if !defined(__GNUC__)
@@ -572,8 +697,8 @@ zesDriverGet(
 ///     - Multiple calls to this function will return identical sysman device
 ///       handles, in the same order.
 ///     - The number and order of handles returned from this function is NOT
-///       affected by the ::ZE_AFFINITY_MASK or ::ZE_ENABLE_PCI_ID_DEVICE_ORDER
-///       environment variables.
+///       affected by the ::ZE_AFFINITY_MASK, ::ZE_ENABLE_PCI_ID_DEVICE_ORDER,
+///       or ::ZE_FLAT_DEVICE_HIERARCHY environment variables.
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
 /// 
@@ -589,16 +714,16 @@ zesDriverGet(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceGet(
-    zes_driver_handle_t hDriver,                    ///< [in] handle of the sysman driver instance
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of sysman devices.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of sysman devices available.
-                                                    ///< if count is greater than the number of sysman devices available, then
-                                                    ///< the driver shall update the value with the correct number of sysman
-                                                    ///< devices available.
-    zes_device_handle_t* phDevices                  ///< [in,out][optional][range(0, *pCount)] array of handle of sysman devices.
-                                                    ///< if count is less than the number of sysman devices available, then
-                                                    ///< driver shall only retrieve that number of sysman devices.
+    zes_driver_handle_t hDriver,                                            ///< [in] handle of the sysman driver instance
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of sysman devices.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of sysman devices available.
+                                                                            ///< if count is greater than the number of sysman devices available, then
+                                                                            ///< the driver shall update the value with the correct number of sysman
+                                                                            ///< devices available.
+    zes_device_handle_t* phDevices                                          ///< [in,out][optional][range(0, *pCount)] array of handle of sysman devices.
+                                                                            ///< if count is less than the number of sysman devices available, then
+                                                                            ///< driver shall only retrieve that number of sysman devices.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -607,17 +732,23 @@ zesDeviceGet(
 #define ZES_STRING_PROPERTY_SIZE  64
 #endif // ZES_STRING_PROPERTY_SIZE
 
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZES_MAX_UUID_SIZE
+/// @brief Maximum device universal unique id (UUID) size in bytes.
+#define ZES_MAX_UUID_SIZE  16
+#endif // ZES_MAX_UUID_SIZE
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Types of accelerator engines
 typedef uint32_t zes_engine_type_flags_t;
 typedef enum _zes_engine_type_flag_t
 {
-    ZES_ENGINE_TYPE_FLAG_OTHER = ZE_BIT(0),         ///< Undefined types of accelerators.
-    ZES_ENGINE_TYPE_FLAG_COMPUTE = ZE_BIT(1),       ///< Engines that process compute kernels only (no 3D content).
-    ZES_ENGINE_TYPE_FLAG_3D = ZE_BIT(2),            ///< Engines that process 3D content only (no compute kernels).
-    ZES_ENGINE_TYPE_FLAG_MEDIA = ZE_BIT(3),         ///< Engines that process media workloads.
-    ZES_ENGINE_TYPE_FLAG_DMA = ZE_BIT(4),           ///< Engines that copy blocks of data.
-    ZES_ENGINE_TYPE_FLAG_RENDER = ZE_BIT(5),        ///< Engines that can process both 3D content and compute kernels.
+    ZES_ENGINE_TYPE_FLAG_OTHER = ZE_BIT(0),                                 ///< Undefined types of accelerators.
+    ZES_ENGINE_TYPE_FLAG_COMPUTE = ZE_BIT(1),                               ///< Engines that process compute kernels only (no 3D content).
+    ZES_ENGINE_TYPE_FLAG_3D = ZE_BIT(2),                                    ///< Engines that process 3D content only (no compute kernels).
+    ZES_ENGINE_TYPE_FLAG_MEDIA = ZE_BIT(3),                                 ///< Engines that process media workloads.
+    ZES_ENGINE_TYPE_FLAG_DMA = ZE_BIT(4),                                   ///< Engines that copy blocks of data.
+    ZES_ENGINE_TYPE_FLAG_RENDER = ZE_BIT(5),                                ///< Engines that can process both 3D content and compute kernels.
     ZES_ENGINE_TYPE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_engine_type_flag_t;
@@ -626,9 +757,9 @@ typedef enum _zes_engine_type_flag_t
 /// @brief Device repair status
 typedef enum _zes_repair_status_t
 {
-    ZES_REPAIR_STATUS_UNSUPPORTED = 0,              ///< The device does not support in-field repairs.
-    ZES_REPAIR_STATUS_NOT_PERFORMED = 1,            ///< The device has never been repaired.
-    ZES_REPAIR_STATUS_PERFORMED = 2,                ///< The device has been repaired.
+    ZES_REPAIR_STATUS_UNSUPPORTED = 0,                                      ///< The device does not support in-field repairs.
+    ZES_REPAIR_STATUS_NOT_PERFORMED = 1,                                    ///< The device has never been repaired.
+    ZES_REPAIR_STATUS_PERFORMED = 2,                                        ///< The device has been repaired.
     ZES_REPAIR_STATUS_FORCE_UINT32 = 0x7fffffff
 
 } zes_repair_status_t;
@@ -638,57 +769,132 @@ typedef enum _zes_repair_status_t
 typedef uint32_t zes_reset_reason_flags_t;
 typedef enum _zes_reset_reason_flag_t
 {
-    ZES_RESET_REASON_FLAG_WEDGED = ZE_BIT(0),       ///< The device needs to be reset because one or more parts of the hardware
-                                                    ///< is wedged
-    ZES_RESET_REASON_FLAG_REPAIR = ZE_BIT(1),       ///< The device needs to be reset in order to complete in-field repairs
+    ZES_RESET_REASON_FLAG_WEDGED = ZE_BIT(0),                               ///< The device needs to be reset because one or more parts of the hardware
+                                                                            ///< is wedged
+    ZES_RESET_REASON_FLAG_REPAIR = ZE_BIT(1),                               ///< The device needs to be reset in order to complete in-field repairs
     ZES_RESET_REASON_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_reset_reason_flag_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Device reset type
+typedef enum _zes_reset_type_t
+{
+    ZES_RESET_TYPE_WARM = 0,                                                ///< Apply warm reset
+    ZES_RESET_TYPE_COLD = 1,                                                ///< Apply cold reset
+    ZES_RESET_TYPE_FLR = 2,                                                 ///< Apply FLR reset
+    ZES_RESET_TYPE_FORCE_UINT32 = 0x7fffffff
+
+} zes_reset_type_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Device state
 typedef struct _zes_device_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_reset_reason_flags_t reset;                 ///< [out] Indicates if the device needs to be reset and for what reasons.
-                                                    ///< returns 0 (none) or combination of ::zes_reset_reason_flag_t
-    zes_repair_status_t repaired;                   ///< [out] Indicates if the device has been repaired
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_reset_reason_flags_t reset;                                         ///< [out] Indicates if the device needs to be reset and for what reasons.
+                                                                            ///< returns 0 (none) or combination of ::zes_reset_reason_flag_t
+    zes_repair_status_t repaired;                                           ///< [out] Indicates if the device has been repaired
 
 } zes_device_state_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Device reset properties
+typedef struct _zes_reset_properties_t
+{
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t force;                                                        ///< [in] If set to true, all applications that are currently using the
+                                                                            ///< device will be forcibly killed.
+    zes_reset_type_t resetType;                                             ///< [in] Type of reset needs to be performed
+
+} zes_reset_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Device universal unique id (UUID)
+typedef struct _zes_uuid_t
+{
+    uint8_t id[ZES_MAX_UUID_SIZE];                                          ///< [out] opaque data representing a device UUID
+
+} zes_uuid_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported device types
+typedef enum _zes_device_type_t
+{
+    ZES_DEVICE_TYPE_GPU = 1,                                                ///< Graphics Processing Unit
+    ZES_DEVICE_TYPE_CPU = 2,                                                ///< Central Processing Unit
+    ZES_DEVICE_TYPE_FPGA = 3,                                               ///< Field Programmable Gate Array
+    ZES_DEVICE_TYPE_MCA = 4,                                                ///< Memory Copy Accelerator
+    ZES_DEVICE_TYPE_VPU = 5,                                                ///< Vision Processing Unit
+    ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
+
+} zes_device_type_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Supported device property flags
+typedef uint32_t zes_device_property_flags_t;
+typedef enum _zes_device_property_flag_t
+{
+    ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),                        ///< Device is integrated with the Host.
+    ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),                         ///< Device handle used for query represents a sub-device.
+    ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),                               ///< Device supports error correction memory access.
+    ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),                    ///< Device supports on-demand page-faulting.
+    ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
+
+} zes_device_property_flag_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Device properties
 typedef struct _zes_device_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_device_properties_t core;                    ///< [out] Core device properties
-    uint32_t numSubdevices;                         ///< [out] Number of sub-devices. A value of 0 indicates that this device
-                                                    ///< doesn't have sub-devices.
-    char serialNumber[ZES_STRING_PROPERTY_SIZE];    ///< [out] Manufacturing serial number (NULL terminated string value). Will
-                                                    ///< be set to the string "unkown" if this cannot be determined for the
-                                                    ///< device.
-    char boardNumber[ZES_STRING_PROPERTY_SIZE];     ///< [out] Manufacturing board number (NULL terminated string value). Will
-                                                    ///< be set to the string "unkown" if this cannot be determined for the
-                                                    ///< device.
-    char brandName[ZES_STRING_PROPERTY_SIZE];       ///< [out] Brand name of the device (NULL terminated string value). Will be
-                                                    ///< set to the string "unkown" if this cannot be determined for the
-                                                    ///< device.
-    char modelName[ZES_STRING_PROPERTY_SIZE];       ///< [out] Model name of the device (NULL terminated string value). Will be
-                                                    ///< set to the string "unkown" if this cannot be determined for the
-                                                    ///< device.
-    char vendorName[ZES_STRING_PROPERTY_SIZE];      ///< [out] Vendor name of the device (NULL terminated string value). Will
-                                                    ///< be set to the string "unkown" if this cannot be determined for the
-                                                    ///< device.
-    char driverVersion[ZES_STRING_PROPERTY_SIZE];   ///< [out] Installed driver version (NULL terminated string value). Will be
-                                                    ///< set to the string "unkown" if this cannot be determined for the
-                                                    ///< device.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_device_properties_t core;                                            ///< [out] (Deprecated, use ::zes_uuid_t in the extended structure) Core
+                                                                            ///< device properties
+    uint32_t numSubdevices;                                                 ///< [out] Number of sub-devices. A value of 0 indicates that this device
+                                                                            ///< doesn't have sub-devices.
+    char serialNumber[ZES_STRING_PROPERTY_SIZE];                            ///< [out] Manufacturing serial number (NULL terminated string value). This
+                                                                            ///< value is intended to reflect the Part ID/SoC ID assigned by
+                                                                            ///< manufacturer that is unique for a SoC. Will be set to the string
+                                                                            ///< "unknown" if this cannot be determined for the device.
+    char boardNumber[ZES_STRING_PROPERTY_SIZE];                             ///< [out] Manufacturing board number (NULL terminated string value).
+                                                                            ///< Alternatively "boardSerialNumber", this value is intended to reflect
+                                                                            ///< the string printed on board label by manufacturer. Will be set to the
+                                                                            ///< string "unknown" if this cannot be determined for the device.
+    char brandName[ZES_STRING_PROPERTY_SIZE];                               ///< [out] Brand name of the device (NULL terminated string value). Will be
+                                                                            ///< set to the string "unknown" if this cannot be determined for the
+                                                                            ///< device.
+    char modelName[ZES_STRING_PROPERTY_SIZE];                               ///< [out] Model name of the device (NULL terminated string value). Will be
+                                                                            ///< set to the string "unknown" if this cannot be determined for the
+                                                                            ///< device.
+    char vendorName[ZES_STRING_PROPERTY_SIZE];                              ///< [out] Vendor name of the device (NULL terminated string value). Will
+                                                                            ///< be set to the string "unknown" if this cannot be determined for the
+                                                                            ///< device.
+    char driverVersion[ZES_STRING_PROPERTY_SIZE];                           ///< [out] Installed driver version (NULL terminated string value). Will be
+                                                                            ///< set to the string "unknown" if this cannot be determined for the
+                                                                            ///< device.
 
 } zes_device_properties_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Device extension properties
+typedef struct _zes_device_ext_properties_t
+{
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_uuid_t uuid;                                                        ///< [out] universal unique identifier. Note: uuid obtained from Sysman API
+                                                                            ///< is the same as from core API. Subdevices will have their own uuid.
+    zes_device_type_t type;                                                 ///< [out] generic device type
+    zes_device_property_flags_t flags;                                      ///< [out] 0 (none) or a valid combination of ::zes_device_property_flag_t
+
+} zes_device_ext_properties_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Get properties about the device
 /// 
@@ -708,8 +914,8 @@ typedef struct _zes_device_properties_t
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceGetProperties(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    zes_device_properties_t* pProperties            ///< [in,out] Structure that will contain information about the device.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    zes_device_properties_t* pProperties                                    ///< [in,out] Structure that will contain information about the device.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -732,8 +938,8 @@ zesDeviceGetProperties(
 ///         + `nullptr == pState`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceGetState(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    zes_device_state_t* pState                      ///< [in,out] Structure that will contain information about the device.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    zes_device_state_t* pState                                              ///< [in,out] Structure that will contain information about the device.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -746,8 +952,9 @@ zesDeviceGetState(
 ///       this function.
 ///     - If the force argument is specified, all applications using the device
 ///       will be forcibly killed.
-///     - The function will block until the device has restarted or a timeout
-///       occurred waiting for the reset to complete.
+///     - The function will block until the device has restarted or an
+///       implementation defined timeout occurred waiting for the reset to
+///       complete.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -759,13 +966,51 @@ zesDeviceGetState(
 ///         + `nullptr == hDevice`
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to perform this operation.
-///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - "Reset cannot be performed because applications are using this device."
-///     - ::ZE_RESULT_ERROR_UNKNOWN - "There were problems unloading the device driver, performing a bus reset or reloading the device driver."
+///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
+///         + Reset cannot be performed because applications are using this device.
+///     - ::ZE_RESULT_ERROR_UNKNOWN
+///         + There were problems unloading the device driver, performing a bus reset or reloading the device driver.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceReset(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle for the device
-    ze_bool_t force                                 ///< [in] If set to true, all applications that are currently using the
-                                                    ///< device will be forcibly killed.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle for the device
+    ze_bool_t force                                                         ///< [in] If set to true, all applications that are currently using the
+                                                                            ///< device will be forcibly killed.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Reset device extension
+/// 
+/// @details
+///     - Performs a PCI bus reset of the device. This will result in all
+///       current device state being lost.
+///     - Prior to calling this function, user is responsible for closing
+///       applications using the device unless force argument is specified.
+///     - If the force argument is specified, all applications using the device
+///       will be forcibly killed.
+///     - The function will block until the device has restarted or an
+///       implementation specific timeout occurred waiting for the reset to
+///       complete.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDevice`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pProperties`
+///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
+///         + User does not have permissions to perform this operation.
+///     - ::ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE
+///         + Reset cannot be performed because applications are using this device.
+///     - ::ZE_RESULT_ERROR_UNKNOWN
+///         + There were problems unloading the device driver, performing a bus reset or reloading the device driver.
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesDeviceResetExt(
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle for the device
+    zes_reset_properties_t* pProperties                                     ///< [in] Device reset properties to apply
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -777,15 +1022,15 @@ zesDeviceReset(
 ///       and the path to the executable.
 typedef struct _zes_process_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint32_t processId;                             ///< [out] Host OS process ID.
-    uint64_t memSize;                               ///< [out] Device memory size in bytes allocated by this process (may not
-                                                    ///< necessarily be resident on the device at the time of reading).
-    uint64_t sharedSize;                            ///< [out] The size of shared device memory mapped into this process (may
-                                                    ///< not necessarily be resident on the device at the time of reading).
-    zes_engine_type_flags_t engines;                ///< [out] Bitfield of accelerator engine types being used by this process.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t processId;                                                     ///< [out] Host OS process ID.
+    uint64_t memSize;                                                       ///< [out] Device memory size in bytes allocated by this process (may not
+                                                                            ///< necessarily be resident on the device at the time of reading).
+    uint64_t sharedSize;                                                    ///< [out] The size of shared device memory mapped into this process (may
+                                                                            ///< not necessarily be resident on the device at the time of reading).
+    zes_engine_type_flags_t engines;                                        ///< [out] Bitfield of accelerator engine types being used by this process.
 
 } zes_process_state_t;
 
@@ -817,27 +1062,27 @@ typedef struct _zes_process_state_t
 ///         + The provided value of pCount is not big enough to store information about all the processes currently attached to the device.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceProcessesGetState(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle for the device
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of processes.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of processes currently attached to the device.
-                                                    ///< if count is greater than the number of processes currently attached to
-                                                    ///< the device, then the driver shall update the value with the correct
-                                                    ///< number of processes.
-    zes_process_state_t* pProcesses                 ///< [in,out][optional][range(0, *pCount)] array of process information.
-                                                    ///< if count is less than the number of processes currently attached to
-                                                    ///< the device, then the driver shall only retrieve information about that
-                                                    ///< number of processes. In this case, the return code will ::ZE_RESULT_ERROR_INVALID_SIZE.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle for the device
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of processes.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of processes currently attached to the device.
+                                                                            ///< if count is greater than the number of processes currently attached to
+                                                                            ///< the device, then the driver shall update the value with the correct
+                                                                            ///< number of processes.
+    zes_process_state_t* pProcesses                                         ///< [in,out][optional][range(0, *pCount)] array of process information.
+                                                                            ///< if count is less than the number of processes currently attached to
+                                                                            ///< the device, then the driver shall only retrieve information about that
+                                                                            ///< number of processes. In this case, the return code will be ::ZE_RESULT_ERROR_INVALID_SIZE.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief PCI address
 typedef struct _zes_pci_address_t
 {
-    uint32_t domain;                                ///< [out] BDF domain
-    uint32_t bus;                                   ///< [out] BDF bus
-    uint32_t device;                                ///< [out] BDF device
-    uint32_t function;                              ///< [out] BDF function
+    uint32_t domain;                                                        ///< [out] BDF domain
+    uint32_t bus;                                                           ///< [out] BDF bus
+    uint32_t device;                                                        ///< [out] BDF device
+    uint32_t function;                                                      ///< [out] BDF function
 
 } zes_pci_address_t;
 
@@ -845,12 +1090,12 @@ typedef struct _zes_pci_address_t
 /// @brief PCI speed
 typedef struct _zes_pci_speed_t
 {
-    int32_t gen;                                    ///< [out] The link generation. A value of -1 means that this property is
-                                                    ///< unknown.
-    int32_t width;                                  ///< [out] The number of lanes. A value of -1 means that this property is
-                                                    ///< unknown.
-    int64_t maxBandwidth;                           ///< [out] The maximum bandwidth in bytes/sec (sum of all lanes). A value
-                                                    ///< of -1 means that this property is unknown.
+    int32_t gen;                                                            ///< [out] The link generation. A value of -1 means that this property is
+                                                                            ///< unknown.
+    int32_t width;                                                          ///< [out] The number of lanes. A value of -1 means that this property is
+                                                                            ///< unknown.
+    int64_t maxBandwidth;                                                   ///< [out] The maximum bandwidth in bytes/sec (sum of all lanes). A value
+                                                                            ///< of -1 means that this property is unknown.
 
 } zes_pci_speed_t;
 
@@ -858,18 +1103,18 @@ typedef struct _zes_pci_speed_t
 /// @brief Static PCI properties
 typedef struct _zes_pci_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_pci_address_t address;                      ///< [out] The BDF address
-    zes_pci_speed_t maxSpeed;                       ///< [out] Fastest port configuration supported by the device (sum of all
-                                                    ///< lanes)
-    ze_bool_t haveBandwidthCounters;                ///< [out] Indicates if ::zes_pci_stats_t.rxCounter and
-                                                    ///< ::zes_pci_stats_t.txCounter will have valid values
-    ze_bool_t havePacketCounters;                   ///< [out] Indicates if ::zes_pci_stats_t.packetCounter will have valid
-                                                    ///< values
-    ze_bool_t haveReplayCounters;                   ///< [out] Indicates if ::zes_pci_stats_t.replayCounter will have valid
-                                                    ///< values
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_pci_address_t address;                                              ///< [out] The BDF address
+    zes_pci_speed_t maxSpeed;                                               ///< [out] Fastest port configuration supported by the device (sum of all
+                                                                            ///< lanes)
+    ze_bool_t haveBandwidthCounters;                                        ///< [out] Indicates whether the `rxCounter` and `txCounter` members of
+                                                                            ///< ::zes_pci_stats_t will have valid values
+    ze_bool_t havePacketCounters;                                           ///< [out] Indicates whether the `packetCounter` member of
+                                                                            ///< ::zes_pci_stats_t will have a valid value
+    ze_bool_t haveReplayCounters;                                           ///< [out] Indicates whether the `replayCounter` member of
+                                                                            ///< ::zes_pci_stats_t will have a valid value
 
 } zes_pci_properties_t;
 
@@ -877,11 +1122,11 @@ typedef struct _zes_pci_properties_t
 /// @brief PCI link status
 typedef enum _zes_pci_link_status_t
 {
-    ZES_PCI_LINK_STATUS_UNKNOWN = 0,                ///< The link status could not be determined
-    ZES_PCI_LINK_STATUS_GOOD = 1,                   ///< The link is up and operating as expected
-    ZES_PCI_LINK_STATUS_QUALITY_ISSUES = 2,         ///< The link is up but has quality and/or bandwidth degradation
-    ZES_PCI_LINK_STATUS_STABILITY_ISSUES = 3,       ///< The link has stability issues and preventing workloads making forward
-                                                    ///< progress
+    ZES_PCI_LINK_STATUS_UNKNOWN = 0,                                        ///< The link status could not be determined
+    ZES_PCI_LINK_STATUS_GOOD = 1,                                           ///< The link is up and operating as expected
+    ZES_PCI_LINK_STATUS_QUALITY_ISSUES = 2,                                 ///< The link is up but has quality and/or bandwidth degradation
+    ZES_PCI_LINK_STATUS_STABILITY_ISSUES = 3,                               ///< The link has stability issues that prevent workloads from making forward
+                                                                            ///< progress
     ZES_PCI_LINK_STATUS_FORCE_UINT32 = 0x7fffffff
 
 } zes_pci_link_status_t;
@@ -891,8 +1136,8 @@ typedef enum _zes_pci_link_status_t
 typedef uint32_t zes_pci_link_qual_issue_flags_t;
 typedef enum _zes_pci_link_qual_issue_flag_t
 {
-    ZES_PCI_LINK_QUAL_ISSUE_FLAG_REPLAYS = ZE_BIT(0),   ///< A significant number of replays are occurring
-    ZES_PCI_LINK_QUAL_ISSUE_FLAG_SPEED = ZE_BIT(1), ///< There is a degradation in the maximum bandwidth of the link
+    ZES_PCI_LINK_QUAL_ISSUE_FLAG_REPLAYS = ZE_BIT(0),                       ///< A significant number of replays are occurring
+    ZES_PCI_LINK_QUAL_ISSUE_FLAG_SPEED = ZE_BIT(1),                         ///< There is a degradation in the maximum bandwidth of the link
     ZES_PCI_LINK_QUAL_ISSUE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_pci_link_qual_issue_flag_t;
@@ -902,7 +1147,7 @@ typedef enum _zes_pci_link_qual_issue_flag_t
 typedef uint32_t zes_pci_link_stab_issue_flags_t;
 typedef enum _zes_pci_link_stab_issue_flag_t
 {
-    ZES_PCI_LINK_STAB_ISSUE_FLAG_RETRAINING = ZE_BIT(0),///< Link retraining has occurred to deal with quality issues
+    ZES_PCI_LINK_STAB_ISSUE_FLAG_RETRAINING = ZE_BIT(0),                    ///< Link retraining has occurred to deal with quality issues
     ZES_PCI_LINK_STAB_ISSUE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_pci_link_stab_issue_flag_t;
@@ -911,21 +1156,21 @@ typedef enum _zes_pci_link_stab_issue_flag_t
 /// @brief Dynamic PCI state
 typedef struct _zes_pci_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_pci_link_status_t status;                   ///< [out] The current status of the port
-    zes_pci_link_qual_issue_flags_t qualityIssues;  ///< [out] If status is ::ZES_PCI_LINK_STATUS_QUALITY_ISSUES, 
-                                                    ///< then this gives a combination of ::zes_pci_link_qual_issue_flag_t for
-                                                    ///< quality issues that have been detected;
-                                                    ///< otherwise, 0 indicates there are no quality issues with the link at
-                                                    ///< this time."
-    zes_pci_link_stab_issue_flags_t stabilityIssues;///< [out] If status is ::ZES_PCI_LINK_STATUS_STABILITY_ISSUES, 
-                                                    ///< then this gives a combination of ::zes_pci_link_stab_issue_flag_t for
-                                                    ///< reasons for the connection instability;
-                                                    ///< otherwise, 0 indicates there are no connection stability issues at
-                                                    ///< this time."
-    zes_pci_speed_t speed;                          ///< [out] The current port configure speed
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_pci_link_status_t status;                                           ///< [out] The current status of the port
+    zes_pci_link_qual_issue_flags_t qualityIssues;                          ///< [out] If status is ::ZES_PCI_LINK_STATUS_QUALITY_ISSUES, 
+                                                                            ///< then this gives a combination of ::zes_pci_link_qual_issue_flag_t for
+                                                                            ///< quality issues that have been detected;
+                                                                            ///< otherwise, 0 indicates there are no quality issues with the link at
+                                                                            ///< this time.
+    zes_pci_link_stab_issue_flags_t stabilityIssues;                        ///< [out] If status is ::ZES_PCI_LINK_STATUS_STABILITY_ISSUES, 
+                                                                            ///< then this gives a combination of ::zes_pci_link_stab_issue_flag_t for
+                                                                            ///< reasons for the connection instability;
+                                                                            ///< otherwise, 0 indicates there are no connection stability issues at
+                                                                            ///< this time.
+    zes_pci_speed_t speed;                                                  ///< [out] The current port configure speed
 
 } zes_pci_state_t;
 
@@ -933,9 +1178,9 @@ typedef struct _zes_pci_state_t
 /// @brief PCI bar types
 typedef enum _zes_pci_bar_type_t
 {
-    ZES_PCI_BAR_TYPE_MMIO = 0,                      ///< MMIO registers
-    ZES_PCI_BAR_TYPE_ROM = 1,                       ///< ROM aperture
-    ZES_PCI_BAR_TYPE_MEM = 2,                       ///< Device memory
+    ZES_PCI_BAR_TYPE_MMIO = 0,                                              ///< MMIO registers
+    ZES_PCI_BAR_TYPE_ROM = 1,                                               ///< ROM aperture
+    ZES_PCI_BAR_TYPE_MEM = 2,                                               ///< Device memory
     ZES_PCI_BAR_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } zes_pci_bar_type_t;
@@ -944,13 +1189,13 @@ typedef enum _zes_pci_bar_type_t
 /// @brief Properties of a pci bar
 typedef struct _zes_pci_bar_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_pci_bar_type_t type;                        ///< [out] The type of bar
-    uint32_t index;                                 ///< [out] The index of the bar
-    uint64_t base;                                  ///< [out] Base address of the bar.
-    uint64_t size;                                  ///< [out] Size of the bar.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_pci_bar_type_t type;                                                ///< [out] The type of bar
+    uint32_t index;                                                         ///< [out] The index of the bar
+    uint64_t base;                                                          ///< [out] Base address of the bar.
+    uint64_t size;                                                          ///< [out] Size of the bar.
 
 } zes_pci_bar_properties_t;
 
@@ -958,15 +1203,15 @@ typedef struct _zes_pci_bar_properties_t
 /// @brief Properties of a pci bar, including the resizable bar.
 typedef struct _zes_pci_bar_properties_1_2_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_pci_bar_type_t type;                        ///< [out] The type of bar
-    uint32_t index;                                 ///< [out] The index of the bar
-    uint64_t base;                                  ///< [out] Base address of the bar.
-    uint64_t size;                                  ///< [out] Size of the bar.
-    ze_bool_t resizableBarSupported;                ///< [out] Support for Resizable Bar on this device.
-    ze_bool_t resizableBarEnabled;                  ///< [out] Resizable Bar enabled on this device
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_pci_bar_type_t type;                                                ///< [out] The type of bar
+    uint32_t index;                                                         ///< [out] The index of the bar
+    uint64_t base;                                                          ///< [out] Base address of the bar.
+    uint64_t size;                                                          ///< [out] Size of the bar.
+    ze_bool_t resizableBarSupported;                                        ///< [out] Support for Resizable Bar on this device.
+    ze_bool_t resizableBarEnabled;                                          ///< [out] Resizable Bar enabled on this device
 
 } zes_pci_bar_properties_1_2_t;
 
@@ -983,27 +1228,27 @@ typedef struct _zes_pci_bar_properties_1_2_t
 ///       s1.timestamp))
 typedef struct _zes_pci_stats_t
 {
-    uint64_t timestamp;                             ///< [out] Monotonic timestamp counter in microseconds when the measurement
-                                                    ///< was made.
-                                                    ///< This timestamp should only be used to calculate delta time between
-                                                    ///< snapshots of this structure.
-                                                    ///< Never take the delta of this timestamp with the timestamp from a
-                                                    ///< different structure since they are not guaranteed to have the same base.
-                                                    ///< The absolute value of the timestamp is only valid during within the
-                                                    ///< application and may be different on the next execution.
-    uint64_t replayCounter;                         ///< [out] Monotonic counter for the number of replay packets (sum of all
-                                                    ///< lanes). Will always be 0 if ::zes_pci_properties_t.haveReplayCounters
-                                                    ///< is FALSE.
-    uint64_t packetCounter;                         ///< [out] Monotonic counter for the number of packets (sum of all lanes).
-                                                    ///< Will always be 0 if ::zes_pci_properties_t.havePacketCounters is
-                                                    ///< FALSE.
-    uint64_t rxCounter;                             ///< [out] Monotonic counter for the number of bytes received (sum of all
-                                                    ///< lanes). Will always be 0 if
-                                                    ///< ::zes_pci_properties_t.haveBandwidthCounters is FALSE.
-    uint64_t txCounter;                             ///< [out] Monotonic counter for the number of bytes transmitted (including
-                                                    ///< replays) (sum of all lanes). Will always be 0 if
-                                                    ///< ::zes_pci_properties_t.haveBandwidthCounters is FALSE.
-    zes_pci_speed_t speed;                          ///< [out] The current speed of the link (sum of all lanes)
+    uint64_t timestamp;                                                     ///< [out] Monotonic timestamp counter in microseconds when the measurement
+                                                                            ///< was made.
+                                                                            ///< This timestamp should only be used to calculate delta time between
+                                                                            ///< snapshots of this structure.
+                                                                            ///< Never take the delta of this timestamp with the timestamp from a
+                                                                            ///< different structure since they are not guaranteed to have the same base.
+                                                                            ///< The absolute value of the timestamp is only valid within the
+                                                                            ///< application and may be different on the next execution.
+    uint64_t replayCounter;                                                 ///< [out] Monotonic counter for the number of replay packets (sum of all
+                                                                            ///< lanes). Will always be 0 when the `haveReplayCounters` member of
+                                                                            ///< ::zes_pci_properties_t is FALSE.
+    uint64_t packetCounter;                                                 ///< [out] Monotonic counter for the number of packets (sum of all lanes).
+                                                                            ///< Will always be 0 when the `havePacketCounters` member of
+                                                                            ///< ::zes_pci_properties_t is FALSE.
+    uint64_t rxCounter;                                                     ///< [out] Monotonic counter for the number of bytes received (sum of all
+                                                                            ///< lanes). Will always be 0 when the `haveBandwidthCounters` member of
+                                                                            ///< ::zes_pci_properties_t is FALSE.
+    uint64_t txCounter;                                                     ///< [out] Monotonic counter for the number of bytes transmitted (including
+                                                                            ///< replays) (sum of all lanes). Will always be 0 when the
+                                                                            ///< `haveBandwidthCounters` member of ::zes_pci_properties_t is FALSE.
+    zes_pci_speed_t speed;                                                  ///< [out] The current speed of the link (sum of all lanes)
 
 } zes_pci_stats_t;
 
@@ -1026,8 +1271,8 @@ typedef struct _zes_pci_stats_t
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDevicePciGetProperties(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    zes_pci_properties_t* pProperties               ///< [in,out] Will contain the PCI properties.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    zes_pci_properties_t* pProperties                                       ///< [in,out] Will contain the PCI properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1049,8 +1294,8 @@ zesDevicePciGetProperties(
 ///         + `nullptr == pState`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDevicePciGetState(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    zes_pci_state_t* pState                         ///< [in,out] Will contain the PCI properties.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    zes_pci_state_t* pState                                                 ///< [in,out] Will contain the PCI properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1072,16 +1317,16 @@ zesDevicePciGetState(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDevicePciGetBars(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of PCI bars.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of PCI bars that are setup.
-                                                    ///< if count is greater than the number of PCI bars that are setup, then
-                                                    ///< the driver shall update the value with the correct number of PCI bars.
-    zes_pci_bar_properties_t* pProperties           ///< [in,out][optional][range(0, *pCount)] array of information about setup
-                                                    ///< PCI bars.
-                                                    ///< if count is less than the number of PCI bars that are setup, then the
-                                                    ///< driver shall only retrieve information about that number of PCI bars.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of PCI bars.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of PCI bars that are setup.
+                                                                            ///< if count is greater than the number of PCI bars that are setup, then
+                                                                            ///< the driver shall update the value with the correct number of PCI bars.
+    zes_pci_bar_properties_t* pProperties                                   ///< [in,out][optional][range(0, *pCount)] array of information about setup
+                                                                            ///< PCI bars.
+                                                                            ///< if count is less than the number of PCI bars that are setup, then the
+                                                                            ///< driver shall only retrieve information about that number of PCI bars.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1105,8 +1350,8 @@ zesDevicePciGetBars(
 ///         + User does not have permissions to query this telemetry.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDevicePciGetStats(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    zes_pci_stats_t* pStats                         ///< [in,out] Will contain a snapshot of the latest stats.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    zes_pci_stats_t* pStats                                                 ///< [in,out] Will contain a snapshot of the latest stats.
     );
 
 #if !defined(__GNUC__)
@@ -1120,15 +1365,15 @@ zesDevicePciGetStats(
 /// @brief Overclock domains.
 typedef enum _zes_overclock_domain_t
 {
-    ZES_OVERCLOCK_DOMAIN_CARD = 1,                  ///< Overclocking card level properties such as temperature limits.
-    ZES_OVERCLOCK_DOMAIN_PACKAGE = 2,               ///< Overclocking package level properties such as power limits.
-    ZES_OVERCLOCK_DOMAIN_GPU_ALL = 4,               ///< Overclocking a GPU that has all accelerator assets on the same PLL/VR.
-    ZES_OVERCLOCK_DOMAIN_GPU_RENDER_COMPUTE = 8,    ///< Overclocking a GPU with render and compute assets on the same PLL/VR.
-    ZES_OVERCLOCK_DOMAIN_GPU_RENDER = 16,           ///< Overclocking a GPU with render assets on its own PLL/VR.
-    ZES_OVERCLOCK_DOMAIN_GPU_COMPUTE = 32,          ///< Overclocking a GPU with compute assets on its own PLL/VR.
-    ZES_OVERCLOCK_DOMAIN_GPU_MEDIA = 64,            ///< Overclocking a GPU with media assets on its own PLL/VR.
-    ZES_OVERCLOCK_DOMAIN_VRAM = 128,                ///< Overclocking device local memory.
-    ZES_OVERCLOCK_DOMAIN_ADM = 256,                 ///< Overclocking LLC/L4 cache.
+    ZES_OVERCLOCK_DOMAIN_CARD = 1,                                          ///< Overclocking card level properties such as temperature limits.
+    ZES_OVERCLOCK_DOMAIN_PACKAGE = 2,                                       ///< Overclocking package level properties such as power limits.
+    ZES_OVERCLOCK_DOMAIN_GPU_ALL = 4,                                       ///< Overclocking a GPU that has all accelerator assets on the same PLL/VR.
+    ZES_OVERCLOCK_DOMAIN_GPU_RENDER_COMPUTE = 8,                            ///< Overclocking a GPU with render and compute assets on the same PLL/VR.
+    ZES_OVERCLOCK_DOMAIN_GPU_RENDER = 16,                                   ///< Overclocking a GPU with render assets on its own PLL/VR.
+    ZES_OVERCLOCK_DOMAIN_GPU_COMPUTE = 32,                                  ///< Overclocking a GPU with compute assets on its own PLL/VR.
+    ZES_OVERCLOCK_DOMAIN_GPU_MEDIA = 64,                                    ///< Overclocking a GPU with media assets on its own PLL/VR.
+    ZES_OVERCLOCK_DOMAIN_VRAM = 128,                                        ///< Overclocking device local memory.
+    ZES_OVERCLOCK_DOMAIN_ADM = 256,                                         ///< Overclocking LLC/L4 cache.
     ZES_OVERCLOCK_DOMAIN_FORCE_UINT32 = 0x7fffffff
 
 } zes_overclock_domain_t;
@@ -1137,21 +1382,21 @@ typedef enum _zes_overclock_domain_t
 /// @brief Overclock controls.
 typedef enum _zes_overclock_control_t
 {
-    ZES_OVERCLOCK_CONTROL_VF = 1,                   ///< This control permits setting a custom V-F curve.
-    ZES_OVERCLOCK_CONTROL_FREQ_OFFSET = 2,          ///< The V-F curve of the overclock domain can be shifted up or down using
-                                                    ///< this control.
-    ZES_OVERCLOCK_CONTROL_VMAX_OFFSET = 4,          ///< This control is used to increase the permitted voltage above the
-                                                    ///< shipped voltage maximum.
-    ZES_OVERCLOCK_CONTROL_FREQ = 8,                 ///< This control permits direct changes to the operating frequency.
-    ZES_OVERCLOCK_CONTROL_VOLT_LIMIT = 16,          ///< This control prevents frequencies that would push the voltage above
-                                                    ///< this value, typically used by V-F scanners.
-    ZES_OVERCLOCK_CONTROL_POWER_SUSTAINED_LIMIT = 32,   ///< This control changes the sustained power limit (PL1).
-    ZES_OVERCLOCK_CONTROL_POWER_BURST_LIMIT = 64,   ///< This control changes the burst power limit (PL2).
-    ZES_OVERCLOCK_CONTROL_POWER_PEAK_LIMIT = 128,   ///< his control changes the peak power limit (PL4).
-    ZES_OVERCLOCK_CONTROL_ICCMAX_LIMIT = 256,       ///< This control changes the value of IccMax..
-    ZES_OVERCLOCK_CONTROL_TEMP_LIMIT = 512,         ///< This control changes the value of TjMax.
-    ZES_OVERCLOCK_CONTROL_ITD_DISABLE = 1024,       ///< This control permits disabling the adaptive voltage feature ITD
-    ZES_OVERCLOCK_CONTROL_ACM_DISABLE = 2048,       ///< This control permits disabling the adaptive voltage feature ACM.
+    ZES_OVERCLOCK_CONTROL_VF = 1,                                           ///< This control permits setting a custom V-F curve.
+    ZES_OVERCLOCK_CONTROL_FREQ_OFFSET = 2,                                  ///< The V-F curve of the overclock domain can be shifted up or down using
+                                                                            ///< this control.
+    ZES_OVERCLOCK_CONTROL_VMAX_OFFSET = 4,                                  ///< This control is used to increase the permitted voltage above the
+                                                                            ///< shipped voltage maximum.
+    ZES_OVERCLOCK_CONTROL_FREQ = 8,                                         ///< This control permits direct changes to the operating frequency.
+    ZES_OVERCLOCK_CONTROL_VOLT_LIMIT = 16,                                  ///< This control prevents frequencies that would push the voltage above
+                                                                            ///< this value, typically used by V-F scanners.
+    ZES_OVERCLOCK_CONTROL_POWER_SUSTAINED_LIMIT = 32,                       ///< This control changes the sustained power limit (PL1).
+    ZES_OVERCLOCK_CONTROL_POWER_BURST_LIMIT = 64,                           ///< This control changes the burst power limit (PL2).
+    ZES_OVERCLOCK_CONTROL_POWER_PEAK_LIMIT = 128,                           ///< This control changes the peak power limit (PL4).
+    ZES_OVERCLOCK_CONTROL_ICCMAX_LIMIT = 256,                               ///< This control changes the value of IccMax.
+    ZES_OVERCLOCK_CONTROL_TEMP_LIMIT = 512,                                 ///< This control changes the value of TjMax.
+    ZES_OVERCLOCK_CONTROL_ITD_DISABLE = 1024,                               ///< This control permits disabling the adaptive voltage feature ITD
+    ZES_OVERCLOCK_CONTROL_ACM_DISABLE = 2048,                               ///< This control permits disabling the adaptive voltage feature ACM.
     ZES_OVERCLOCK_CONTROL_FORCE_UINT32 = 0x7fffffff
 
 } zes_overclock_control_t;
@@ -1160,12 +1405,12 @@ typedef enum _zes_overclock_control_t
 /// @brief Overclock modes.
 typedef enum _zes_overclock_mode_t
 {
-    ZES_OVERCLOCK_MODE_MODE_OFF = 0,                ///< Overclock mode is off
-    ZES_OVERCLOCK_MODE_MODE_STOCK = 2,              ///< Stock (manufacturing settings) are being used.
-    ZES_OVERCLOCK_MODE_MODE_ON = 3,                 ///< Overclock mode is on.
-    ZES_OVERCLOCK_MODE_MODE_UNAVAILABLE = 4,        ///< Overclocking is unavailable at this time since the system is running
-                                                    ///< on battery.
-    ZES_OVERCLOCK_MODE_MODE_DISABLED = 5,           ///< Overclock mode is disabled.
+    ZES_OVERCLOCK_MODE_MODE_OFF = 0,                                        ///< Overclock mode is off
+    ZES_OVERCLOCK_MODE_MODE_STOCK = 2,                                      ///< Stock (manufacturing settings) are being used.
+    ZES_OVERCLOCK_MODE_MODE_ON = 3,                                         ///< Overclock mode is on.
+    ZES_OVERCLOCK_MODE_MODE_UNAVAILABLE = 4,                                ///< Overclocking is unavailable at this time since the system is running
+                                                                            ///< on battery.
+    ZES_OVERCLOCK_MODE_MODE_DISABLED = 5,                                   ///< Overclock mode is disabled.
     ZES_OVERCLOCK_MODE_FORCE_UINT32 = 0x7fffffff
 
 } zes_overclock_mode_t;
@@ -1174,11 +1419,11 @@ typedef enum _zes_overclock_mode_t
 /// @brief Overclock control states.
 typedef enum _zes_control_state_t
 {
-    ZES_CONTROL_STATE_STATE_UNSET = 0,              ///< No overclock control has not been changed by the driver since the last
-                                                    ///< boot/reset.
-    ZES_CONTROL_STATE_STATE_ACTIVE = 2,             ///< The overclock control has been set and it is active.
-    ZES_CONTROL_STATE_STATE_DISABLED = 3,           ///< The overclock control value has been disabled due to the current power
-                                                    ///< configuration (typically when running on DC).
+    ZES_CONTROL_STATE_STATE_UNSET = 0,                                      ///< No overclock control has been changed by the driver since the last
+                                                                            ///< boot/reset.
+    ZES_CONTROL_STATE_STATE_ACTIVE = 2,                                     ///< The overclock control has been set and it is active.
+    ZES_CONTROL_STATE_STATE_DISABLED = 3,                                   ///< The overclock control value has been disabled due to the current power
+                                                                            ///< configuration (typically when running on DC).
     ZES_CONTROL_STATE_FORCE_UINT32 = 0x7fffffff
 
 } zes_control_state_t;
@@ -1187,11 +1432,11 @@ typedef enum _zes_control_state_t
 /// @brief Overclock pending actions.
 typedef enum _zes_pending_action_t
 {
-    ZES_PENDING_ACTION_PENDING_NONE = 0,            ///< There no pending actions. .
-    ZES_PENDING_ACTION_PENDING_IMMINENT = 1,        ///< The requested change is in progress and should complete soon.
-    ZES_PENDING_ACTION_PENDING_COLD_RESET = 2,      ///< The requested change requires a device cold reset (hotplug, system
-                                                    ///< boot).
-    ZES_PENDING_ACTION_PENDING_WARM_RESET = 3,      ///< The requested change requires a device warm reset (PCIe FLR).
+    ZES_PENDING_ACTION_PENDING_NONE = 0,                                    ///< There are no pending actions.
+    ZES_PENDING_ACTION_PENDING_IMMINENT = 1,                                ///< The requested change is in progress and should complete soon.
+    ZES_PENDING_ACTION_PENDING_COLD_RESET = 2,                              ///< The requested change requires a device cold reset (hotplug, system
+                                                                            ///< boot).
+    ZES_PENDING_ACTION_PENDING_WARM_RESET = 3,                              ///< The requested change requires a device warm reset (PCIe FLR).
     ZES_PENDING_ACTION_FORCE_UINT32 = 0x7fffffff
 
 } zes_pending_action_t;
@@ -1200,13 +1445,13 @@ typedef enum _zes_pending_action_t
 /// @brief Overclock V-F curve programing.
 typedef enum _zes_vf_program_type_t
 {
-    ZES_VF_PROGRAM_TYPE_VF_ARBITRARY = 0,           ///< Can program an arbitrary number of V-F points up to the maximum number
-                                                    ///< and each point can have arbitrary voltage and frequency values within
-                                                    ///< the min/max/step limits
-    ZES_VF_PROGRAM_TYPE_VF_FREQ_FIXED = 1,          ///< Can only program the voltage for the V-F points that it reads back -
-                                                    ///< the frequency of those points cannot be changed
-    ZES_VF_PROGRAM_TYPE_VF_VOLT_FIXED = 2,          ///< Can only program the frequency for the V-F points that is reads back -
-                                                    ///< the voltage of each point cannot be changed.
+    ZES_VF_PROGRAM_TYPE_VF_ARBITRARY = 0,                                   ///< Can program an arbitrary number of V-F points up to the maximum number
+                                                                            ///< and each point can have arbitrary voltage and frequency values within
+                                                                            ///< the min/max/step limits
+    ZES_VF_PROGRAM_TYPE_VF_FREQ_FIXED = 1,                                  ///< Can only program the voltage for the V-F points that it reads back -
+                                                                            ///< the frequency of those points cannot be changed
+    ZES_VF_PROGRAM_TYPE_VF_VOLT_FIXED = 2,                                  ///< Can only program the frequency for the V-F points that it reads back -
+                                                                            ///< the voltage of each point cannot be changed.
     ZES_VF_PROGRAM_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } zes_vf_program_type_t;
@@ -1215,8 +1460,8 @@ typedef enum _zes_vf_program_type_t
 /// @brief VF type
 typedef enum _zes_vf_type_t
 {
-    ZES_VF_TYPE_VOLT = 0,                           ///< VF Voltage point
-    ZES_VF_TYPE_FREQ = 1,                           ///< VF Frequency point
+    ZES_VF_TYPE_VOLT = 0,                                                   ///< VF Voltage point
+    ZES_VF_TYPE_FREQ = 1,                                                   ///< VF Frequency point
     ZES_VF_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } zes_vf_type_t;
@@ -1225,9 +1470,9 @@ typedef enum _zes_vf_type_t
 /// @brief VF type
 typedef enum _zes_vf_array_type_t
 {
-    ZES_VF_ARRAY_TYPE_USER_VF_ARRAY = 0,            ///< User V-F array
-    ZES_VF_ARRAY_TYPE_DEFAULT_VF_ARRAY = 1,         ///< Default V-F array
-    ZES_VF_ARRAY_TYPE_LIVE_VF_ARRAY = 2,            ///< Live V-F array
+    ZES_VF_ARRAY_TYPE_USER_VF_ARRAY = 0,                                    ///< User V-F array
+    ZES_VF_ARRAY_TYPE_DEFAULT_VF_ARRAY = 1,                                 ///< Default V-F array
+    ZES_VF_ARRAY_TYPE_LIVE_VF_ARRAY = 2,                                    ///< Live V-F array
     ZES_VF_ARRAY_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } zes_vf_array_type_t;
@@ -1240,16 +1485,16 @@ typedef enum _zes_vf_array_type_t
 ///       part of the domain.
 typedef struct _zes_overclock_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_overclock_domain_t domainType;              ///< [out] The hardware block that this overclock domain controls (GPU,
-                                                    ///< VRAM, ...)
-    uint32_t AvailableControls;                     ///< [out] Returns the overclock controls that are supported (a bit for
-                                                    ///< each of enum ::zes_overclock_control_t). If no bits are set, the
-                                                    ///< domain doesn't support overclocking.
-    zes_vf_program_type_t VFProgramType;            ///< [out] Type of V-F curve programming that is permitted:.
-    uint32_t NumberOfVFPoints;                      ///< [out] Number of VF points that can be programmed - max_num_points
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_overclock_domain_t domainType;                                      ///< [out] The hardware block that this overclock domain controls (GPU,
+                                                                            ///< VRAM, ...)
+    uint32_t AvailableControls;                                             ///< [out] Returns the overclock controls that are supported (a bit for
+                                                                            ///< each of enum ::zes_overclock_control_t). If no bits are set, the
+                                                                            ///< domain doesn't support overclocking.
+    zes_vf_program_type_t VFProgramType;                                    ///< [out] Type of V-F curve programming that is permitted:.
+    uint32_t NumberOfVFPoints;                                              ///< [out] Number of VF points that can be programmed - max_num_points
 
 } zes_overclock_properties_t;
 
@@ -1261,16 +1506,16 @@ typedef struct _zes_overclock_properties_t
 ///       overclock domain.
 typedef struct _zes_control_property_t
 {
-    double MinValue;                                ///< [out]  This provides information about the limits of the control value
-                                                    ///< so that the driver can calculate the set of valid values.
-    double MaxValue;                                ///< [out]  This provides information about the limits of the control value
-                                                    ///< so that the driver can calculate the set of valid values.
-    double StepValue;                               ///< [out]  This provides information about the limits of the control value
-                                                    ///< so that the driver can calculate the set of valid values.
-    double RefValue;                                ///< [out] The reference value provides the anchor point, UIs can combine
-                                                    ///< this with the user offset request to show the anticipated improvement.
-    double DefaultValue;                            ///< [out] The shipped out-of-box position of this control. Driver can
-                                                    ///< request this value at any time to return to the out-of-box behavior.
+    double MinValue;                                                        ///< [out]  This provides information about the limits of the control value
+                                                                            ///< so that the driver can calculate the set of valid values.
+    double MaxValue;                                                        ///< [out]  This provides information about the limits of the control value
+                                                                            ///< so that the driver can calculate the set of valid values.
+    double StepValue;                                                       ///< [out]  This provides information about the limits of the control value
+                                                                            ///< so that the driver can calculate the set of valid values.
+    double RefValue;                                                        ///< [out] The reference value provides the anchor point, UIs can combine
+                                                                            ///< this with the user offset request to show the anticipated improvement.
+    double DefaultValue;                                                    ///< [out] The shipped out-of-box position of this control. Driver can
+                                                                            ///< request this value at any time to return to the out-of-box behavior.
 
 } zes_control_property_t;
 
@@ -1282,18 +1527,18 @@ typedef struct _zes_control_property_t
 ///       overclock domain.
 typedef struct _zes_vf_property_t
 {
-    double MinFreq;                                 ///< [out] Read the minimum frequency that can be be programmed in the
-                                                    ///< custom V-F point..
-    double MaxFreq;                                 ///< [out] Read the maximum frequency that can be be programmed in the
-                                                    ///< custom V-F point..
-    double StepFreq;                                ///< [out] Read the frequency step that can be be programmed in the custom
-                                                    ///< V-F point..
-    double MinVolt;                                 ///< [out] Read the minimum voltage that can be be programmed in the custom
-                                                    ///< V-F point..
-    double MaxVolt;                                 ///< [out] Read the maximum voltage that can be be programmed in the custom
-                                                    ///< V-F point..
-    double StepVolt;                                ///< [out] Read the voltage step that can be be programmed in the custom
-                                                    ///< V-F point.
+    double MinFreq;                                                         ///< [out] Read the minimum frequency that can be programmed in the
+                                                                            ///< custom V-F point.
+    double MaxFreq;                                                         ///< [out] Read the maximum frequency that can be programmed in the
+                                                                            ///< custom V-F point.
+    double StepFreq;                                                        ///< [out] Read the frequency step that can be programmed in the custom
+                                                                            ///< V-F point.
+    double MinVolt;                                                         ///< [out] Read the minimum voltage that can be programmed in the custom
+                                                                            ///< V-F point.
+    double MaxVolt;                                                         ///< [out] Read the maximum voltage that can be programmed in the custom
+                                                                            ///< V-F point.
+    double StepVolt;                                                        ///< [out] Read the voltage step that can be programmed in the custom
+                                                                            ///< V-F point.
 
 } zes_vf_property_t;
 
@@ -1317,7 +1562,7 @@ typedef struct _zes_vf_property_t
 ///         + This product does not support overclocking
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceSetOverclockWaiver(
-    zes_device_handle_t hDevice                     ///< [in] Sysman handle of the device.
+    zes_device_handle_t hDevice                                             ///< [in] Sysman handle of the device.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1341,10 +1586,10 @@ zesDeviceSetOverclockWaiver(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceGetOverclockDomains(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pOverclockDomains                     ///< [in,out] Returns the overclock domains that are supported (a bit for
-                                                    ///< each of enum ::zes_overclock_domain_t). If no bits are set, the device
-                                                    ///< doesn't support overclocking.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pOverclockDomains                                             ///< [in,out] Returns the overclock domains that are supported (a bit for
+                                                                            ///< each of enum ::zes_overclock_domain_t). If no bits are set, the device
+                                                                            ///< doesn't support overclocking.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1371,11 +1616,11 @@ zesDeviceGetOverclockDomains(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceGetOverclockControls(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    zes_overclock_domain_t domainType,              ///< [in] Domain type.
-    uint32_t* pAvailableControls                    ///< [in,out] Returns the overclock controls that are supported for the
-                                                    ///< specified overclock domain (a bit for each of enum
-                                                    ///< ::zes_overclock_control_t).
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    zes_overclock_domain_t domainType,                                      ///< [in] Domain type.
+    uint32_t* pAvailableControls                                            ///< [in,out] Returns the overclock controls that are supported for the
+                                                                            ///< specified overclock domain (a bit for each of enum
+                                                                            ///< ::zes_overclock_control_t).
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1398,9 +1643,9 @@ zesDeviceGetOverclockControls(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceResetOverclockSettings(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    ze_bool_t onShippedState                        ///< [in] True will reset to shipped state; false will reset to
-                                                    ///< manufacturing state
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    ze_bool_t onShippedState                                                ///< [in] True will reset to shipped state; false will reset to
+                                                                            ///< manufacturing state
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1428,13 +1673,13 @@ zesDeviceResetOverclockSettings(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceReadOverclockState(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    zes_overclock_mode_t* pOverclockMode,           ///< [out] One of overclock mode.
-    ze_bool_t* pWaiverSetting,                      ///< [out] Waiver setting: 0 = Waiver not set, 1 = waiver has been set.
-    ze_bool_t* pOverclockState,                     ///< [out] Current settings 0 =manufacturing state, 1= shipped state)..
-    zes_pending_action_t* pPendingAction,           ///< [out] This enum is returned when the driver attempts to set an
-                                                    ///< overclock control or reset overclock settings.
-    ze_bool_t* pPendingReset                        ///< [out] Pending reset 0 =manufacturing state, 1= shipped state)..
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    zes_overclock_mode_t* pOverclockMode,                                   ///< [out] One of overclock mode.
+    ze_bool_t* pWaiverSetting,                                              ///< [out] Waiver setting: 0 = Waiver not set, 1 = waiver has been set.
+    ze_bool_t* pOverclockState,                                             ///< [out] Current settings (0 = manufacturing state, 1 = shipped state).
+    zes_pending_action_t* pPendingAction,                                   ///< [out] This enum is returned when the driver attempts to set an
+                                                                            ///< overclock control or reset overclock settings.
+    ze_bool_t* pPendingReset                                                ///< [out] Pending reset (0 = manufacturing state, 1 = shipped state).
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1456,18 +1701,18 @@ zesDeviceReadOverclockState(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumOverclockDomains(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_overclock_handle_t* phDomainHandle          ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_overclock_handle_t* phDomainHandle                                  ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1491,8 +1736,8 @@ zesDeviceEnumOverclockDomains(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockGetDomainProperties(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component domain.
-    zes_overclock_properties_t* pDomainProperties   ///< [in,out] The overclock properties for the specified domain.
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component domain.
+    zes_overclock_properties_t* pDomainProperties                           ///< [in,out] The overclock properties for the specified domain.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1516,8 +1761,8 @@ zesOverclockGetDomainProperties(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockGetDomainVFProperties(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component domain.
-    zes_vf_property_t* pVFProperties                ///< [in,out] The VF min,max,step for a specified domain.
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component domain.
+    zes_vf_property_t* pVFProperties                                        ///< [in,out] The VF min,max,step for a specified domain.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1543,9 +1788,9 @@ zesOverclockGetDomainVFProperties(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockGetDomainControlProperties(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component domain.
-    zes_overclock_control_t DomainControl,          ///< [in] Handle for the component.
-    zes_control_property_t* pControlProperties      ///< [in,out] overclock control values.
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component domain.
+    zes_overclock_control_t DomainControl,                                  ///< [in] Handle for the component.
+    zes_control_property_t* pControlProperties                              ///< [in,out] overclock control values.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1571,9 +1816,9 @@ zesOverclockGetDomainControlProperties(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockGetControlCurrentValue(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component.
-    zes_overclock_control_t DomainControl,          ///< [in] Overclock Control.
-    double* pValue                                  ///< [in,out] Getting overclock control value for the specified control.
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component.
+    zes_overclock_control_t DomainControl,                                  ///< [in] Overclock Control.
+    double* pValue                                                          ///< [in,out] Getting overclock control value for the specified control.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1599,10 +1844,10 @@ zesOverclockGetControlCurrentValue(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockGetControlPendingValue(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component domain.
-    zes_overclock_control_t DomainControl,          ///< [in] Overclock Control.
-    double* pValue                                  ///< [out] Returns the pending value for a given control. The units and
-                                                    ///< format of the value depend on the control type.
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component domain.
+    zes_overclock_control_t DomainControl,                                  ///< [in] Overclock Control.
+    double* pValue                                                          ///< [out] Returns the pending value for a given control. The units and
+                                                                            ///< format of the value depend on the control type.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1628,11 +1873,11 @@ zesOverclockGetControlPendingValue(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockSetControlUserValue(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component domain.
-    zes_overclock_control_t DomainControl,          ///< [in] Domain Control.
-    double pValue,                                  ///< [in] The new value of the control. The units and format of the value
-                                                    ///< depend on the control type.
-    zes_pending_action_t* pPendingAction            ///< [out] Pending overclock setting.
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component domain.
+    zes_overclock_control_t DomainControl,                                  ///< [in] Domain Control.
+    double pValue,                                                          ///< [in] The new value of the control. The units and format of the value
+                                                                            ///< depend on the control type.
+    zes_pending_action_t* pPendingAction                                    ///< [out] Pending overclock setting.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1659,10 +1904,10 @@ zesOverclockSetControlUserValue(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockGetControlState(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component domain.
-    zes_overclock_control_t DomainControl,          ///< [in] Domain Control.
-    zes_control_state_t* pControlState,             ///< [out] Current overclock control state.
-    zes_pending_action_t* pPendingAction            ///< [out] Pending overclock setting.
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component domain.
+    zes_overclock_control_t DomainControl,                                  ///< [in] Domain Control.
+    zes_control_state_t* pControlState,                                     ///< [out] Current overclock control state.
+    zes_pending_action_t* pPendingAction                                    ///< [out] Pending overclock setting.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1690,12 +1935,12 @@ zesOverclockGetControlState(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockGetVFPointValues(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component domain.
-    zes_vf_type_t VFType,                           ///< [in] Voltage or Freqency point to read.
-    zes_vf_array_type_t VFArrayType,                ///< [in] User,Default or Live VF array to read from
-    uint32_t PointIndex,                            ///< [in] Point index - number between (0, max_num_points - 1).
-    uint32_t* PointValue                            ///< [out] Returns the frequency in 1kHz units or voltage in millivolt
-                                                    ///< units from the custom V-F curve at the specified zero-based index 
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component domain.
+    zes_vf_type_t VFType,                                                   ///< [in] Voltage or Frequency point to read.
+    zes_vf_array_type_t VFArrayType,                                        ///< [in] User,Default or Live VF array to read from
+    uint32_t PointIndex,                                                    ///< [in] Point index - number between (0, max_num_points - 1).
+    uint32_t* PointValue                                                    ///< [out] Returns the frequency in 1kHz units or voltage in millivolt
+                                                                            ///< units from the custom V-F curve at the specified zero-based index 
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1719,11 +1964,11 @@ zesOverclockGetVFPointValues(
 ///         + Overclocking is not supported on this control domain
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesOverclockSetVFPointValues(
-    zes_overclock_handle_t hDomainHandle,           ///< [in] Handle for the component domain.
-    zes_vf_type_t VFType,                           ///< [in] Voltage or Freqency point to read.
-    uint32_t PointIndex,                            ///< [in] Point index - number between (0, max_num_points - 1).
-    uint32_t PointValue                             ///< [in] Writes frequency in 1kHz units or voltage in millivolt units to
-                                                    ///< custom V-F curve at the specified zero-based index 
+    zes_overclock_handle_t hDomainHandle,                                   ///< [in] Handle for the component domain.
+    zes_vf_type_t VFType,                                                   ///< [in] Voltage or Frequency point to read.
+    uint32_t PointIndex,                                                    ///< [in] Point index - number between (0, max_num_points - 1).
+    uint32_t PointValue                                                     ///< [in] Writes frequency in 1kHz units or voltage in millivolt units to
+                                                                            ///< custom V-F curve at the specified zero-based index 
     );
 
 #if !defined(__GNUC__)
@@ -1737,11 +1982,11 @@ zesOverclockSetVFPointValues(
 /// @brief Diagnostic results
 typedef enum _zes_diag_result_t
 {
-    ZES_DIAG_RESULT_NO_ERRORS = 0,                  ///< Diagnostic completed without finding errors to repair
-    ZES_DIAG_RESULT_ABORT = 1,                      ///< Diagnostic had problems running tests
-    ZES_DIAG_RESULT_FAIL_CANT_REPAIR = 2,           ///< Diagnostic had problems setting up repairs
-    ZES_DIAG_RESULT_REBOOT_FOR_REPAIR = 3,          ///< Diagnostics found errors, setup for repair and reboot is required to
-                                                    ///< complete the process
+    ZES_DIAG_RESULT_NO_ERRORS = 0,                                          ///< Diagnostic completed without finding errors to repair
+    ZES_DIAG_RESULT_ABORT = 1,                                              ///< Diagnostic had problems running tests
+    ZES_DIAG_RESULT_FAIL_CANT_REPAIR = 2,                                   ///< Diagnostic had problems setting up repairs
+    ZES_DIAG_RESULT_REBOOT_FOR_REPAIR = 3,                                  ///< Diagnostics found errors, setup for repair and reboot is required to
+                                                                            ///< complete the process
     ZES_DIAG_RESULT_FORCE_UINT32 = 0x7fffffff
 
 } zes_diag_result_t;
@@ -1762,8 +2007,8 @@ typedef enum _zes_diag_result_t
 /// @brief Diagnostic test
 typedef struct _zes_diag_test_t
 {
-    uint32_t index;                                 ///< [out] Index of the test
-    char name[ZES_STRING_PROPERTY_SIZE];            ///< [out] Name of the test
+    uint32_t index;                                                         ///< [out] Index of the test
+    char name[ZES_STRING_PROPERTY_SIZE];                                    ///< [out] Name of the test
 
 } zes_diag_test_t;
 
@@ -1771,16 +2016,16 @@ typedef struct _zes_diag_test_t
 /// @brief Diagnostics test suite properties
 typedef struct _zes_diag_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t onSubdevice;                          ///< [out] True if the resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    char name[ZES_STRING_PROPERTY_SIZE];            ///< [out] Name of the diagnostics test suite
-    ze_bool_t haveTests;                            ///< [out] Indicates if this test suite has individual tests which can be
-                                                    ///< run separately (use the function ::zesDiagnosticsGetTests() to get the
-                                                    ///< list of these tests)
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    char name[ZES_STRING_PROPERTY_SIZE];                                    ///< [out] Name of the diagnostics test suite
+    ze_bool_t haveTests;                                                    ///< [out] Indicates if this test suite has individual tests which can be
+                                                                            ///< run separately (use the function ::zesDiagnosticsGetTests() to get the
+                                                                            ///< list of these tests)
 
 } zes_diag_properties_t;
 
@@ -1803,18 +2048,18 @@ typedef struct _zes_diag_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumDiagnosticTestSuites(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_diag_handle_t* phDiagnostics                ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_diag_handle_t* phDiagnostics                                        ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1836,19 +2081,19 @@ zesDeviceEnumDiagnosticTestSuites(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDiagnosticsGetProperties(
-    zes_diag_handle_t hDiagnostics,                 ///< [in] Handle for the component.
-    zes_diag_properties_t* pProperties              ///< [in,out] Structure describing the properties of a diagnostics test
-                                                    ///< suite
+    zes_diag_handle_t hDiagnostics,                                         ///< [in] Handle for the component.
+    zes_diag_properties_t* pProperties                                      ///< [in,out] Structure describing the properties of a diagnostics test
+                                                                            ///< suite
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Get individual tests that can be run separately. Not all test suites
-///        permit running individual tests - check
-///        ::zes_diag_properties_t.haveTests
+///        permit running individual tests, check the `haveTests` member of
+///        ::zes_diag_properties_t.
 /// 
 /// @details
 ///     - The list of available tests is returned in order of increasing test
-///       index ::zes_diag_test_t.index.
+///       index (see the `index` member of ::zes_diag_test_t).
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
 /// 
@@ -1864,16 +2109,16 @@ zesDiagnosticsGetProperties(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDiagnosticsGetTests(
-    zes_diag_handle_t hDiagnostics,                 ///< [in] Handle for the component.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of tests.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of tests that are available.
-                                                    ///< if count is greater than the number of tests that are available, then
-                                                    ///< the driver shall update the value with the correct number of tests.
-    zes_diag_test_t* pTests                         ///< [in,out][optional][range(0, *pCount)] array of information about
-                                                    ///< individual tests sorted by increasing value of ::zes_diag_test_t.index.
-                                                    ///< if count is less than the number of tests that are available, then the
-                                                    ///< driver shall only retrieve that number of tests.
+    zes_diag_handle_t hDiagnostics,                                         ///< [in] Handle for the component.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of tests.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of tests that are available.
+                                                                            ///< if count is greater than the number of tests that are available, then
+                                                                            ///< the driver shall update the value with the correct number of tests.
+    zes_diag_test_t* pTests                                                 ///< [in,out][optional][range(0, *pCount)] array of information about
+                                                                            ///< individual tests sorted by increasing value of the `index` member of ::zes_diag_test_t.
+                                                                            ///< if count is less than the number of tests that are available, then the
+                                                                            ///< driver shall only retrieve that number of tests.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1884,8 +2129,8 @@ zesDiagnosticsGetTests(
 ///       information. Gracefully close any running workloads before initiating.
 ///     - To run all tests in a test suite, set start =
 ///       ::ZES_DIAG_FIRST_TEST_INDEX and end = ::ZES_DIAG_LAST_TEST_INDEX.
-///     - If the test suite permits running individual tests,
-///       ::zes_diag_properties_t.haveTests will be true. In this case, the
+///     - If the test suite permits running individual tests, the `haveTests`
+///       member of ::zes_diag_properties_t will be true. In this case, the
 ///       function ::zesDiagnosticsGetTests() can be called to get the list of
 ///       tests and corresponding indices that can be supplied to the arguments
 ///       start and end in this function.
@@ -1906,12 +2151,12 @@ zesDiagnosticsGetTests(
 ///         + User does not have permissions to perform diagnostics.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDiagnosticsRunTests(
-    zes_diag_handle_t hDiagnostics,                 ///< [in] Handle for the component.
-    uint32_t startIndex,                            ///< [in] The index of the first test to run. Set to
-                                                    ///< ::ZES_DIAG_FIRST_TEST_INDEX to start from the beginning.
-    uint32_t endIndex,                              ///< [in] The index of the last test to run. Set to
-                                                    ///< ::ZES_DIAG_LAST_TEST_INDEX to complete all tests after the start test.
-    zes_diag_result_t* pResult                      ///< [in,out] The result of the diagnostics
+    zes_diag_handle_t hDiagnostics,                                         ///< [in] Handle for the component.
+    uint32_t startIndex,                                                    ///< [in] The index of the first test to run. Set to
+                                                                            ///< ::ZES_DIAG_FIRST_TEST_INDEX to start from the beginning.
+    uint32_t endIndex,                                                      ///< [in] The index of the last test to run. Set to
+                                                                            ///< ::ZES_DIAG_LAST_TEST_INDEX to complete all tests after the start test.
+    zes_diag_result_t* pResult                                              ///< [in,out] The result of the diagnostics
     );
 
 #if !defined(__GNUC__)
@@ -1925,9 +2170,9 @@ zesDiagnosticsRunTests(
 /// @brief ECC State
 typedef enum _zes_device_ecc_state_t
 {
-    ZES_DEVICE_ECC_STATE_UNAVAILABLE = 0,           ///< None
-    ZES_DEVICE_ECC_STATE_ENABLED = 1,               ///< ECC enabled.
-    ZES_DEVICE_ECC_STATE_DISABLED = 2,              ///< ECC disabled.
+    ZES_DEVICE_ECC_STATE_UNAVAILABLE = 0,                                   ///< None
+    ZES_DEVICE_ECC_STATE_ENABLED = 1,                                       ///< ECC enabled.
+    ZES_DEVICE_ECC_STATE_DISABLED = 2,                                      ///< ECC disabled.
     ZES_DEVICE_ECC_STATE_FORCE_UINT32 = 0x7fffffff
 
 } zes_device_ecc_state_t;
@@ -1936,10 +2181,10 @@ typedef enum _zes_device_ecc_state_t
 /// @brief State Change Requirements
 typedef enum _zes_device_action_t
 {
-    ZES_DEVICE_ACTION_NONE = 0,                     ///< No action.
-    ZES_DEVICE_ACTION_WARM_CARD_RESET = 1,          ///< Warm reset of the card.
-    ZES_DEVICE_ACTION_COLD_CARD_RESET = 2,          ///< Cold reset of the card.
-    ZES_DEVICE_ACTION_COLD_SYSTEM_REBOOT = 3,       ///< Cold reboot of the system.
+    ZES_DEVICE_ACTION_NONE = 0,                                             ///< No action.
+    ZES_DEVICE_ACTION_WARM_CARD_RESET = 1,                                  ///< Warm reset of the card.
+    ZES_DEVICE_ACTION_COLD_CARD_RESET = 2,                                  ///< Cold reset of the card.
+    ZES_DEVICE_ACTION_COLD_SYSTEM_REBOOT = 3,                               ///< Cold reboot of the system.
     ZES_DEVICE_ACTION_FORCE_UINT32 = 0x7fffffff
 
 } zes_device_action_t;
@@ -1948,10 +2193,10 @@ typedef enum _zes_device_action_t
 /// @brief ECC State Descriptor
 typedef struct _zes_device_ecc_desc_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_device_ecc_state_t state;                   ///< [out] ECC state
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_device_ecc_state_t state;                                           ///< [out] ECC state
 
 } zes_device_ecc_desc_t;
 
@@ -1959,12 +2204,12 @@ typedef struct _zes_device_ecc_desc_t
 /// @brief ECC State
 typedef struct _zes_device_ecc_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_device_ecc_state_t currentState;            ///< [out] Current ECC state
-    zes_device_ecc_state_t pendingState;            ///< [out] Pending ECC state
-    zes_device_action_t pendingAction;              ///< [out] Pending action
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_device_ecc_state_t currentState;                                    ///< [out] Current ECC state
+    zes_device_ecc_state_t pendingState;                                    ///< [out] Pending ECC state
+    zes_device_action_t pendingAction;                                      ///< [out] Pending action
 
 } zes_device_ecc_properties_t;
 
@@ -1987,8 +2232,8 @@ typedef struct _zes_device_ecc_properties_t
 ///         + `nullptr == pAvailable`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEccAvailable(
-    zes_device_handle_t hDevice,                    ///< [in] Handle for the component.
-    ze_bool_t* pAvailable                           ///< [out] ECC functionality is available (true)/unavailable (false).
+    zes_device_handle_t hDevice,                                            ///< [in] Handle for the component.
+    ze_bool_t* pAvailable                                                   ///< [out] ECC functionality is available (true)/unavailable (false).
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2010,8 +2255,8 @@ zesDeviceEccAvailable(
 ///         + `nullptr == pConfigurable`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEccConfigurable(
-    zes_device_handle_t hDevice,                    ///< [in] Handle for the component.
-    ze_bool_t* pConfigurable                        ///< [out] ECC can be enabled/disabled (true)/enabled/disabled (false).
+    zes_device_handle_t hDevice,                                            ///< [in] Handle for the component.
+    ze_bool_t* pConfigurable                                                ///< [out] ECC can be enabled/disabled (true) or cannot be changed (false).
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2033,8 +2278,8 @@ zesDeviceEccConfigurable(
 ///         + `nullptr == pState`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceGetEccState(
-    zes_device_handle_t hDevice,                    ///< [in] Handle for the component.
-    zes_device_ecc_properties_t* pState             ///< [out] ECC state, pending state, and pending action for state change.
+    zes_device_handle_t hDevice,                                            ///< [in] Handle for the component.
+    zes_device_ecc_properties_t* pState                                     ///< [out] ECC state, pending state, and pending action for state change.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2063,9 +2308,9 @@ zesDeviceGetEccState(
 ///         + User must look at the pendingAction attribute of pState & perform the action required to complete the ECC state change.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceSetEccState(
-    zes_device_handle_t hDevice,                    ///< [in] Handle for the component.
-    const zes_device_ecc_desc_t* newState,          ///< [in] Pointer to desired ECC state.
-    zes_device_ecc_properties_t* pState             ///< [out] ECC state, pending state, and pending action for state change.
+    zes_device_handle_t hDevice,                                            ///< [in] Handle for the component.
+    const zes_device_ecc_desc_t* newState,                                  ///< [in] Pointer to desired ECC state.
+    zes_device_ecc_properties_t* pState                                     ///< [out] ECC state, pending state, and pending action for state change.
     );
 
 #if !defined(__GNUC__)
@@ -2079,52 +2324,44 @@ zesDeviceSetEccState(
 /// @brief Accelerator engine groups
 typedef enum _zes_engine_group_t
 {
-    ZES_ENGINE_GROUP_ALL = 0,                       ///< Access information about all engines combined.
-    ZES_ENGINE_GROUP_COMPUTE_ALL = 1,               ///< Access information about all compute engines combined. Compute engines
-                                                    ///< can only process compute kernels (no 3D content).
-    ZES_ENGINE_GROUP_MEDIA_ALL = 2,                 ///< Access information about all media engines combined.
-    ZES_ENGINE_GROUP_COPY_ALL = 3,                  ///< Access information about all copy (blitter) engines combined.
-    ZES_ENGINE_GROUP_COMPUTE_SINGLE = 4,            ///< Access information about a single compute engine - this is an engine
-                                                    ///< that can process compute kernels. Note that single engines may share
-                                                    ///< the same underlying accelerator resources as other engines so activity
-                                                    ///< of such an engine may not be indicative of the underlying resource
-                                                    ///< utilization - use ::ZES_ENGINE_GROUP_3D_RENDER_COMPUTE_ALL for that.
-    ZES_ENGINE_GROUP_RENDER_SINGLE = 5,             ///< Access information about a single render engine - this is an engine
-                                                    ///< that can process both 3D content and compute kernels. Note that single
-                                                    ///< engines may share the same underlying accelerator resources as other
-                                                    ///< engines so activity of such an engine may not be indicative of the
-                                                    ///< underlying resource utilization - use
-                                                    ///< ::ZES_ENGINE_GROUP_3D_RENDER_COMPUTE_ALL for that.
-    ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE = 6,       ///< Access information about a single media decode engine. Note that
-                                                    ///< single engines may share the same underlying accelerator resources as
-                                                    ///< other engines so activity of such an engine may not be indicative of
-                                                    ///< the underlying resource utilization - use ::ZES_ENGINE_GROUP_MEDIA_ALL
-                                                    ///< for that.
-    ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE = 7,       ///< Access information about a single media encode engine. Note that
-                                                    ///< single engines may share the same underlying accelerator resources as
-                                                    ///< other engines so activity of such an engine may not be indicative of
-                                                    ///< the underlying resource utilization - use ::ZES_ENGINE_GROUP_MEDIA_ALL
-                                                    ///< for that.
-    ZES_ENGINE_GROUP_COPY_SINGLE = 8,               ///< Access information about a single media encode engine. Note that
-                                                    ///< single engines may share the same underlying accelerator resources as
-                                                    ///< other engines so activity of such an engine may not be indicative of
-                                                    ///< the underlying resource utilization - use ::ZES_ENGINE_GROUP_COPY_ALL
-                                                    ///< for that.
-    ZES_ENGINE_GROUP_MEDIA_ENHANCEMENT_SINGLE = 9,  ///< Access information about a single media enhancement engine. Note that
-                                                    ///< single engines may share the same underlying accelerator resources as
-                                                    ///< other engines so activity of such an engine may not be indicative of
-                                                    ///< the underlying resource utilization - use ::ZES_ENGINE_GROUP_MEDIA_ALL
-                                                    ///< for that.
-    ZES_ENGINE_GROUP_3D_SINGLE = 10,                ///< Access information about a single 3D engine - this is an engine that
-                                                    ///< can process 3D content only. Note that single engines may share the
-                                                    ///< same underlying accelerator resources as other engines so activity of
-                                                    ///< such an engine may not be indicative of the underlying resource
-                                                    ///< utilization - use ::ZES_ENGINE_GROUP_3D_RENDER_COMPUTE_ALL for that.
-    ZES_ENGINE_GROUP_3D_RENDER_COMPUTE_ALL = 11,    ///< Access information about all 3D/render/compute engines combined.
-    ZES_ENGINE_GROUP_RENDER_ALL = 12,               ///< Access information about all render engines combined. Render engines
-                                                    ///< are those than process both 3D content and compute kernels.
-    ZES_ENGINE_GROUP_3D_ALL = 13,                   ///< Access information about all 3D engines combined. 3D engines can
-                                                    ///< process 3D content only (no compute kernels).
+    ZES_ENGINE_GROUP_ALL = 0,                                               ///< Access information about all engines combined.
+    ZES_ENGINE_GROUP_COMPUTE_ALL = 1,                                       ///< Access information about all compute engines combined. Compute engines
+                                                                            ///< can only process compute kernels (no 3D content).
+    ZES_ENGINE_GROUP_MEDIA_ALL = 2,                                         ///< Access information about all media engines combined.
+    ZES_ENGINE_GROUP_COPY_ALL = 3,                                          ///< Access information about all copy (blitter) engines combined.
+    ZES_ENGINE_GROUP_COMPUTE_SINGLE = 4,                                    ///< Access information about a single compute engine - this is an engine
+                                                                            ///< that can process compute kernels. Note that single engines may share
+                                                                            ///< the same underlying accelerator resources as other engines so activity
+                                                                            ///< of such an engine may not be indicative of the underlying resource
+                                                                            ///< utilization - use ::ZES_ENGINE_GROUP_3D_RENDER_COMPUTE_ALL for that.
+    ZES_ENGINE_GROUP_RENDER_SINGLE = 5,                                     ///< Access information about a single render engine - this is an engine
+                                                                            ///< that can process both 3D content and compute kernels. Note that single
+                                                                            ///< engines may share the same underlying accelerator resources as other
+                                                                            ///< engines so activity of such an engine may not be indicative of the
+                                                                            ///< underlying resource utilization - use
+                                                                            ///< ::ZES_ENGINE_GROUP_3D_RENDER_COMPUTE_ALL for that.
+    ZES_ENGINE_GROUP_MEDIA_DECODE_SINGLE = 6,                               ///< [DEPRECATED] No longer supported.
+    ZES_ENGINE_GROUP_MEDIA_ENCODE_SINGLE = 7,                               ///< [DEPRECATED] No longer supported.
+    ZES_ENGINE_GROUP_COPY_SINGLE = 8,                                       ///< Access information about a single copy (blitter) engine. Note that
+                                                                            ///< single engines may share the same underlying accelerator resources as
+                                                                            ///< other engines so activity of such an engine may not be indicative of
+                                                                            ///< the underlying resource utilization - use ::ZES_ENGINE_GROUP_COPY_ALL
+                                                                            ///< for that.
+    ZES_ENGINE_GROUP_MEDIA_ENHANCEMENT_SINGLE = 9,                          ///< Access information about a single media enhancement engine. Note that
+                                                                            ///< single engines may share the same underlying accelerator resources as
+                                                                            ///< other engines so activity of such an engine may not be indicative of
+                                                                            ///< the underlying resource utilization - use ::ZES_ENGINE_GROUP_MEDIA_ALL
+                                                                            ///< for that.
+    ZES_ENGINE_GROUP_3D_SINGLE = 10,                                        ///< [DEPRECATED] No longer supported.
+    ZES_ENGINE_GROUP_3D_RENDER_COMPUTE_ALL = 11,                            ///< [DEPRECATED] No longer supported.
+    ZES_ENGINE_GROUP_RENDER_ALL = 12,                                       ///< Access information about all render engines combined. Render engines
+                                                                            ///< are those that process both 3D content and compute kernels.
+    ZES_ENGINE_GROUP_3D_ALL = 13,                                           ///< [DEPRECATED] No longer supported.
+    ZES_ENGINE_GROUP_MEDIA_CODEC_SINGLE = 14,                               ///< Access information about a single media engine. Note that single
+                                                                            ///< engines may share the same underlying accelerator resources as other
+                                                                            ///< engines so activity of such an engine may not be indicative of the
+                                                                            ///< underlying resource utilization - use ::ZES_ENGINE_GROUP_MEDIA_ALL for
+                                                                            ///< that.
     ZES_ENGINE_GROUP_FORCE_UINT32 = 0x7fffffff
 
 } zes_engine_group_t;
@@ -2133,13 +2370,13 @@ typedef enum _zes_engine_group_t
 /// @brief Engine group properties
 typedef struct _zes_engine_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_engine_group_t type;                        ///< [out] The engine group
-    ze_bool_t onSubdevice;                          ///< [out] True if this resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_engine_group_t type;                                                ///< [out] The engine group
+    ze_bool_t onSubdevice;                                                  ///< [out] True if this resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
 
 } zes_engine_properties_t;
 
@@ -2152,16 +2389,16 @@ typedef struct _zes_engine_properties_t
 ///       (s2.timestamp - s1.timestamp)
 typedef struct _zes_engine_stats_t
 {
-    uint64_t activeTime;                            ///< [out] Monotonic counter for time in microseconds that this resource is
-                                                    ///< actively running workloads.
-    uint64_t timestamp;                             ///< [out] Monotonic timestamp counter in microseconds when activeTime
-                                                    ///< counter was sampled.
-                                                    ///< This timestamp should only be used to calculate delta time between
-                                                    ///< snapshots of this structure.
-                                                    ///< Never take the delta of this timestamp with the timestamp from a
-                                                    ///< different structure since they are not guaranteed to have the same base.
-                                                    ///< The absolute value of the timestamp is only valid during within the
-                                                    ///< application and may be different on the next execution.
+    uint64_t activeTime;                                                    ///< [out] Monotonic counter where the resource is actively running workloads.
+                                                                            ///< Time units are implementation specific since the activeTime value is
+                                                                            ///< only intended for calculating utilization percentage as noted above.
+    uint64_t timestamp;                                                     ///< [out] Monotonic counter when activeTime counter was sampled.
+                                                                            ///< This timestamp should only be used to calculate delta between
+                                                                            ///< snapshots of this structure.
+                                                                            ///< Never take the delta of this timestamp with the timestamp from a
+                                                                            ///< different structure since they are not guaranteed to have the same base.
+                                                                            ///< The absolute value of the timestamp is only valid during within the
+                                                                            ///< application and may be different on the next execution.
 
 } zes_engine_stats_t;
 
@@ -2184,18 +2421,18 @@ typedef struct _zes_engine_stats_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumEngineGroups(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_engine_handle_t* phEngine                   ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_engine_handle_t* phEngine                                           ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2217,14 +2454,16 @@ zesDeviceEnumEngineGroups(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesEngineGetProperties(
-    zes_engine_handle_t hEngine,                    ///< [in] Handle for the component.
-    zes_engine_properties_t* pProperties            ///< [in,out] The properties for the specified engine group.
+    zes_engine_handle_t hEngine,                                            ///< [in] Handle for the component.
+    zes_engine_properties_t* pProperties                                    ///< [in,out] The properties for the specified engine group.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
-/// @brief Get the activity stats for an engine group
+/// @brief Get the activity stats for an engine group.
 /// 
 /// @details
+///     - This function also returns the engine activity inside a Virtual
+///       Machine (VM), in the presence of hardware virtualization (SRIOV)
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
 /// 
@@ -2240,9 +2479,9 @@ zesEngineGetProperties(
 ///         + `nullptr == pStats`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesEngineGetActivity(
-    zes_engine_handle_t hEngine,                    ///< [in] Handle for the component.
-    zes_engine_stats_t* pStats                      ///< [in,out] Will contain a snapshot of the engine group activity
-                                                    ///< counters.
+    zes_engine_handle_t hEngine,                                            ///< [in] Handle for the component.
+    zes_engine_stats_t* pStats                                              ///< [in,out] Will contain a snapshot of the engine group activity
+                                                                            ///< counters.
     );
 
 #if !defined(__GNUC__)
@@ -2257,33 +2496,33 @@ zesEngineGetActivity(
 typedef uint32_t zes_event_type_flags_t;
 typedef enum _zes_event_type_flag_t
 {
-    ZES_EVENT_TYPE_FLAG_DEVICE_DETACH = ZE_BIT(0),  ///< Event is triggered when the device is no longer available (due to a
-                                                    ///< reset or being disabled).
-    ZES_EVENT_TYPE_FLAG_DEVICE_ATTACH = ZE_BIT(1),  ///< Event is triggered after the device is available again.
-    ZES_EVENT_TYPE_FLAG_DEVICE_SLEEP_STATE_ENTER = ZE_BIT(2),   ///< Event is triggered when the driver is about to put the device into a
-                                                    ///< deep sleep state
-    ZES_EVENT_TYPE_FLAG_DEVICE_SLEEP_STATE_EXIT = ZE_BIT(3),///< Event is triggered when the driver is waking the device up from a deep
-                                                    ///< sleep state
-    ZES_EVENT_TYPE_FLAG_FREQ_THROTTLED = ZE_BIT(4), ///< Event is triggered when the frequency starts being throttled
-    ZES_EVENT_TYPE_FLAG_ENERGY_THRESHOLD_CROSSED = ZE_BIT(5),   ///< Event is triggered when the energy consumption threshold is reached
-                                                    ///< (use ::zesPowerSetEnergyThreshold() to configure).
-    ZES_EVENT_TYPE_FLAG_TEMP_CRITICAL = ZE_BIT(6),  ///< Event is triggered when the critical temperature is reached (use
-                                                    ///< ::zesTemperatureSetConfig() to configure - disabled by default).
-    ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD1 = ZE_BIT(7),///< Event is triggered when the temperature crosses threshold 1 (use
-                                                    ///< ::zesTemperatureSetConfig() to configure - disabled by default).
-    ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD2 = ZE_BIT(8),///< Event is triggered when the temperature crosses threshold 2 (use
-                                                    ///< ::zesTemperatureSetConfig() to configure - disabled by default).
-    ZES_EVENT_TYPE_FLAG_MEM_HEALTH = ZE_BIT(9),     ///< Event is triggered when the health of device memory changes.
-    ZES_EVENT_TYPE_FLAG_FABRIC_PORT_HEALTH = ZE_BIT(10),///< Event is triggered when the health of fabric ports change.
-    ZES_EVENT_TYPE_FLAG_PCI_LINK_HEALTH = ZE_BIT(11),   ///< Event is triggered when the health of the PCI link changes.
-    ZES_EVENT_TYPE_FLAG_RAS_CORRECTABLE_ERRORS = ZE_BIT(12),///< Event is triggered when accelerator RAS correctable errors cross
-                                                    ///< thresholds (use ::zesRasSetConfig() to configure - disabled by
-                                                    ///< default).
-    ZES_EVENT_TYPE_FLAG_RAS_UNCORRECTABLE_ERRORS = ZE_BIT(13),  ///< Event is triggered when accelerator RAS uncorrectable errors cross
-                                                    ///< thresholds (use ::zesRasSetConfig() to configure - disabled by
-                                                    ///< default).
-    ZES_EVENT_TYPE_FLAG_DEVICE_RESET_REQUIRED = ZE_BIT(14), ///< Event is triggered when the device needs to be reset (use
-                                                    ///< ::zesDeviceGetState() to determine the reasons for the reset).
+    ZES_EVENT_TYPE_FLAG_DEVICE_DETACH = ZE_BIT(0),                          ///< Event is triggered when the device is no longer available (due to a
+                                                                            ///< reset or being disabled).
+    ZES_EVENT_TYPE_FLAG_DEVICE_ATTACH = ZE_BIT(1),                          ///< Event is triggered after the device is available again.
+    ZES_EVENT_TYPE_FLAG_DEVICE_SLEEP_STATE_ENTER = ZE_BIT(2),               ///< Event is triggered when the driver is about to put the device into a
+                                                                            ///< deep sleep state
+    ZES_EVENT_TYPE_FLAG_DEVICE_SLEEP_STATE_EXIT = ZE_BIT(3),                ///< Event is triggered when the driver is waking the device up from a deep
+                                                                            ///< sleep state
+    ZES_EVENT_TYPE_FLAG_FREQ_THROTTLED = ZE_BIT(4),                         ///< Event is triggered when the frequency starts being throttled
+    ZES_EVENT_TYPE_FLAG_ENERGY_THRESHOLD_CROSSED = ZE_BIT(5),               ///< Event is triggered when the energy consumption threshold is reached
+                                                                            ///< (use ::zesPowerSetEnergyThreshold() to configure).
+    ZES_EVENT_TYPE_FLAG_TEMP_CRITICAL = ZE_BIT(6),                          ///< Event is triggered when the critical temperature is reached (use
+                                                                            ///< ::zesTemperatureSetConfig() to configure - disabled by default).
+    ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD1 = ZE_BIT(7),                        ///< Event is triggered when the temperature crosses threshold 1 (use
+                                                                            ///< ::zesTemperatureSetConfig() to configure - disabled by default).
+    ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD2 = ZE_BIT(8),                        ///< Event is triggered when the temperature crosses threshold 2 (use
+                                                                            ///< ::zesTemperatureSetConfig() to configure - disabled by default).
+    ZES_EVENT_TYPE_FLAG_MEM_HEALTH = ZE_BIT(9),                             ///< Event is triggered when the health of device memory changes.
+    ZES_EVENT_TYPE_FLAG_FABRIC_PORT_HEALTH = ZE_BIT(10),                    ///< Event is triggered when the health of fabric ports change.
+    ZES_EVENT_TYPE_FLAG_PCI_LINK_HEALTH = ZE_BIT(11),                       ///< Event is triggered when the health of the PCI link changes.
+    ZES_EVENT_TYPE_FLAG_RAS_CORRECTABLE_ERRORS = ZE_BIT(12),                ///< Event is triggered when accelerator RAS correctable errors cross
+                                                                            ///< thresholds (use ::zesRasSetConfig() to configure - disabled by
+                                                                            ///< default).
+    ZES_EVENT_TYPE_FLAG_RAS_UNCORRECTABLE_ERRORS = ZE_BIT(13),              ///< Event is triggered when accelerator RAS uncorrectable errors cross
+                                                                            ///< thresholds (use ::zesRasSetConfig() to configure - disabled by
+                                                                            ///< default).
+    ZES_EVENT_TYPE_FLAG_DEVICE_RESET_REQUIRED = ZE_BIT(14),                 ///< Event is triggered when the device needs to be reset (use
+                                                                            ///< ::zesDeviceGetState() to determine the reasons for the reset).
     ZES_EVENT_TYPE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_event_type_flag_t;
@@ -2307,8 +2546,8 @@ typedef enum _zes_event_type_flag_t
 ///         + `0x7fff < events`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEventRegister(
-    zes_device_handle_t hDevice,                    ///< [in] The device handle.
-    zes_event_type_flags_t events                   ///< [in] List of events to listen to.
+    zes_device_handle_t hDevice,                                            ///< [in] The device handle.
+    zes_event_type_flags_t events                                           ///< [in] List of events to listen to.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2336,24 +2575,24 @@ zesDeviceEventRegister(
 ///         + One or more of the supplied device handles belongs to a different driver.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDriverEventListen(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    uint32_t timeout,                               ///< [in] if non-zero, then indicates the maximum time (in milliseconds) to
-                                                    ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
-                                                    ///< if zero, then will check status and return immediately;
-                                                    ///< if UINT32_MAX, then function will not return until events arrive.
-    uint32_t count,                                 ///< [in] Number of device handles in phDevices.
-    zes_device_handle_t* phDevices,                 ///< [in][range(0, count)] Device handles to listen to for events. Only
-                                                    ///< devices from the provided driver handle can be specified in this list.
-    uint32_t* pNumDeviceEvents,                     ///< [in,out] Will contain the actual number of devices in phDevices that
-                                                    ///< generated events. If non-zero, check pEvents to determine the devices
-                                                    ///< and events that were received.
-    zes_event_type_flags_t* pEvents                 ///< [in,out] An array that will continue the list of events for each
-                                                    ///< device listened in phDevices.
-                                                    ///< This array must be at least as big as count.
-                                                    ///< For every device handle in phDevices, this will provide the events
-                                                    ///< that occurred for that device at the same position in this array. If
-                                                    ///< no event was received for a given device, the corresponding array
-                                                    ///< entry will be zero.
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    uint32_t timeout,                                                       ///< [in] if non-zero, then indicates the maximum time (in milliseconds) to
+                                                                            ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
+                                                                            ///< if zero, then will check status and return immediately;
+                                                                            ///< if `UINT32_MAX`, then function will not return until events arrive.
+    uint32_t count,                                                         ///< [in] Number of device handles in phDevices.
+    zes_device_handle_t* phDevices,                                         ///< [in][range(0, count)] Device handles to listen to for events. Only
+                                                                            ///< devices from the provided driver handle can be specified in this list.
+    uint32_t* pNumDeviceEvents,                                             ///< [in,out] Will contain the actual number of devices in phDevices that
+                                                                            ///< generated events. If non-zero, check pEvents to determine the devices
+                                                                            ///< and events that were received.
+    zes_event_type_flags_t* pEvents                                         ///< [in,out] An array that will continue the list of events for each
+                                                                            ///< device listened in phDevices.
+                                                                            ///< This array must be at least as big as count.
+                                                                            ///< For every device handle in phDevices, this will provide the events
+                                                                            ///< that occurred for that device at the same position in this array. If
+                                                                            ///< no event was received for a given device, the corresponding array
+                                                                            ///< entry will be zero.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2381,24 +2620,24 @@ zesDriverEventListen(
 ///         + One or more of the supplied device handles belongs to a different driver.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDriverEventListenEx(
-    ze_driver_handle_t hDriver,                     ///< [in] handle of the driver instance
-    uint64_t timeout,                               ///< [in] if non-zero, then indicates the maximum time (in milliseconds) to
-                                                    ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
-                                                    ///< if zero, then will check status and return immediately;
-                                                    ///< if UINT64_MAX, then function will not return until events arrive.
-    uint32_t count,                                 ///< [in] Number of device handles in phDevices.
-    zes_device_handle_t* phDevices,                 ///< [in][range(0, count)] Device handles to listen to for events. Only
-                                                    ///< devices from the provided driver handle can be specified in this list.
-    uint32_t* pNumDeviceEvents,                     ///< [in,out] Will contain the actual number of devices in phDevices that
-                                                    ///< generated events. If non-zero, check pEvents to determine the devices
-                                                    ///< and events that were received.
-    zes_event_type_flags_t* pEvents                 ///< [in,out] An array that will continue the list of events for each
-                                                    ///< device listened in phDevices.
-                                                    ///< This array must be at least as big as count.
-                                                    ///< For every device handle in phDevices, this will provide the events
-                                                    ///< that occurred for that device at the same position in this array. If
-                                                    ///< no event was received for a given device, the corresponding array
-                                                    ///< entry will be zero.
+    ze_driver_handle_t hDriver,                                             ///< [in] handle of the driver instance
+    uint64_t timeout,                                                       ///< [in] if non-zero, then indicates the maximum time (in milliseconds) to
+                                                                            ///< yield before returning ::ZE_RESULT_SUCCESS or ::ZE_RESULT_NOT_READY;
+                                                                            ///< if zero, then will check status and return immediately;
+                                                                            ///< if `UINT64_MAX`, then function will not return until events arrive.
+    uint32_t count,                                                         ///< [in] Number of device handles in phDevices.
+    zes_device_handle_t* phDevices,                                         ///< [in][range(0, count)] Device handles to listen to for events. Only
+                                                                            ///< devices from the provided driver handle can be specified in this list.
+    uint32_t* pNumDeviceEvents,                                             ///< [in,out] Will contain the actual number of devices in phDevices that
+                                                                            ///< generated events. If non-zero, check pEvents to determine the devices
+                                                                            ///< and events that were received.
+    zes_event_type_flags_t* pEvents                                         ///< [in,out] An array that will continue the list of events for each
+                                                                            ///< device listened in phDevices.
+                                                                            ///< This array must be at least as big as count.
+                                                                            ///< For every device handle in phDevices, this will provide the events
+                                                                            ///< that occurred for that device at the same position in this array. If
+                                                                            ///< no event was received for a given device, the corresponding array
+                                                                            ///< entry will be zero.
     );
 
 #if !defined(__GNUC__)
@@ -2425,12 +2664,12 @@ zesDriverEventListenEx(
 /// @brief Fabric port status
 typedef enum _zes_fabric_port_status_t
 {
-    ZES_FABRIC_PORT_STATUS_UNKNOWN = 0,             ///< The port status cannot be determined
-    ZES_FABRIC_PORT_STATUS_HEALTHY = 1,             ///< The port is up and operating as expected
-    ZES_FABRIC_PORT_STATUS_DEGRADED = 2,            ///< The port is up but has quality and/or speed degradation
-    ZES_FABRIC_PORT_STATUS_FAILED = 3,              ///< Port connection instabilities are preventing workloads making forward
-                                                    ///< progress
-    ZES_FABRIC_PORT_STATUS_DISABLED = 4,            ///< The port is configured down
+    ZES_FABRIC_PORT_STATUS_UNKNOWN = 0,                                     ///< The port status cannot be determined
+    ZES_FABRIC_PORT_STATUS_HEALTHY = 1,                                     ///< The port is up and operating as expected
+    ZES_FABRIC_PORT_STATUS_DEGRADED = 2,                                    ///< The port is up but has quality and/or speed degradation
+    ZES_FABRIC_PORT_STATUS_FAILED = 3,                                      ///< Port connection instabilities are preventing workloads making forward
+                                                                            ///< progress
+    ZES_FABRIC_PORT_STATUS_DISABLED = 4,                                    ///< The port is configured down
     ZES_FABRIC_PORT_STATUS_FORCE_UINT32 = 0x7fffffff
 
 } zes_fabric_port_status_t;
@@ -2440,8 +2679,8 @@ typedef enum _zes_fabric_port_status_t
 typedef uint32_t zes_fabric_port_qual_issue_flags_t;
 typedef enum _zes_fabric_port_qual_issue_flag_t
 {
-    ZES_FABRIC_PORT_QUAL_ISSUE_FLAG_LINK_ERRORS = ZE_BIT(0),///< Excessive link errors are occurring
-    ZES_FABRIC_PORT_QUAL_ISSUE_FLAG_SPEED = ZE_BIT(1),  ///< There is a degradation in the bitrate and/or width of the link
+    ZES_FABRIC_PORT_QUAL_ISSUE_FLAG_LINK_ERRORS = ZE_BIT(0),                ///< Excessive link errors are occurring
+    ZES_FABRIC_PORT_QUAL_ISSUE_FLAG_SPEED = ZE_BIT(1),                      ///< There is a degradation in the bitrate and/or width of the link
     ZES_FABRIC_PORT_QUAL_ISSUE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_fabric_port_qual_issue_flag_t;
@@ -2451,17 +2690,17 @@ typedef enum _zes_fabric_port_qual_issue_flag_t
 typedef uint32_t zes_fabric_port_failure_flags_t;
 typedef enum _zes_fabric_port_failure_flag_t
 {
-    ZES_FABRIC_PORT_FAILURE_FLAG_FAILED = ZE_BIT(0),///< A previously operating link has failed. Hardware will automatically
-                                                    ///< retrain this port. This state will persist until either the physical
-                                                    ///< connection is removed or the link trains successfully.
-    ZES_FABRIC_PORT_FAILURE_FLAG_TRAINING_TIMEOUT = ZE_BIT(1),  ///< A connection has not been established within an expected time.
-                                                    ///< Hardware will continue to attempt port training. This status will
-                                                    ///< persist until either the physical connection is removed or the link
-                                                    ///< successfully trains.
-    ZES_FABRIC_PORT_FAILURE_FLAG_FLAPPING = ZE_BIT(2),  ///< Port has excessively trained and then transitioned down for some
-                                                    ///< period of time. Driver will allow port to continue to train, but will
-                                                    ///< not enable the port for use until the port has been disabled and
-                                                    ///< subsequently re-enabled using ::zesFabricPortSetConfig().
+    ZES_FABRIC_PORT_FAILURE_FLAG_FAILED = ZE_BIT(0),                        ///< A previously operating link has failed. Hardware will automatically
+                                                                            ///< retrain this port. This state will persist until either the physical
+                                                                            ///< connection is removed or the link trains successfully.
+    ZES_FABRIC_PORT_FAILURE_FLAG_TRAINING_TIMEOUT = ZE_BIT(1),              ///< A connection has not been established within an expected time.
+                                                                            ///< Hardware will continue to attempt port training. This status will
+                                                                            ///< persist until either the physical connection is removed or the link
+                                                                            ///< successfully trains.
+    ZES_FABRIC_PORT_FAILURE_FLAG_FLAPPING = ZE_BIT(2),                      ///< Port has excessively trained and then transitioned down for some
+                                                                            ///< period of time. Driver will allow port to continue to train, but will
+                                                                            ///< not enable the port for use until the port has been disabled and
+                                                                            ///< subsequently re-enabled using ::zesFabricPortSetConfig().
     ZES_FABRIC_PORT_FAILURE_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_fabric_port_failure_flag_t;
@@ -2475,14 +2714,15 @@ typedef enum _zes_fabric_port_failure_flag_t
 ///       in the hardware may result in a different identifier for a given port.
 ///     - The main purpose of this identifier to build up an instantaneous
 ///       topology map of system connectivity. An application should enumerate
-///       all fabric ports and match ::zes_fabric_port_state_t.remotePortId to
-///       ::zes_fabric_port_properties_t.portId.
+///       all fabric ports and match the `remotePortId` member of
+///       ::zes_fabric_port_state_t to the `portId` member of
+///       ::zes_fabric_port_properties_t.
 typedef struct _zes_fabric_port_id_t
 {
-    uint32_t fabricId;                              ///< [out] Unique identifier for the fabric end-point
-    uint32_t attachId;                              ///< [out] Unique identifier for the device attachment point
-    uint8_t portNumber;                             ///< [out] The logical port number (this is typically marked somewhere on
-                                                    ///< the physical device)
+    uint32_t fabricId;                                                      ///< [out] Unique identifier for the fabric end-point
+    uint32_t attachId;                                                      ///< [out] Unique identifier for the device attachment point
+    uint8_t portNumber;                                                     ///< [out] The logical port number (this is typically marked somewhere on
+                                                                            ///< the physical device)
 
 } zes_fabric_port_id_t;
 
@@ -2490,10 +2730,10 @@ typedef struct _zes_fabric_port_id_t
 /// @brief Fabric port speed in one direction
 typedef struct _zes_fabric_port_speed_t
 {
-    int64_t bitRate;                                ///< [out] Bits/sec that the link is operating at. A value of -1 means that
-                                                    ///< this property is unknown.
-    int32_t width;                                  ///< [out] The number of lanes. A value of -1 means that this property is
-                                                    ///< unknown.
+    int64_t bitRate;                                                        ///< [out] Bits/sec that the link is operating at. A value of -1 means that
+                                                                            ///< this property is unknown.
+    int32_t width;                                                          ///< [out] The number of lanes. A value of -1 means that this property is
+                                                                            ///< unknown.
 
 } zes_fabric_port_speed_t;
 
@@ -2501,19 +2741,19 @@ typedef struct _zes_fabric_port_speed_t
 /// @brief Fabric port properties
 typedef struct _zes_fabric_port_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    char model[ZES_MAX_FABRIC_PORT_MODEL_SIZE];     ///< [out] Description of port technology. Will be set to the string
-                                                    ///< "unkown" if this cannot be determined for this port.
-    ze_bool_t onSubdevice;                          ///< [out] True if the port is located on a sub-device; false means that
-                                                    ///< the port is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    zes_fabric_port_id_t portId;                    ///< [out] The unique port identifier
-    zes_fabric_port_speed_t maxRxSpeed;             ///< [out] Maximum speed supported by the receive side of the port (sum of
-                                                    ///< all lanes)
-    zes_fabric_port_speed_t maxTxSpeed;             ///< [out] Maximum speed supported by the transmit side of the port (sum of
-                                                    ///< all lanes)
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    char model[ZES_MAX_FABRIC_PORT_MODEL_SIZE];                             ///< [out] Description of port technology. Will be set to the string
+                                                                            ///< "unknown" if this cannot be determined for this port.
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the port is located on a sub-device; false means that
+                                                                            ///< the port is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    zes_fabric_port_id_t portId;                                            ///< [out] The unique port identifier
+    zes_fabric_port_speed_t maxRxSpeed;                                     ///< [out] Maximum speed supported by the receive side of the port (sum of
+                                                                            ///< all lanes)
+    zes_fabric_port_speed_t maxTxSpeed;                                     ///< [out] Maximum speed supported by the transmit side of the port (sum of
+                                                                            ///< all lanes)
 
 } zes_fabric_port_properties_t;
 
@@ -2521,8 +2761,8 @@ typedef struct _zes_fabric_port_properties_t
 /// @brief Provides information about the fabric link attached to a port
 typedef struct _zes_fabric_link_type_t
 {
-    char desc[ZES_MAX_FABRIC_LINK_TYPE_SIZE];       ///< [out] Description of link technology. Will be set to the string
-                                                    ///< "unkown" if this cannot be determined for this link.
+    char desc[ZES_MAX_FABRIC_LINK_TYPE_SIZE];                               ///< [out] Description of link technology. Will be set to the string
+                                                                            ///< "unknown" if this cannot be determined for this link.
 
 } zes_fabric_link_type_t;
 
@@ -2530,11 +2770,11 @@ typedef struct _zes_fabric_link_type_t
 /// @brief Fabric port configuration
 typedef struct _zes_fabric_port_config_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t enabled;                              ///< [in,out] Port is configured up/down
-    ze_bool_t beaconing;                            ///< [in,out] Beaconing is configured on/off
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t enabled;                                                      ///< [in,out] Port is configured up/down
+    ze_bool_t beaconing;                                                    ///< [in,out] Beaconing is configured on/off
 
 } zes_fabric_port_config_t;
 
@@ -2542,25 +2782,25 @@ typedef struct _zes_fabric_port_config_t
 /// @brief Fabric port state
 typedef struct _zes_fabric_port_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_fabric_port_status_t status;                ///< [out] The current status of the port
-    zes_fabric_port_qual_issue_flags_t qualityIssues;   ///< [out] If status is ::ZES_FABRIC_PORT_STATUS_DEGRADED,
-                                                    ///< then this gives a combination of ::zes_fabric_port_qual_issue_flag_t
-                                                    ///< for quality issues that have been detected;
-                                                    ///< otherwise, 0 indicates there are no quality issues with the link at
-                                                    ///< this time.
-    zes_fabric_port_failure_flags_t failureReasons; ///< [out] If status is ::ZES_FABRIC_PORT_STATUS_FAILED,
-                                                    ///< then this gives a combination of ::zes_fabric_port_failure_flag_t for
-                                                    ///< reasons for the connection instability;
-                                                    ///< otherwise, 0 indicates there are no connection stability issues at
-                                                    ///< this time.
-    zes_fabric_port_id_t remotePortId;              ///< [out] The unique port identifier for the remote connection point if
-                                                    ///< status is ::ZES_FABRIC_PORT_STATUS_HEALTHY,
-                                                    ///< ::ZES_FABRIC_PORT_STATUS_DEGRADED or ::ZES_FABRIC_PORT_STATUS_FAILED
-    zes_fabric_port_speed_t rxSpeed;                ///< [out] Current maximum receive speed (sum of all lanes)
-    zes_fabric_port_speed_t txSpeed;                ///< [out] Current maximum transmit speed (sum of all lanes)
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_fabric_port_status_t status;                                        ///< [out] The current status of the port
+    zes_fabric_port_qual_issue_flags_t qualityIssues;                       ///< [out] If status is ::ZES_FABRIC_PORT_STATUS_DEGRADED,
+                                                                            ///< then this gives a combination of ::zes_fabric_port_qual_issue_flag_t
+                                                                            ///< for quality issues that have been detected;
+                                                                            ///< otherwise, 0 indicates there are no quality issues with the link at
+                                                                            ///< this time.
+    zes_fabric_port_failure_flags_t failureReasons;                         ///< [out] If status is ::ZES_FABRIC_PORT_STATUS_FAILED,
+                                                                            ///< then this gives a combination of ::zes_fabric_port_failure_flag_t for
+                                                                            ///< reasons for the connection instability;
+                                                                            ///< otherwise, 0 indicates there are no connection stability issues at
+                                                                            ///< this time.
+    zes_fabric_port_id_t remotePortId;                                      ///< [out] The unique port identifier for the remote connection point if
+                                                                            ///< status is ::ZES_FABRIC_PORT_STATUS_HEALTHY,
+                                                                            ///< ::ZES_FABRIC_PORT_STATUS_DEGRADED or ::ZES_FABRIC_PORT_STATUS_FAILED
+    zes_fabric_port_speed_t rxSpeed;                                        ///< [out] Current maximum receive speed (sum of all lanes)
+    zes_fabric_port_speed_t txSpeed;                                        ///< [out] Current maximum transmit speed (sum of all lanes)
 
 } zes_fabric_port_state_t;
 
@@ -2568,22 +2808,36 @@ typedef struct _zes_fabric_port_state_t
 /// @brief Fabric port throughput.
 typedef struct _zes_fabric_port_throughput_t
 {
-    uint64_t timestamp;                             ///< [out] Monotonic timestamp counter in microseconds when the measurement
-                                                    ///< was made.
-                                                    ///< This timestamp should only be used to calculate delta time between
-                                                    ///< snapshots of this structure.
-                                                    ///< Never take the delta of this timestamp with the timestamp from a
-                                                    ///< different structure since they are not guaranteed to have the same base.
-                                                    ///< The absolute value of the timestamp is only valid during within the
-                                                    ///< application and may be different on the next execution.
-    uint64_t rxCounter;                             ///< [out] Monotonic counter for the number of bytes received (sum of all
-                                                    ///< lanes). This includes all protocol overhead, not only the GPU traffic.
-    uint64_t txCounter;                             ///< [out] Monotonic counter for the number of bytes transmitted (sum of
-                                                    ///< all lanes). This includes all protocol overhead, not only the GPU
-                                                    ///< traffic.
+    uint64_t timestamp;                                                     ///< [out] Monotonic timestamp counter in microseconds when the measurement
+                                                                            ///< was made.
+                                                                            ///< This timestamp should only be used to calculate delta time between
+                                                                            ///< snapshots of this structure.
+                                                                            ///< Never take the delta of this timestamp with the timestamp from a
+                                                                            ///< different structure since they are not guaranteed to have the same base.
+                                                                            ///< The absolute value of the timestamp is only valid within the
+                                                                            ///< application and may be different on the next execution.
+    uint64_t rxCounter;                                                     ///< [out] Monotonic counter for the number of bytes received (sum of all
+                                                                            ///< lanes). This includes all protocol overhead, not only the GPU traffic.
+    uint64_t txCounter;                                                     ///< [out] Monotonic counter for the number of bytes transmitted (sum of
+                                                                            ///< all lanes). This includes all protocol overhead, not only the GPU
+                                                                            ///< traffic.
 
 } zes_fabric_port_throughput_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Fabric Port Error Counters
+typedef struct _zes_fabric_port_error_counters_t
+{
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint64_t linkFailureCount;                                              ///< [out] Link Failure Error Count reported per port
+    uint64_t fwCommErrorCount;                                              ///< [out] Firmware Communication Error Count reported per device
+    uint64_t fwErrorCount;                                                  ///< [out] Firmware reported Error Count reported per device
+    uint64_t linkDegradeCount;                                              ///< [out] Link Degrade Error Count reported per port
+
+} zes_fabric_port_error_counters_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Get handle of Fabric ports in a device
 /// 
@@ -2603,18 +2857,18 @@ typedef struct _zes_fabric_port_throughput_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumFabricPorts(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_fabric_port_handle_t* phPort                ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_fabric_port_handle_t* phPort                                        ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2636,8 +2890,8 @@ zesDeviceEnumFabricPorts(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFabricPortGetProperties(
-    zes_fabric_port_handle_t hPort,                 ///< [in] Handle for the component.
-    zes_fabric_port_properties_t* pProperties       ///< [in,out] Will contain properties of the Fabric Port.
+    zes_fabric_port_handle_t hPort,                                         ///< [in] Handle for the component.
+    zes_fabric_port_properties_t* pProperties                               ///< [in,out] Will contain properties of the Fabric Port.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2659,9 +2913,9 @@ zesFabricPortGetProperties(
 ///         + `nullptr == pLinkType`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFabricPortGetLinkType(
-    zes_fabric_port_handle_t hPort,                 ///< [in] Handle for the component.
-    zes_fabric_link_type_t* pLinkType               ///< [in,out] Will contain details about the link attached to the Fabric
-                                                    ///< port.
+    zes_fabric_port_handle_t hPort,                                         ///< [in] Handle for the component.
+    zes_fabric_link_type_t* pLinkType                                       ///< [in,out] Will contain details about the link attached to the Fabric
+                                                                            ///< port.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2683,8 +2937,8 @@ zesFabricPortGetLinkType(
 ///         + `nullptr == pConfig`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFabricPortGetConfig(
-    zes_fabric_port_handle_t hPort,                 ///< [in] Handle for the component.
-    zes_fabric_port_config_t* pConfig               ///< [in,out] Will contain configuration of the Fabric Port.
+    zes_fabric_port_handle_t hPort,                                         ///< [in] Handle for the component.
+    zes_fabric_port_config_t* pConfig                                       ///< [in,out] Will contain configuration of the Fabric Port.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2708,8 +2962,8 @@ zesFabricPortGetConfig(
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFabricPortSetConfig(
-    zes_fabric_port_handle_t hPort,                 ///< [in] Handle for the component.
-    const zes_fabric_port_config_t* pConfig         ///< [in] Contains new configuration of the Fabric Port.
+    zes_fabric_port_handle_t hPort,                                         ///< [in] Handle for the component.
+    const zes_fabric_port_config_t* pConfig                                 ///< [in] Contains new configuration of the Fabric Port.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2732,8 +2986,8 @@ zesFabricPortSetConfig(
 ///         + `nullptr == pState`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFabricPortGetState(
-    zes_fabric_port_handle_t hPort,                 ///< [in] Handle for the component.
-    zes_fabric_port_state_t* pState                 ///< [in,out] Will contain the current state of the Fabric Port
+    zes_fabric_port_handle_t hPort,                                         ///< [in] Handle for the component.
+    zes_fabric_port_state_t* pState                                         ///< [in,out] Will contain the current state of the Fabric Port
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2757,8 +3011,64 @@ zesFabricPortGetState(
 ///         + User does not have permissions to query this telemetry.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFabricPortGetThroughput(
-    zes_fabric_port_handle_t hPort,                 ///< [in] Handle for the component.
-    zes_fabric_port_throughput_t* pThroughput       ///< [in,out] Will contain the Fabric port throughput counters.
+    zes_fabric_port_handle_t hPort,                                         ///< [in] Handle for the component.
+    zes_fabric_port_throughput_t* pThroughput                               ///< [in,out] Will contain the Fabric port throughput counters.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get Fabric Port Error Counters
+/// 
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+///     - The memory backing the arrays for phPorts and ppThroughputs must be
+///       allocated in system memory by the user who is also responsible for
+///       releasing them when they are no longer needed.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hPort`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pErrors`
+///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
+///         + User does not have permissions to query this telemetry.
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesFabricPortGetFabricErrorCounters(
+    zes_fabric_port_handle_t hPort,                                         ///< [in] Handle for the component.
+    zes_fabric_port_error_counters_t* pErrors                               ///< [in,out] Will contain the Fabric port Error counters.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get Fabric port throughput from multiple ports in a single call
+/// 
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hDevice`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == phPort`
+///         + `nullptr == pThroughput`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesFabricPortGetMultiPortThroughput(
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t numPorts,                                                      ///< [in] Number of ports enumerated in function ::zesDeviceEnumFabricPorts
+    zes_fabric_port_handle_t* phPort,                                       ///< [in][range(0, numPorts)] array of fabric port handles provided by user
+                                                                            ///< to gather throughput values. 
+    zes_fabric_port_throughput_t** pThroughput                              ///< [out][range(0, numPorts)] array of fabric port throughput counters
+                                                                            ///< from multiple ports of type ::zes_fabric_port_throughput_t.
     );
 
 #if !defined(__GNUC__)
@@ -2772,10 +3082,10 @@ zesFabricPortGetThroughput(
 /// @brief Fan resource speed mode
 typedef enum _zes_fan_speed_mode_t
 {
-    ZES_FAN_SPEED_MODE_DEFAULT = 0,                 ///< The fan speed is operating using the hardware default settings
-    ZES_FAN_SPEED_MODE_FIXED = 1,                   ///< The fan speed is currently set to a fixed value
-    ZES_FAN_SPEED_MODE_TABLE = 2,                   ///< The fan speed is currently controlled dynamically by hardware based on
-                                                    ///< a temp/speed table
+    ZES_FAN_SPEED_MODE_DEFAULT = 0,                                         ///< The fan speed is operating using the hardware default settings
+    ZES_FAN_SPEED_MODE_FIXED = 1,                                           ///< The fan speed is currently set to a fixed value
+    ZES_FAN_SPEED_MODE_TABLE = 2,                                           ///< The fan speed is currently controlled dynamically by hardware based on
+                                                                            ///< a temp/speed table
     ZES_FAN_SPEED_MODE_FORCE_UINT32 = 0x7fffffff
 
 } zes_fan_speed_mode_t;
@@ -2784,8 +3094,8 @@ typedef enum _zes_fan_speed_mode_t
 /// @brief Fan speed units
 typedef enum _zes_fan_speed_units_t
 {
-    ZES_FAN_SPEED_UNITS_RPM = 0,                    ///< The fan speed is in units of revolutions per minute (rpm)
-    ZES_FAN_SPEED_UNITS_PERCENT = 1,                ///< The fan speed is a percentage of the maximum speed of the fan
+    ZES_FAN_SPEED_UNITS_RPM = 0,                                            ///< The fan speed is in units of revolutions per minute (rpm)
+    ZES_FAN_SPEED_UNITS_PERCENT = 1,                                        ///< The fan speed is a percentage of the maximum speed of the fan
     ZES_FAN_SPEED_UNITS_FORCE_UINT32 = 0x7fffffff
 
 } zes_fan_speed_units_t;
@@ -2794,10 +3104,10 @@ typedef enum _zes_fan_speed_units_t
 /// @brief Fan speed
 typedef struct _zes_fan_speed_t
 {
-    int32_t speed;                                  ///< [in,out] The speed of the fan. On output, a value of -1 indicates that
-                                                    ///< there is no fixed fan speed setting.
-    zes_fan_speed_units_t units;                    ///< [in,out] The units that the fan speed is expressed in. On output, if
-                                                    ///< fan speed is -1 then units should be ignored.
+    int32_t speed;                                                          ///< [in,out] The speed of the fan. On output, a value of -1 indicates that
+                                                                            ///< there is no fixed fan speed setting.
+    zes_fan_speed_units_t units;                                            ///< [in,out] The units that the fan speed is expressed in. On output, if
+                                                                            ///< fan speed is -1 then units should be ignored.
 
 } zes_fan_speed_t;
 
@@ -2805,8 +3115,8 @@ typedef struct _zes_fan_speed_t
 /// @brief Fan temperature/speed pair
 typedef struct _zes_fan_temp_speed_t
 {
-    uint32_t temperature;                           ///< [in,out] Temperature in degrees Celsius.
-    zes_fan_speed_t speed;                          ///< [in,out] The speed of the fan
+    uint32_t temperature;                                                   ///< [in,out] Temperature in degrees Celsius.
+    zes_fan_speed_t speed;                                                  ///< [in,out] The speed of the fan
 
 } zes_fan_temp_speed_t;
 
@@ -2820,11 +3130,11 @@ typedef struct _zes_fan_temp_speed_t
 /// @brief Fan speed table
 typedef struct _zes_fan_speed_table_t
 {
-    int32_t numPoints;                              ///< [in,out] The number of valid points in the fan speed table. 0 means
-                                                    ///< that there is no fan speed table configured. -1 means that a fan speed
-                                                    ///< table is not supported by the hardware.
-    zes_fan_temp_speed_t table[ZES_FAN_TEMP_SPEED_PAIR_COUNT];  ///< [in,out] Array of temperature/fan speed pairs. The table is ordered
-                                                    ///< based on temperature from lowest to highest.
+    int32_t numPoints;                                                      ///< [in,out] The number of valid points in the fan speed table. 0 means
+                                                                            ///< that there is no fan speed table configured. -1 means that a fan speed
+                                                                            ///< table is not supported by the hardware.
+    zes_fan_temp_speed_t table[ZES_FAN_TEMP_SPEED_PAIR_COUNT];              ///< [in,out] Array of temperature/fan speed pairs. The table is ordered
+                                                                            ///< based on temperature from lowest to highest.
 
 } zes_fan_speed_table_t;
 
@@ -2832,23 +3142,23 @@ typedef struct _zes_fan_speed_table_t
 /// @brief Fan properties
 typedef struct _zes_fan_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t onSubdevice;                          ///< [out] True if the resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    ze_bool_t canControl;                           ///< [out] Indicates if software can control the fan speed assuming the
-                                                    ///< user has permissions
-    uint32_t supportedModes;                        ///< [out] Bitfield of supported fan configuration modes
-                                                    ///< (1<<::zes_fan_speed_mode_t)
-    uint32_t supportedUnits;                        ///< [out] Bitfield of supported fan speed units
-                                                    ///< (1<<::zes_fan_speed_units_t)
-    int32_t maxRPM;                                 ///< [out] The maximum RPM of the fan. A value of -1 means that this
-                                                    ///< property is unknown. 
-    int32_t maxPoints;                              ///< [out] The maximum number of points in the fan temp/speed table. A
-                                                    ///< value of -1 means that this fan doesn't support providing a temp/speed
-                                                    ///< table.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    ze_bool_t canControl;                                                   ///< [out] Indicates if software can control the fan speed assuming the
+                                                                            ///< user has permissions
+    uint32_t supportedModes;                                                ///< [out] Bitfield of supported fan configuration modes
+                                                                            ///< (1<<::zes_fan_speed_mode_t)
+    uint32_t supportedUnits;                                                ///< [out] Bitfield of supported fan speed units
+                                                                            ///< (1<<::zes_fan_speed_units_t)
+    int32_t maxRPM;                                                         ///< [out] The maximum RPM of the fan. A value of -1 means that this
+                                                                            ///< property is unknown.
+    int32_t maxPoints;                                                      ///< [out] The maximum number of points in the fan temp/speed table. A
+                                                                            ///< value of -1 means that this fan doesn't support providing a temp/speed
+                                                                            ///< table.
 
 } zes_fan_properties_t;
 
@@ -2856,12 +3166,12 @@ typedef struct _zes_fan_properties_t
 /// @brief Fan configuration
 typedef struct _zes_fan_config_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_fan_speed_mode_t mode;                      ///< [in,out] The fan speed mode (fixed, temp-speed table)
-    zes_fan_speed_t speedFixed;                     ///< [in,out] The current fixed fan speed setting
-    zes_fan_speed_table_t speedTable;               ///< [out] A table containing temperature/speed pairs
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_fan_speed_mode_t mode;                                              ///< [in,out] The fan speed mode (fixed, temp-speed table)
+    zes_fan_speed_t speedFixed;                                             ///< [in,out] The current fixed fan speed setting
+    zes_fan_speed_table_t speedTable;                                       ///< [out] A table containing temperature/speed pairs
 
 } zes_fan_config_t;
 
@@ -2884,18 +3194,18 @@ typedef struct _zes_fan_config_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumFans(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_fan_handle_t* phFan                         ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_fan_handle_t* phFan                                                 ///< [in,out][optional][range(0, *pCount)] array of handles of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2917,8 +3227,8 @@ zesDeviceEnumFans(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFanGetProperties(
-    zes_fan_handle_t hFan,                          ///< [in] Handle for the component.
-    zes_fan_properties_t* pProperties               ///< [in,out] Will contain the properties of the fan.
+    zes_fan_handle_t hFan,                                                  ///< [in] Handle for the component.
+    zes_fan_properties_t* pProperties                                       ///< [in,out] Will contain the properties of the fan.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2941,8 +3251,8 @@ zesFanGetProperties(
 ///         + `nullptr == pConfig`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFanGetConfig(
-    zes_fan_handle_t hFan,                          ///< [in] Handle for the component.
-    zes_fan_config_t* pConfig                       ///< [in,out] Will contain the current configuration of the fan.
+    zes_fan_handle_t hFan,                                                  ///< [in] Handle for the component.
+    zes_fan_config_t* pConfig                                               ///< [in,out] Will contain the current configuration of the fan.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2965,7 +3275,7 @@ zesFanGetConfig(
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFanSetDefaultMode(
-    zes_fan_handle_t hFan                           ///< [in] Handle for the component.
+    zes_fan_handle_t hFan                                                   ///< [in] Handle for the component.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -2989,11 +3299,11 @@ zesFanSetDefaultMode(
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Fixing the fan speed not supported by the hardware or the fan speed units are not supported. See ::zes_fan_properties_t.supportedModes and ::zes_fan_properties_t.supportedUnits.
+///         + Fixing the fan speed not supported by the hardware or the fan speed units are not supported. See the `supportedModes` and `supportedUnits` members of ::zes_fan_properties_t.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFanSetFixedSpeedMode(
-    zes_fan_handle_t hFan,                          ///< [in] Handle for the component.
-    const zes_fan_speed_t* speed                    ///< [in] The fixed fan speed setting
+    zes_fan_handle_t hFan,                                                  ///< [in] Handle for the component.
+    const zes_fan_speed_t* speed                                            ///< [in] The fixed fan speed setting
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3019,11 +3329,11 @@ zesFanSetFixedSpeedMode(
 ///     - ::ZE_RESULT_ERROR_INVALID_ARGUMENT
 ///         + The temperature/speed pairs in the array are not sorted on temperature from lowest to highest.
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Fan speed table not supported by the hardware or the fan speed units are not supported. See ::zes_fan_properties_t.supportedModes and ::zes_fan_properties_t.supportedUnits.
+///         + Fan speed table not supported by the hardware or the fan speed units are not supported. See the `supportedModes` and `supportedUnits` members of ::zes_fan_properties_t.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFanSetSpeedTableMode(
-    zes_fan_handle_t hFan,                          ///< [in] Handle for the component.
-    const zes_fan_speed_table_t* speedTable         ///< [in] A table containing temperature/speed pairs.
+    zes_fan_handle_t hFan,                                                  ///< [in] Handle for the component.
+    const zes_fan_speed_table_t* speedTable                                 ///< [in] A table containing temperature/speed pairs.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3046,14 +3356,14 @@ zesFanSetSpeedTableMode(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == pSpeed`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + The requested fan speed units are not supported. See ::zes_fan_properties_t.supportedUnits.
+///         + The requested fan speed units are not supported. See the `supportedUnits` member of ::zes_fan_properties_t.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFanGetState(
-    zes_fan_handle_t hFan,                          ///< [in] Handle for the component.
-    zes_fan_speed_units_t units,                    ///< [in] The units in which the fan speed should be returned.
-    int32_t* pSpeed                                 ///< [in,out] Will contain the current speed of the fan in the units
-                                                    ///< requested. A value of -1 indicates that the fan speed cannot be
-                                                    ///< measured.
+    zes_fan_handle_t hFan,                                                  ///< [in] Handle for the component.
+    zes_fan_speed_units_t units,                                            ///< [in] The units in which the fan speed should be returned.
+    int32_t* pSpeed                                                         ///< [in,out] Will contain the current speed of the fan in the units
+                                                                            ///< requested. A value of -1 indicates that the fan speed cannot be
+                                                                            ///< measured.
     );
 
 #if !defined(__GNUC__)
@@ -3067,18 +3377,18 @@ zesFanGetState(
 /// @brief Firmware properties
 typedef struct _zes_firmware_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t onSubdevice;                          ///< [out] True if the resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    ze_bool_t canControl;                           ///< [out] Indicates if software can flash the firmware assuming the user
-                                                    ///< has permissions
-    char name[ZES_STRING_PROPERTY_SIZE];            ///< [out] NULL terminated string value. The string "unknown" will be
-                                                    ///< returned if this property cannot be determined.
-    char version[ZES_STRING_PROPERTY_SIZE];         ///< [out] NULL terminated string value. The string "unknown" will be
-                                                    ///< returned if this property cannot be determined.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    ze_bool_t canControl;                                                   ///< [out] Indicates if software can flash the firmware assuming the user
+                                                                            ///< has permissions
+    char name[ZES_STRING_PROPERTY_SIZE];                                    ///< [out] NULL terminated string value. The string "unknown" will be
+                                                                            ///< returned if this property cannot be determined.
+    char version[ZES_STRING_PROPERTY_SIZE];                                 ///< [out] NULL terminated string value. The string "unknown" will be
+                                                                            ///< returned if this property cannot be determined.
 
 } zes_firmware_properties_t;
 
@@ -3101,18 +3411,18 @@ typedef struct _zes_firmware_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumFirmwares(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_firmware_handle_t* phFirmware               ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_firmware_handle_t* phFirmware                                       ///< [in,out][optional][range(0, *pCount)] array of handles of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3134,17 +3444,21 @@ zesDeviceEnumFirmwares(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFirmwareGetProperties(
-    zes_firmware_handle_t hFirmware,                ///< [in] Handle for the component.
-    zes_firmware_properties_t* pProperties          ///< [in,out] Pointer to an array that will hold the properties of the
-                                                    ///< firmware
+    zes_firmware_handle_t hFirmware,                                        ///< [in] Handle for the component.
+    zes_firmware_properties_t* pProperties                                  ///< [in,out] Pointer to an array that will hold the properties of the
+                                                                            ///< firmware
     );
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Flash a new firmware image
 /// 
 /// @details
+///     - Any running workload must be gracefully closed before invoking this
+///       function.
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - This is a non-blocking call. Application may call
+///       ::zesFirmwareGetFlashProgress to get completion status.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3160,9 +3474,32 @@ zesFirmwareGetProperties(
 ///         + User does not have permissions to perform this operation.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFirmwareFlash(
-    zes_firmware_handle_t hFirmware,                ///< [in] Handle for the component.
-    void* pImage,                                   ///< [in] Image of the new firmware to flash.
-    uint32_t size                                   ///< [in] Size of the flash image.
+    zes_firmware_handle_t hFirmware,                                        ///< [in] Handle for the component.
+    void* pImage,                                                           ///< [in] Image of the new firmware to flash.
+    uint32_t size                                                           ///< [in] Size of the flash image.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get Firmware Flash Progress
+/// 
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hFirmware`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pCompletionPercent`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesFirmwareGetFlashProgress(
+    zes_firmware_handle_t hFirmware,                                        ///< [in] Handle for the component.
+    uint32_t* pCompletionPercent                                            ///< [in,out] Pointer to the Completion Percentage of Firmware Update
     );
 
 #if !defined(__GNUC__)
@@ -3176,9 +3513,9 @@ zesFirmwareFlash(
 /// @brief Frequency domains.
 typedef enum _zes_freq_domain_t
 {
-    ZES_FREQ_DOMAIN_GPU = 0,                        ///< GPU Core Domain.
-    ZES_FREQ_DOMAIN_MEMORY = 1,                     ///< Local Memory Domain.
-    ZES_FREQ_DOMAIN_MEDIA = 2,                      ///< GPU Media Domain.
+    ZES_FREQ_DOMAIN_GPU = 0,                                                ///< GPU Core Domain.
+    ZES_FREQ_DOMAIN_MEMORY = 1,                                             ///< Local Memory Domain.
+    ZES_FREQ_DOMAIN_MEDIA = 2,                                              ///< GPU Media Domain.
     ZES_FREQ_DOMAIN_FORCE_UINT32 = 0x7fffffff
 
 } zes_freq_domain_t;
@@ -3195,21 +3532,21 @@ typedef enum _zes_freq_domain_t
 ///       frequency that can be requested.
 typedef struct _zes_freq_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_freq_domain_t type;                         ///< [out] The hardware block that this frequency domain controls (GPU,
-                                                    ///< memory, ...)
-    ze_bool_t onSubdevice;                          ///< [out] True if this resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    ze_bool_t canControl;                           ///< [out] Indicates if software can control the frequency of this domain
-                                                    ///< assuming the user has permissions
-    ze_bool_t isThrottleEventSupported;             ///< [out] Indicates if software can register to receive event
-                                                    ///< ::ZES_EVENT_TYPE_FLAG_FREQ_THROTTLED
-    double min;                                     ///< [out] The minimum hardware clock frequency in units of MHz.
-    double max;                                     ///< [out] The maximum non-overclock hardware clock frequency in units of
-                                                    ///< MHz.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_freq_domain_t type;                                                 ///< [out] The hardware block that this frequency domain controls (GPU,
+                                                                            ///< memory, ...)
+    ze_bool_t onSubdevice;                                                  ///< [out] True if this resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    ze_bool_t canControl;                                                   ///< [out] Indicates if software can control the frequency of this domain
+                                                                            ///< assuming the user has permissions
+    ze_bool_t isThrottleEventSupported;                                     ///< [out] Indicates if software can register to receive event
+                                                                            ///< ::ZES_EVENT_TYPE_FLAG_FREQ_THROTTLED
+    double min;                                                             ///< [out] The minimum hardware clock frequency in units of MHz.
+    double max;                                                             ///< [out] The maximum non-overclock hardware clock frequency in units of
+                                                                            ///< MHz.
 
 } zes_freq_properties_t;
 
@@ -3224,19 +3561,19 @@ typedef struct _zes_freq_properties_t
 ///       the min and max limit.
 typedef struct _zes_freq_range_t
 {
-    double min;                                     ///< [in,out] The min frequency in MHz below which hardware frequency
-                                                    ///< management will not request frequencies. On input, setting to 0 will
-                                                    ///< permit the frequency to go down to the hardware minimum while setting
-                                                    ///< to -1 will return the min frequency limit to the factory value (can be
-                                                    ///< larger than the hardware min). On output, a negative value indicates
-                                                    ///< that no external minimum frequency limit is in effect.
-    double max;                                     ///< [in,out] The max frequency in MHz above which hardware frequency
-                                                    ///< management will not request frequencies. On input, setting to 0 or a
-                                                    ///< very big number will permit the frequency to go all the way up to the
-                                                    ///< hardware maximum while setting to -1 will return the max frequency to
-                                                    ///< the factory value (which can be less than the hardware max). On
-                                                    ///< output, a negative number indicates that no external maximum frequency
-                                                    ///< limit is in effect.
+    double min;                                                             ///< [in,out] The min frequency in MHz below which hardware frequency
+                                                                            ///< management will not request frequencies. On input, setting to 0 will
+                                                                            ///< permit the frequency to go down to the hardware minimum while setting
+                                                                            ///< to -1 will return the min frequency limit to the factory value (can be
+                                                                            ///< larger than the hardware min). On output, a negative value indicates
+                                                                            ///< that no external minimum frequency limit is in effect.
+    double max;                                                             ///< [in,out] The max frequency in MHz above which hardware frequency
+                                                                            ///< management will not request frequencies. On input, setting to 0 or a
+                                                                            ///< very big number will permit the frequency to go all the way up to the
+                                                                            ///< hardware maximum while setting to -1 will return the max frequency to
+                                                                            ///< the factory value (which can be less than the hardware max). On
+                                                                            ///< output, a negative number indicates that no external maximum frequency
+                                                                            ///< limit is in effect.
 
 } zes_freq_range_t;
 
@@ -3245,14 +3582,14 @@ typedef struct _zes_freq_range_t
 typedef uint32_t zes_freq_throttle_reason_flags_t;
 typedef enum _zes_freq_throttle_reason_flag_t
 {
-    ZES_FREQ_THROTTLE_REASON_FLAG_AVE_PWR_CAP = ZE_BIT(0),  ///< frequency throttled due to average power excursion (PL1)
-    ZES_FREQ_THROTTLE_REASON_FLAG_BURST_PWR_CAP = ZE_BIT(1),///< frequency throttled due to burst power excursion (PL2)
-    ZES_FREQ_THROTTLE_REASON_FLAG_CURRENT_LIMIT = ZE_BIT(2),///< frequency throttled due to current excursion (PL4)
-    ZES_FREQ_THROTTLE_REASON_FLAG_THERMAL_LIMIT = ZE_BIT(3),///< frequency throttled due to thermal excursion (T > TjMax)
-    ZES_FREQ_THROTTLE_REASON_FLAG_PSU_ALERT = ZE_BIT(4),///< frequency throttled due to power supply assertion
-    ZES_FREQ_THROTTLE_REASON_FLAG_SW_RANGE = ZE_BIT(5), ///< frequency throttled due to software supplied frequency range
-    ZES_FREQ_THROTTLE_REASON_FLAG_HW_RANGE = ZE_BIT(6), ///< frequency throttled due to a sub block that has a lower frequency
-                                                    ///< range when it receives clocks
+    ZES_FREQ_THROTTLE_REASON_FLAG_AVE_PWR_CAP = ZE_BIT(0),                  ///< frequency throttled due to average power excursion (PL1)
+    ZES_FREQ_THROTTLE_REASON_FLAG_BURST_PWR_CAP = ZE_BIT(1),                ///< frequency throttled due to burst power excursion (PL2)
+    ZES_FREQ_THROTTLE_REASON_FLAG_CURRENT_LIMIT = ZE_BIT(2),                ///< frequency throttled due to current excursion (PL4)
+    ZES_FREQ_THROTTLE_REASON_FLAG_THERMAL_LIMIT = ZE_BIT(3),                ///< frequency throttled due to thermal excursion (T > TjMax)
+    ZES_FREQ_THROTTLE_REASON_FLAG_PSU_ALERT = ZE_BIT(4),                    ///< frequency throttled due to power supply assertion
+    ZES_FREQ_THROTTLE_REASON_FLAG_SW_RANGE = ZE_BIT(5),                     ///< frequency throttled due to software supplied frequency range
+    ZES_FREQ_THROTTLE_REASON_FLAG_HW_RANGE = ZE_BIT(6),                     ///< frequency throttled due to a sub block that has a lower frequency
+                                                                            ///< range when it receives clocks
     ZES_FREQ_THROTTLE_REASON_FLAG_FORCE_UINT32 = 0x7fffffff
 
 } zes_freq_throttle_reason_flag_t;
@@ -3261,23 +3598,23 @@ typedef enum _zes_freq_throttle_reason_flag_t
 /// @brief Frequency state
 typedef struct _zes_freq_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    double currentVoltage;                          ///< [out] Current voltage in Volts. A negative value indicates that this
-                                                    ///< property is not known.
-    double request;                                 ///< [out] The current frequency request in MHz. A negative value indicates
-                                                    ///< that this property is not known.
-    double tdp;                                     ///< [out] The maximum frequency in MHz supported under the current TDP
-                                                    ///< conditions. This fluctuates dynamically based on the power and thermal
-                                                    ///< limits of the part. A negative value indicates that this property is
-                                                    ///< not known.
-    double efficient;                               ///< [out] The efficient minimum frequency in MHz. A negative value
-                                                    ///< indicates that this property is not known.
-    double actual;                                  ///< [out] The resolved frequency in MHz. A negative value indicates that
-                                                    ///< this property is not known.
-    zes_freq_throttle_reason_flags_t throttleReasons;   ///< [out] The reasons that the frequency is being limited by the hardware.
-                                                    ///< Returns 0 (frequency not throttled) or a combination of ::zes_freq_throttle_reason_flag_t.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    double currentVoltage;                                                  ///< [out] Current voltage in Volts. A negative value indicates that this
+                                                                            ///< property is not known.
+    double request;                                                         ///< [out] The current frequency request in MHz. A negative value indicates
+                                                                            ///< that this property is not known.
+    double tdp;                                                             ///< [out] The maximum frequency in MHz supported under the current TDP
+                                                                            ///< conditions. This fluctuates dynamically based on the power and thermal
+                                                                            ///< limits of the part. A negative value indicates that this property is
+                                                                            ///< not known.
+    double efficient;                                                       ///< [out] The efficient minimum frequency in MHz. A negative value
+                                                                            ///< indicates that this property is not known.
+    double actual;                                                          ///< [out] The resolved frequency in MHz. A negative value indicates that
+                                                                            ///< this property is not known.
+    zes_freq_throttle_reason_flags_t throttleReasons;                       ///< [out] The reasons that the frequency is being limited by the hardware.
+                                                                            ///< Returns 0 (frequency not throttled) or a combination of ::zes_freq_throttle_reason_flag_t.
 
 } zes_freq_state_t;
 
@@ -3290,38 +3627,41 @@ typedef struct _zes_freq_state_t
 ///       s1.throttleTime) / (s2.timestamp - s1.timestamp)
 typedef struct _zes_freq_throttle_time_t
 {
-    uint64_t throttleTime;                          ///< [out] The monotonic counter of time in microseconds that the frequency
-                                                    ///< has been limited by the hardware.
-    uint64_t timestamp;                             ///< [out] Microsecond timestamp when throttleTime was captured.
-                                                    ///< This timestamp should only be used to calculate delta time between
-                                                    ///< snapshots of this structure.
-                                                    ///< Never take the delta of this timestamp with the timestamp from a
-                                                    ///< different structure since they are not guaranteed to have the same base.
-                                                    ///< The absolute value of the timestamp is only valid during within the
-                                                    ///< application and may be different on the next execution.
+    uint64_t throttleTime;                                                  ///< [out] The monotonic counter of time in microseconds that the frequency
+                                                                            ///< has been limited by the hardware.
+    uint64_t timestamp;                                                     ///< [out] Microsecond timestamp when throttleTime was captured.
+                                                                            ///< This timestamp should only be used to calculate delta time between
+                                                                            ///< snapshots of this structure.
+                                                                            ///< Never take the delta of this timestamp with the timestamp from a
+                                                                            ///< different structure since they are not guaranteed to have the same base.
+                                                                            ///< The absolute value of the timestamp is only valid during within the
+                                                                            ///< application and may be different on the next execution.
 
 } zes_freq_throttle_time_t;
 
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Overclocking modes
+/// 
+/// @details
+///     - [DEPRECATED] No longer supported.
 typedef enum _zes_oc_mode_t
 {
-    ZES_OC_MODE_OFF = 0,                            ///< Overclocking if off - hardware is running using factory default
-                                                    ///< voltages/frequencies.
-    ZES_OC_MODE_OVERRIDE = 1,                       ///< Overclock override mode - In this mode, a fixed user-supplied voltage
-                                                    ///< is applied independent of the frequency request. The maximum permitted
-                                                    ///< frequency can also be increased. This mode disables INTERPOLATIVE and
-                                                    ///< FIXED modes.
-    ZES_OC_MODE_INTERPOLATIVE = 2,                  ///< Overclock interpolative mode - In this mode, the voltage/frequency
-                                                    ///< curve can be extended with a new voltage/frequency point that will be
-                                                    ///< interpolated. The existing voltage/frequency points can also be offset
-                                                    ///< (up or down) by a fixed voltage. This mode disables FIXED and OVERRIDE
-                                                    ///< modes.
-    ZES_OC_MODE_FIXED = 3,                          ///< Overclocking fixed Mode - In this mode, hardware will disable most
-                                                    ///< frequency throttling and lock the frequency and voltage at the
-                                                    ///< specified overclock values. This mode disables OVERRIDE and
-                                                    ///< INTERPOLATIVE modes. This mode can damage the part, most of the
-                                                    ///< protections are disabled on this mode.
+    ZES_OC_MODE_OFF = 0,                                                    ///< Overclocking if off - hardware is running using factory default
+                                                                            ///< voltages/frequencies.
+    ZES_OC_MODE_OVERRIDE = 1,                                               ///< Overclock override mode - In this mode, a fixed user-supplied voltage
+                                                                            ///< is applied independent of the frequency request. The maximum permitted
+                                                                            ///< frequency can also be increased. This mode disables INTERPOLATIVE and
+                                                                            ///< FIXED modes.
+    ZES_OC_MODE_INTERPOLATIVE = 2,                                          ///< Overclock interpolative mode - In this mode, the voltage/frequency
+                                                                            ///< curve can be extended with a new voltage/frequency point that will be
+                                                                            ///< interpolated. The existing voltage/frequency points can also be offset
+                                                                            ///< (up or down) by a fixed voltage. This mode disables FIXED and OVERRIDE
+                                                                            ///< modes.
+    ZES_OC_MODE_FIXED = 3,                                                  ///< Overclocking fixed Mode - In this mode, hardware will disable most
+                                                                            ///< frequency throttling and lock the frequency and voltage at the
+                                                                            ///< specified overclock values. This mode disables OVERRIDE and
+                                                                            ///< INTERPOLATIVE modes. This mode can damage the part, most of the
+                                                                            ///< protections are disabled on this mode.
     ZES_OC_MODE_FORCE_UINT32 = 0x7fffffff
 
 } zes_oc_mode_t;
@@ -3332,35 +3672,36 @@ typedef enum _zes_oc_mode_t
 /// @details
 ///     - Provides all the overclocking capabilities and properties supported by
 ///       the device for the frequency domain.
+///     - [DEPRECATED] No longer supported.
 typedef struct _zes_oc_capabilities_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t isOcSupported;                        ///< [out] Indicates if any overclocking features are supported on this
-                                                    ///< frequency domain.
-    double maxFactoryDefaultFrequency;              ///< [out] Factory default non-overclock maximum frequency in Mhz.
-    double maxFactoryDefaultVoltage;                ///< [out] Factory default voltage used for the non-overclock maximum
-                                                    ///< frequency in MHz.
-    double maxOcFrequency;                          ///< [out] Maximum hardware overclocking frequency limit in Mhz.
-    double minOcVoltageOffset;                      ///< [out] The minimum voltage offset that can be applied to the
-                                                    ///< voltage/frequency curve. Note that this number can be negative.
-    double maxOcVoltageOffset;                      ///< [out] The maximum voltage offset that can be applied to the
-                                                    ///< voltage/frequency curve.
-    double maxOcVoltage;                            ///< [out] The maximum overclock voltage that hardware supports.
-    ze_bool_t isTjMaxSupported;                     ///< [out] Indicates if the maximum temperature limit (TjMax) can be
-                                                    ///< changed for this frequency domain.
-    ze_bool_t isIccMaxSupported;                    ///< [out] Indicates if the maximum current (IccMax) can be changed for
-                                                    ///< this frequency domain.
-    ze_bool_t isHighVoltModeCapable;                ///< [out] Indicates if this frequency domains supports a feature to set
-                                                    ///< very high voltages.
-    ze_bool_t isHighVoltModeEnabled;                ///< [out] Indicates if very high voltages are permitted on this frequency
-                                                    ///< domain.
-    ze_bool_t isExtendedModeSupported;              ///< [out] Indicates if the extended overclocking features are supported.
-                                                    ///< If this is supported, increments are on 1 Mhz basis.
-    ze_bool_t isFixedModeSupported;                 ///< [out] Indicates if the fixed mode is supported. In this mode, hardware
-                                                    ///< will disable most frequency throttling and lock the frequency and
-                                                    ///< voltage at the specified overclock values.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t isOcSupported;                                                ///< [out] Indicates if any overclocking features are supported on this
+                                                                            ///< frequency domain.
+    double maxFactoryDefaultFrequency;                                      ///< [out] Factory default non-overclock maximum frequency in Mhz.
+    double maxFactoryDefaultVoltage;                                        ///< [out] Factory default voltage used for the non-overclock maximum
+                                                                            ///< frequency in MHz.
+    double maxOcFrequency;                                                  ///< [out] Maximum hardware overclocking frequency limit in Mhz.
+    double minOcVoltageOffset;                                              ///< [out] The minimum voltage offset that can be applied to the
+                                                                            ///< voltage/frequency curve. Note that this number can be negative.
+    double maxOcVoltageOffset;                                              ///< [out] The maximum voltage offset that can be applied to the
+                                                                            ///< voltage/frequency curve.
+    double maxOcVoltage;                                                    ///< [out] The maximum overclock voltage that hardware supports.
+    ze_bool_t isTjMaxSupported;                                             ///< [out] Indicates if the maximum temperature limit (TjMax) can be
+                                                                            ///< changed for this frequency domain.
+    ze_bool_t isIccMaxSupported;                                            ///< [out] Indicates if the maximum current (IccMax) can be changed for
+                                                                            ///< this frequency domain.
+    ze_bool_t isHighVoltModeCapable;                                        ///< [out] Indicates if this frequency domains supports a feature to set
+                                                                            ///< very high voltages.
+    ze_bool_t isHighVoltModeEnabled;                                        ///< [out] Indicates if very high voltages are permitted on this frequency
+                                                                            ///< domain.
+    ze_bool_t isExtendedModeSupported;                                      ///< [out] Indicates if the extended overclocking features are supported.
+                                                                            ///< If this is supported, increments are on 1 Mhz basis.
+    ze_bool_t isFixedModeSupported;                                         ///< [out] Indicates if the fixed mode is supported. In this mode, hardware
+                                                                            ///< will disable most frequency throttling and lock the frequency and
+                                                                            ///< voltage at the specified overclock values.
 
 } zes_oc_capabilities_t;
 
@@ -3383,18 +3724,18 @@ typedef struct _zes_oc_capabilities_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumFrequencyDomains(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_freq_handle_t* phFrequency                  ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_freq_handle_t* phFrequency                                          ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3416,8 +3757,8 @@ zesDeviceEnumFrequencyDomains(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyGetProperties(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    zes_freq_properties_t* pProperties              ///< [in,out] The frequency properties for the specified domain.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    zes_freq_properties_t* pProperties                                      ///< [in,out] The frequency properties for the specified domain.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3442,16 +3783,16 @@ zesFrequencyGetProperties(
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyGetAvailableClocks(
-    zes_freq_handle_t hFrequency,                   ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of frequencies.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of frequencies that are available.
-                                                    ///< if count is greater than the number of frequencies that are available,
-                                                    ///< then the driver shall update the value with the correct number of frequencies.
-    double* phFrequency                             ///< [in,out][optional][range(0, *pCount)] array of frequencies in units of
-                                                    ///< MHz and sorted from slowest to fastest.
-                                                    ///< if count is less than the number of frequencies that are available,
-                                                    ///< then the driver shall only retrieve that number of frequencies.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of frequencies.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of frequencies that are available.
+                                                                            ///< if count is greater than the number of frequencies that are available,
+                                                                            ///< then the driver shall update the value with the correct number of frequencies.
+    double* phFrequency                                                     ///< [in,out][optional][range(0, *pCount)] array of frequencies in units of
+                                                                            ///< MHz and sorted from slowest to fastest.
+                                                                            ///< if count is less than the number of frequencies that are available,
+                                                                            ///< then the driver shall only retrieve that number of frequencies.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3473,9 +3814,9 @@ zesFrequencyGetAvailableClocks(
 ///         + `nullptr == pLimits`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyGetRange(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    zes_freq_range_t* pLimits                       ///< [in,out] The range between which the hardware can operate for the
-                                                    ///< specified domain.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    zes_freq_range_t* pLimits                                               ///< [in,out] The range between which the hardware can operate for the
+                                                                            ///< specified domain.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3499,9 +3840,9 @@ zesFrequencyGetRange(
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencySetRange(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    const zes_freq_range_t* pLimits                 ///< [in] The limits between which the hardware can operate for the
-                                                    ///< specified domain.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    const zes_freq_range_t* pLimits                                         ///< [in] The limits between which the hardware can operate for the
+                                                                            ///< specified domain.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3524,8 +3865,8 @@ zesFrequencySetRange(
 ///         + `nullptr == pState`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyGetState(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    zes_freq_state_t* pState                        ///< [in,out] Frequency state for the specified domain.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    zes_freq_state_t* pState                                                ///< [in,out] Frequency state for the specified domain.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3547,9 +3888,9 @@ zesFrequencyGetState(
 ///         + `nullptr == pThrottleTime`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyGetThrottleTime(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    zes_freq_throttle_time_t* pThrottleTime         ///< [in,out] Will contain a snapshot of the throttle time counters for the
-                                                    ///< specified domain.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    zes_freq_throttle_time_t* pThrottleTime                                 ///< [in,out] Will contain a snapshot of the throttle time counters for the
+                                                                            ///< specified domain.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3558,6 +3899,7 @@ zesFrequencyGetThrottleTime(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3571,9 +3913,8 @@ zesFrequencyGetThrottleTime(
 ///         + `nullptr == pOcCapabilities`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcGetCapabilities(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    zes_oc_capabilities_t* pOcCapabilities          ///< [in,out] Pointer to the capabilities structure
-                                                    ///< ::zes_oc_capabilities_t.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    zes_oc_capabilities_t* pOcCapabilities                                  ///< [in,out] Pointer to the capabilities structure.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3583,6 +3924,7 @@ zesFrequencyOcGetCapabilities(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3595,19 +3937,20 @@ zesFrequencyOcGetCapabilities(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == pCurrentOcFrequency`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see ::zes_oc_capabilities_t.maxOcFrequency, ::zes_oc_capabilities_t.maxOcVoltage, ::zes_oc_capabilities_t.minOcVoltageOffset, ::zes_oc_capabilities_t.maxOcVoltageOffset).
-///         + Requested voltage overclock is very high but ::zes_oc_capabilities_t.isHighVoltModeEnabled is not enabled for the device.
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see the `maxOcFrequency`, `maxOcVoltage`, `minOcVoltageOffset` and `maxOcVoltageOffset` members of ::zes_oc_capabilities_t).
+///         + Requested voltage overclock is very high but the `isHighVoltModeEnabled` member of ::zes_oc_capabilities_t is not enabled for the device.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
-///         + Overclocking feature is locked on this frequency domain
+///         + Overclocking feature is locked on this frequency domain.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcGetFrequencyTarget(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    double* pCurrentOcFrequency                     ///< [out] Overclocking Frequency in MHz, if extended moded is supported,
-                                                    ///< will returned in 1 Mhz granularity, else, in multiples of 50 Mhz. This
-                                                    ///< cannot be greater than ::zes_oc_capabilities_t.maxOcFrequency.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    double* pCurrentOcFrequency                                             ///< [out] Overclocking Frequency in MHz, if extended mode is supported,
+                                                                            ///< will be returned in 1 Mhz granularity, else, in multiples of 50 Mhz.
+                                                                            ///< cannot be greater than the `maxOcFrequency` member of
+                                                                            ///< ::zes_oc_capabilities_t.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3617,6 +3960,7 @@ zesFrequencyOcGetFrequencyTarget(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3627,19 +3971,20 @@ zesFrequencyOcGetFrequencyTarget(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `nullptr == hFrequency`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see ::zes_oc_capabilities_t.maxOcFrequency, ::zes_oc_capabilities_t.maxOcVoltage, ::zes_oc_capabilities_t.minOcVoltageOffset, ::zes_oc_capabilities_t.maxOcVoltageOffset).
-///         + Requested voltage overclock is very high but ::zes_oc_capabilities_t.isHighVoltModeEnabled is not enabled for the device.
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see the `maxOcFrequency`, `maxOcVoltage`, `minOcVoltageOffset` and `maxOcVoltageOffset` members of ::zes_oc_capabilities_t).
+///         + Requested voltage overclock is very high but the `isHighVoltModeEnabled` member of ::zes_oc_capabilities_t is not enabled for the device.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
-///         + Overclocking feature is locked on this frequency domain
+///         + Overclocking feature is locked on this frequency domain.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcSetFrequencyTarget(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    double CurrentOcFrequency                       ///< [in] Overclocking Frequency in MHz, if extended moded is supported, it
-                                                    ///< could be set in 1 Mhz granularity, else, in multiples of 50 Mhz. This
-                                                    ///< cannot be greater than ::zes_oc_capabilities_t.maxOcFrequency.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    double CurrentOcFrequency                                               ///< [in] Overclocking Frequency in MHz, if extended mode is supported, it
+                                                                            ///< could be set in 1 Mhz granularity, else, in multiples of 50 Mhz. This
+                                                                            ///< cannot be greater than the `maxOcFrequency` member of
+                                                                            ///< ::zes_oc_capabilities_t.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3648,6 +3993,7 @@ zesFrequencyOcSetFrequencyTarget(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3661,22 +4007,22 @@ zesFrequencyOcSetFrequencyTarget(
 ///         + `nullptr == pCurrentVoltageTarget`
 ///         + `nullptr == pCurrentVoltageOffset`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see ::zes_oc_capabilities_t.maxOcFrequency, ::zes_oc_capabilities_t.maxOcVoltage, ::zes_oc_capabilities_t.minOcVoltageOffset, ::zes_oc_capabilities_t.maxOcVoltageOffset).
-///         + Requested voltage overclock is very high but ::zes_oc_capabilities_t.isHighVoltModeEnabled is not enabled for the device.
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see the `maxOcFrequency`, `maxOcVoltage`, `minOcVoltageOffset` and `maxOcVoltageOffset` members of ::zes_oc_capabilities_t).
+///         + Requested voltage overclock is very high but the `isHighVoltModeEnabled` member of ::zes_oc_capabilities_t is not enabled for the device.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
-///         + Overclocking feature is locked on this frequency domain
+///         + Overclocking feature is locked on this frequency domain.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcGetVoltageTarget(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    double* pCurrentVoltageTarget,                  ///< [out] Overclock voltage in Volts. This cannot be greater than
-                                                    ///< ::zes_oc_capabilities_t.maxOcVoltage.
-    double* pCurrentVoltageOffset                   ///< [out] This voltage offset is applied to all points on the
-                                                    ///< voltage/frequency curve, include the new overclock voltageTarget. It
-                                                    ///< can be in the range (::zes_oc_capabilities_t.minOcVoltageOffset,
-                                                    ///< ::zes_oc_capabilities_t.maxOcVoltageOffset).
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    double* pCurrentVoltageTarget,                                          ///< [out] Overclock voltage in Volts. This cannot be greater than the
+                                                                            ///< `maxOcVoltage` member of ::zes_oc_capabilities_t.
+    double* pCurrentVoltageOffset                                           ///< [out] This voltage offset is applied to all points on the
+                                                                            ///< voltage/frequency curve, including the new overclock voltageTarget.
+                                                                            ///< Valid range is between the `minOcVoltageOffset` and
+                                                                            ///< `maxOcVoltageOffset` members of ::zes_oc_capabilities_t.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3685,6 +4031,7 @@ zesFrequencyOcGetVoltageTarget(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3695,22 +4042,22 @@ zesFrequencyOcGetVoltageTarget(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `nullptr == hFrequency`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see ::zes_oc_capabilities_t.maxOcFrequency, ::zes_oc_capabilities_t.maxOcVoltage, ::zes_oc_capabilities_t.minOcVoltageOffset, ::zes_oc_capabilities_t.maxOcVoltageOffset).
-///         + Requested voltage overclock is very high but ::zes_oc_capabilities_t.isHighVoltModeEnabled is not enabled for the device.
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see the `maxOcFrequency`, `maxOcVoltage`, `minOcVoltageOffset` and `maxOcVoltageOffset` members of ::zes_oc_capabilities_t).
+///         + Requested voltage overclock is very high but the `isHighVoltModeEnabled` member of ::zes_oc_capabilities_t is not enabled for the device.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
-///         + Overclocking feature is locked on this frequency domain
+///         + Overclocking feature is locked on this frequency domain.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcSetVoltageTarget(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    double CurrentVoltageTarget,                    ///< [in] Overclock voltage in Volts. This cannot be greater than
-                                                    ///< ::zes_oc_capabilities_t.maxOcVoltage.
-    double CurrentVoltageOffset                     ///< [in] This voltage offset is applied to all points on the
-                                                    ///< voltage/frequency curve, include the new overclock voltageTarget. It
-                                                    ///< can be in the range (::zes_oc_capabilities_t.minOcVoltageOffset,
-                                                    ///< ::zes_oc_capabilities_t.maxOcVoltageOffset).
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    double CurrentVoltageTarget,                                            ///< [in] Overclock voltage in Volts. This cannot be greater than the
+                                                                            ///< `maxOcVoltage` member of ::zes_oc_capabilities_t.
+    double CurrentVoltageOffset                                             ///< [in] This voltage offset is applied to all points on the
+                                                                            ///< voltage/frequency curve, including the new overclock voltageTarget.
+                                                                            ///< Valid range is between the `minOcVoltageOffset` and
+                                                                            ///< `maxOcVoltageOffset` members of ::zes_oc_capabilities_t.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3719,6 +4066,7 @@ zesFrequencyOcSetVoltageTarget(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3731,17 +4079,17 @@ zesFrequencyOcSetVoltageTarget(
 ///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
 ///         + `::ZES_OC_MODE_FIXED < CurrentOcMode`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see ::zes_oc_capabilities_t.maxOcFrequency, ::zes_oc_capabilities_t.maxOcVoltage, ::zes_oc_capabilities_t.minOcVoltageOffset, ::zes_oc_capabilities_t.maxOcVoltageOffset).
-///         + Requested voltage overclock is very high but ::zes_oc_capabilities_t.isHighVoltModeEnabled is not enabled for the device.
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see the `maxOcFrequency`, `maxOcVoltage`, `minOcVoltageOffset` and `maxOcVoltageOffset` members of ::zes_oc_capabilities_t).
+///         + Requested voltage overclock is very high but the `isHighVoltModeEnabled` member of ::zes_oc_capabilities_t is not enabled for the device.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
-///         + Overclocking feature is locked on this frequency domain
+///         + Overclocking feature is locked on this frequency domain.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcSetMode(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    zes_oc_mode_t CurrentOcMode                     ///< [in] Current Overclocking Mode ::zes_oc_mode_t.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    zes_oc_mode_t CurrentOcMode                                             ///< [in] Current Overclocking Mode ::zes_oc_mode_t.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3750,6 +4098,7 @@ zesFrequencyOcSetMode(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3762,17 +4111,17 @@ zesFrequencyOcSetMode(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == pCurrentOcMode`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see ::zes_oc_capabilities_t.maxOcFrequency, ::zes_oc_capabilities_t.maxOcVoltage, ::zes_oc_capabilities_t.minOcVoltageOffset, ::zes_oc_capabilities_t.maxOcVoltageOffset).
-///         + Requested voltage overclock is very high but ::zes_oc_capabilities_t.isHighVoltModeEnabled is not enabled for the device.
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The specified voltage and/or frequency overclock settings exceed the hardware values (see the `maxOcFrequency`, `maxOcVoltage`, `minOcVoltageOffset` and `maxOcVoltageOffset` members of ::zes_oc_capabilities_t).
+///         + Requested voltage overclock is very high but the `isHighVoltModeEnabled` member of ::zes_oc_capabilities_t is not enabled for the device.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
-///         + Overclocking feature is locked on this frequency domain
+///         + Overclocking feature is locked on this frequency domain.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcGetMode(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    zes_oc_mode_t* pCurrentOcMode                   ///< [out] Current Overclocking Mode ::zes_oc_mode_t.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    zes_oc_mode_t* pCurrentOcMode                                           ///< [out] Current Overclocking Mode ::zes_oc_mode_t.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3781,6 +4130,7 @@ zesFrequencyOcGetMode(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3793,13 +4143,13 @@ zesFrequencyOcGetMode(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == pOcIccMax`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + Capability ::zes_oc_capabilities_t.isIccMaxSupported is false for this frequency domain
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The `isIccMaxSupported` member of ::zes_oc_capabilities_t is false for this frequency domain.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcGetIccMax(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    double* pOcIccMax                               ///< [in,out] Will contain the maximum current limit in Amperes on
-                                                    ///< successful return.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    double* pOcIccMax                                                       ///< [in,out] Will contain the maximum current limit in Amperes on
+                                                                            ///< successful return.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3809,6 +4159,7 @@ zesFrequencyOcGetIccMax(
 ///     - Setting ocIccMax to 0.0 will return the value to the factory default.
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3819,18 +4170,18 @@ zesFrequencyOcGetIccMax(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `nullptr == hFrequency`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + Capability ::zes_oc_capabilities_t.isIccMaxSupported is false for this frequency domain
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The `isIccMaxSupported` member of ::zes_oc_capabilities_t is false for this frequency domain.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
-///         + Overclocking feature is locked on this frequency domain
+///         + Overclocking feature is locked on this frequency domain.
 ///     - ::ZE_RESULT_ERROR_INVALID_ARGUMENT
-///         + The specified current limit is too low or too high
+///         + The specified current limit is too low or too high.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcSetIccMax(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    double ocIccMax                                 ///< [in] The new maximum current limit in Amperes.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    double ocIccMax                                                         ///< [in] The new maximum current limit in Amperes.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3839,6 +4190,7 @@ zesFrequencyOcSetIccMax(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3851,12 +4203,12 @@ zesFrequencyOcSetIccMax(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == pOcTjMax`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcGetTjMax(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    double* pOcTjMax                                ///< [in,out] Will contain the maximum temperature limit in degrees Celsius
-                                                    ///< on successful return.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    double* pOcTjMax                                                        ///< [in,out] Will contain the maximum temperature limit in degrees Celsius
+                                                                            ///< on successful return.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3866,6 +4218,7 @@ zesFrequencyOcGetTjMax(
 ///     - Setting ocTjMax to 0.0 will return the value to the factory default.
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -3876,18 +4229,18 @@ zesFrequencyOcGetTjMax(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `nullptr == hFrequency`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Overclocking is not supported on this frequency domain (::zes_oc_capabilities_t.isOcSupported)
-///         + Capability ::zes_oc_capabilities_t.isTjMaxSupported is false for this frequency domain
+///         + Overclocking is not supported on this frequency domain (see the `isOcSupported` member of ::zes_oc_capabilities_t).
+///         + The `isTjMaxSupported` member of ::zes_oc_capabilities_t is false for this frequency domain.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
-///         + Overclocking feature is locked on this frequency domain
+///         + Overclocking feature is locked on this frequency domain.
 ///     - ::ZE_RESULT_ERROR_INVALID_ARGUMENT
-///         + The specified temperature limit is too high
+///         + The specified temperature limit is too high.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesFrequencyOcSetTjMax(
-    zes_freq_handle_t hFrequency,                   ///< [in] Handle for the component.
-    double ocTjMax                                  ///< [in] The new maximum temperature limit in degrees Celsius.
+    zes_freq_handle_t hFrequency,                                           ///< [in] Handle for the component.
+    double ocTjMax                                                          ///< [in] The new maximum temperature limit in degrees Celsius.
     );
 
 #if !defined(__GNUC__)
@@ -3901,15 +4254,15 @@ zesFrequencyOcSetTjMax(
 /// @brief LED properties
 typedef struct _zes_led_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t onSubdevice;                          ///< [out] True if the resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    ze_bool_t canControl;                           ///< [out] Indicates if software can control the LED assuming the user has
-                                                    ///< permissions
-    ze_bool_t haveRGB;                              ///< [out] Indicates if the LED is RGB capable
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    ze_bool_t canControl;                                                   ///< [out] Indicates if software can control the LED assuming the user has
+                                                                            ///< permissions
+    ze_bool_t haveRGB;                                                      ///< [out] Indicates if the LED is RGB capable
 
 } zes_led_properties_t;
 
@@ -3917,12 +4270,12 @@ typedef struct _zes_led_properties_t
 /// @brief LED color
 typedef struct _zes_led_color_t
 {
-    double red;                                     ///< [in,out][range(0.0, 1.0)] The LED red value. On output, a value less
-                                                    ///< than 0.0 indicates that the color is not known.
-    double green;                                   ///< [in,out][range(0.0, 1.0)] The LED green value. On output, a value less
-                                                    ///< than 0.0 indicates that the color is not known.
-    double blue;                                    ///< [in,out][range(0.0, 1.0)] The LED blue value. On output, a value less
-                                                    ///< than 0.0 indicates that the color is not known.
+    double red;                                                             ///< [in,out][range(0.0, 1.0)] The LED red value. On output, a value less
+                                                                            ///< than 0.0 indicates that the color is not known.
+    double green;                                                           ///< [in,out][range(0.0, 1.0)] The LED green value. On output, a value less
+                                                                            ///< than 0.0 indicates that the color is not known.
+    double blue;                                                            ///< [in,out][range(0.0, 1.0)] The LED blue value. On output, a value less
+                                                                            ///< than 0.0 indicates that the color is not known.
 
 } zes_led_color_t;
 
@@ -3930,11 +4283,11 @@ typedef struct _zes_led_color_t
 /// @brief LED state
 typedef struct _zes_led_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t isOn;                                 ///< [out] Indicates if the LED is on or off
-    zes_led_color_t color;                          ///< [out] Color of the LED
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t isOn;                                                         ///< [out] Indicates if the LED is on or off
+    zes_led_color_t color;                                                  ///< [out] Color of the LED
 
 } zes_led_state_t;
 
@@ -3957,18 +4310,18 @@ typedef struct _zes_led_state_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumLeds(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_led_handle_t* phLed                         ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_led_handle_t* phLed                                                 ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -3990,8 +4343,8 @@ zesDeviceEnumLeds(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesLedGetProperties(
-    zes_led_handle_t hLed,                          ///< [in] Handle for the component.
-    zes_led_properties_t* pProperties               ///< [in,out] Will contain the properties of the LED.
+    zes_led_handle_t hLed,                                                  ///< [in] Handle for the component.
+    zes_led_properties_t* pProperties                                       ///< [in,out] Will contain the properties of the LED.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4013,8 +4366,8 @@ zesLedGetProperties(
 ///         + `nullptr == pState`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesLedGetState(
-    zes_led_handle_t hLed,                          ///< [in] Handle for the component.
-    zes_led_state_t* pState                         ///< [in,out] Will contain the current state of the LED.
+    zes_led_handle_t hLed,                                                  ///< [in] Handle for the component.
+    zes_led_state_t* pState                                                 ///< [in,out] Will contain the current state of the LED.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4036,8 +4389,8 @@ zesLedGetState(
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesLedSetState(
-    zes_led_handle_t hLed,                          ///< [in] Handle for the component.
-    ze_bool_t enable                                ///< [in] Set to TRUE to turn the LED on, FALSE to turn off.
+    zes_led_handle_t hLed,                                                  ///< [in] Handle for the component.
+    ze_bool_t enable                                                        ///< [in] Set to TRUE to turn the LED on, FALSE to turn off.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4060,11 +4413,11 @@ zesLedSetState(
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to make these modifications.
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + This LED doesn't not support color changes. See ::zes_led_properties_t.haveRGB.
+///         + This LED does not support color changes. See the `haveRGB` member of ::zes_led_properties_t.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesLedSetColor(
-    zes_led_handle_t hLed,                          ///< [in] Handle for the component.
-    const zes_led_color_t* pColor                   ///< [in] New color of the LED.
+    zes_led_handle_t hLed,                                                  ///< [in] Handle for the component.
+    const zes_led_color_t* pColor                                           ///< [in] New color of the LED.
     );
 
 #if !defined(__GNUC__)
@@ -4078,26 +4431,26 @@ zesLedSetColor(
 /// @brief Memory module types
 typedef enum _zes_mem_type_t
 {
-    ZES_MEM_TYPE_HBM = 0,                           ///< HBM memory
-    ZES_MEM_TYPE_DDR = 1,                           ///< DDR memory
-    ZES_MEM_TYPE_DDR3 = 2,                          ///< DDR3 memory
-    ZES_MEM_TYPE_DDR4 = 3,                          ///< DDR4 memory
-    ZES_MEM_TYPE_DDR5 = 4,                          ///< DDR5 memory
-    ZES_MEM_TYPE_LPDDR = 5,                         ///< LPDDR memory
-    ZES_MEM_TYPE_LPDDR3 = 6,                        ///< LPDDR3 memory
-    ZES_MEM_TYPE_LPDDR4 = 7,                        ///< LPDDR4 memory
-    ZES_MEM_TYPE_LPDDR5 = 8,                        ///< LPDDR5 memory
-    ZES_MEM_TYPE_SRAM = 9,                          ///< SRAM memory
-    ZES_MEM_TYPE_L1 = 10,                           ///< L1 cache
-    ZES_MEM_TYPE_L3 = 11,                           ///< L3 cache
-    ZES_MEM_TYPE_GRF = 12,                          ///< Execution unit register file
-    ZES_MEM_TYPE_SLM = 13,                          ///< Execution unit shared local memory
-    ZES_MEM_TYPE_GDDR4 = 14,                        ///< GDDR4 memory
-    ZES_MEM_TYPE_GDDR5 = 15,                        ///< GDDR5 memory
-    ZES_MEM_TYPE_GDDR5X = 16,                       ///< GDDR5X memory
-    ZES_MEM_TYPE_GDDR6 = 17,                        ///< GDDR6 memory
-    ZES_MEM_TYPE_GDDR6X = 18,                       ///< GDDR6X memory
-    ZES_MEM_TYPE_GDDR7 = 19,                        ///< GDDR7 memory
+    ZES_MEM_TYPE_HBM = 0,                                                   ///< HBM memory
+    ZES_MEM_TYPE_DDR = 1,                                                   ///< DDR memory
+    ZES_MEM_TYPE_DDR3 = 2,                                                  ///< DDR3 memory
+    ZES_MEM_TYPE_DDR4 = 3,                                                  ///< DDR4 memory
+    ZES_MEM_TYPE_DDR5 = 4,                                                  ///< DDR5 memory
+    ZES_MEM_TYPE_LPDDR = 5,                                                 ///< LPDDR memory
+    ZES_MEM_TYPE_LPDDR3 = 6,                                                ///< LPDDR3 memory
+    ZES_MEM_TYPE_LPDDR4 = 7,                                                ///< LPDDR4 memory
+    ZES_MEM_TYPE_LPDDR5 = 8,                                                ///< LPDDR5 memory
+    ZES_MEM_TYPE_SRAM = 9,                                                  ///< SRAM memory
+    ZES_MEM_TYPE_L1 = 10,                                                   ///< L1 cache
+    ZES_MEM_TYPE_L3 = 11,                                                   ///< L3 cache
+    ZES_MEM_TYPE_GRF = 12,                                                  ///< Execution unit register file
+    ZES_MEM_TYPE_SLM = 13,                                                  ///< Execution unit shared local memory
+    ZES_MEM_TYPE_GDDR4 = 14,                                                ///< GDDR4 memory
+    ZES_MEM_TYPE_GDDR5 = 15,                                                ///< GDDR5 memory
+    ZES_MEM_TYPE_GDDR5X = 16,                                               ///< GDDR5X memory
+    ZES_MEM_TYPE_GDDR6 = 17,                                                ///< GDDR6 memory
+    ZES_MEM_TYPE_GDDR6X = 18,                                               ///< GDDR6X memory
+    ZES_MEM_TYPE_GDDR7 = 19,                                                ///< GDDR7 memory
     ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } zes_mem_type_t;
@@ -4106,8 +4459,8 @@ typedef enum _zes_mem_type_t
 /// @brief Memory module location
 typedef enum _zes_mem_loc_t
 {
-    ZES_MEM_LOC_SYSTEM = 0,                         ///< System memory
-    ZES_MEM_LOC_DEVICE = 1,                         ///< On board local device memory
+    ZES_MEM_LOC_SYSTEM = 0,                                                 ///< System memory
+    ZES_MEM_LOC_DEVICE = 1,                                                 ///< On board local device memory
     ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
 
 } zes_mem_loc_t;
@@ -4116,13 +4469,13 @@ typedef enum _zes_mem_loc_t
 /// @brief Memory health
 typedef enum _zes_mem_health_t
 {
-    ZES_MEM_HEALTH_UNKNOWN = 0,                     ///< The memory health cannot be determined.
-    ZES_MEM_HEALTH_OK = 1,                          ///< All memory channels are healthy.
-    ZES_MEM_HEALTH_DEGRADED = 2,                    ///< Excessive correctable errors have been detected on one or more
-                                                    ///< channels. Device should be reset.
-    ZES_MEM_HEALTH_CRITICAL = 3,                    ///< Operating with reduced memory to cover banks with too many
-                                                    ///< uncorrectable errors.
-    ZES_MEM_HEALTH_REPLACE = 4,                     ///< Device should be replaced due to excessive uncorrectable errors.
+    ZES_MEM_HEALTH_UNKNOWN = 0,                                             ///< The memory health cannot be determined.
+    ZES_MEM_HEALTH_OK = 1,                                                  ///< All memory channels are healthy.
+    ZES_MEM_HEALTH_DEGRADED = 2,                                            ///< Excessive correctable errors have been detected on one or more
+                                                                            ///< channels. Device should be reset.
+    ZES_MEM_HEALTH_CRITICAL = 3,                                            ///< Operating with reduced memory to cover banks with too many
+                                                                            ///< uncorrectable errors.
+    ZES_MEM_HEALTH_REPLACE = 4,                                             ///< Device should be replaced due to excessive uncorrectable errors.
     ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
 
 } zes_mem_health_t;
@@ -4131,21 +4484,21 @@ typedef enum _zes_mem_health_t
 /// @brief Memory properties
 typedef struct _zes_mem_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_mem_type_t type;                            ///< [out] The memory type
-    ze_bool_t onSubdevice;                          ///< [out] True if this resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    zes_mem_loc_t location;                         ///< [out] Location of this memory (system, device)
-    uint64_t physicalSize;                          ///< [out] Physical memory size in bytes. A value of 0 indicates that this
-                                                    ///< property is not known. However, a call to ::zesMemoryGetState() will
-                                                    ///< correctly return the total size of usable memory.
-    int32_t busWidth;                               ///< [out] Width of the memory bus. A value of -1 means that this property
-                                                    ///< is unknown.
-    int32_t numChannels;                            ///< [out] The number of memory channels. A value of -1 means that this
-                                                    ///< property is unknown.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_mem_type_t type;                                                    ///< [out] The memory type
+    ze_bool_t onSubdevice;                                                  ///< [out] True if this resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    zes_mem_loc_t location;                                                 ///< [out] Location of this memory (system, device)
+    uint64_t physicalSize;                                                  ///< [out] Physical memory size in bytes. A value of 0 indicates that this
+                                                                            ///< property is not known. However, a call to ::zesMemoryGetState() will
+                                                                            ///< correctly return the total size of usable memory.
+    int32_t busWidth;                                                       ///< [out] Width of the memory bus. A value of -1 means that this property
+                                                                            ///< is unknown.
+    int32_t numChannels;                                                    ///< [out] The number of memory channels. A value of -1 means that this
+                                                                            ///< property is unknown.
 
 } zes_mem_properties_t;
 
@@ -4157,13 +4510,13 @@ typedef struct _zes_mem_properties_t
 ///     - Percent free is given by 100 * free / size.
 typedef struct _zes_mem_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_mem_health_t health;                        ///< [out] Indicates the health of the memory
-    uint64_t free;                                  ///< [out] The free memory in bytes
-    uint64_t size;                                  ///< [out] The total allocatable memory in bytes (can be less than
-                                                    ///< ::zes_mem_properties_t.physicalSize)
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_mem_health_t health;                                                ///< [out] Indicates the health of the memory
+    uint64_t free;                                                          ///< [out] The free memory in bytes
+    uint64_t size;                                                          ///< [out] The total allocatable memory in bytes (can be less than the
+                                                                            ///< `physicalSize` member of ::zes_mem_properties_t)
 
 } zes_mem_state_t;
 
@@ -4175,21 +4528,39 @@ typedef struct _zes_mem_state_t
 ///       using the equation: %bw = 10^6 * ((s2.readCounter - s1.readCounter) +
 ///       (s2.writeCounter - s1.writeCounter)) / (s2.maxBandwidth *
 ///       (s2.timestamp - s1.timestamp))
+///     - Counter can roll over and rollover needs to be handled by comparing
+///       the current read against the previous read
+///     - Counter is a 32-byte transaction count, which means the calculated
+///       delta (delta = current_value - previous_value or delta = 2^32 -
+///       previous_value + current_value in case of rollover) needs to be
+///       multiplied by 32 to get delta between samples in actual byte count
 typedef struct _zes_mem_bandwidth_t
 {
-    uint64_t readCounter;                           ///< [out] Total bytes read from memory
-    uint64_t writeCounter;                          ///< [out] Total bytes written to memory
-    uint64_t maxBandwidth;                          ///< [out] Current maximum bandwidth in units of bytes/sec
-    uint64_t timestamp;                             ///< [out] The timestamp in microseconds when these measurements were sampled.
-                                                    ///< This timestamp should only be used to calculate delta time between
-                                                    ///< snapshots of this structure.
-                                                    ///< Never take the delta of this timestamp with the timestamp from a
-                                                    ///< different structure since they are not guaranteed to have the same base.
-                                                    ///< The absolute value of the timestamp is only valid during within the
-                                                    ///< application and may be different on the next execution.
+    uint64_t readCounter;                                                   ///< [out] Total bytes read from memory
+    uint64_t writeCounter;                                                  ///< [out] Total bytes written to memory
+    uint64_t maxBandwidth;                                                  ///< [out] Current maximum bandwidth in units of bytes/sec
+    uint64_t timestamp;                                                     ///< [out] The timestamp in microseconds when these measurements were sampled.
+                                                                            ///< This timestamp should only be used to calculate delta time between
+                                                                            ///< snapshots of this structure.
+                                                                            ///< Never take the delta of this timestamp with the timestamp from a
+                                                                            ///< different structure since they are not guaranteed to have the same base.
+                                                                            ///< The absolute value of the timestamp is only valid within the
+                                                                            ///< application and may be different on the next execution.
 
 } zes_mem_bandwidth_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extension properties for Memory bandwidth
+/// 
+/// @details
+///     - Number of counter bits
+///     - [DEPRECATED] No longer supported.
+typedef struct _zes_mem_ext_bandwidth_t
+{
+    uint32_t memoryTimestampValidBits;                                      ///< [out] Returns the number of valid bits in the timestamp values
+
+} zes_mem_ext_bandwidth_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Get handle of memory modules
 /// 
@@ -4209,18 +4580,18 @@ typedef struct _zes_mem_bandwidth_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumMemoryModules(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_mem_handle_t* phMemory                      ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_mem_handle_t* phMemory                                              ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4242,8 +4613,8 @@ zesDeviceEnumMemoryModules(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesMemoryGetProperties(
-    zes_mem_handle_t hMemory,                       ///< [in] Handle for the component.
-    zes_mem_properties_t* pProperties               ///< [in,out] Will contain memory properties.
+    zes_mem_handle_t hMemory,                                               ///< [in] Handle for the component.
+    zes_mem_properties_t* pProperties                                       ///< [in,out] Will contain memory properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4265,8 +4636,8 @@ zesMemoryGetProperties(
 ///         + `nullptr == pState`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesMemoryGetState(
-    zes_mem_handle_t hMemory,                       ///< [in] Handle for the component.
-    zes_mem_state_t* pState                         ///< [in,out] Will contain the current health and allocated memory.
+    zes_mem_handle_t hMemory,                                               ///< [in] Handle for the component.
+    zes_mem_state_t* pState                                                 ///< [in,out] Will contain the current health and allocated memory.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4290,9 +4661,9 @@ zesMemoryGetState(
 ///         + User does not have permissions to query this telemetry.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesMemoryGetBandwidth(
-    zes_mem_handle_t hMemory,                       ///< [in] Handle for the component.
-    zes_mem_bandwidth_t* pBandwidth                 ///< [in,out] Will contain the total number of bytes read from and written
-                                                    ///< to memory, as well as the current maximum bandwidth.
+    zes_mem_handle_t hMemory,                                               ///< [in] Handle for the component.
+    zes_mem_bandwidth_t* pBandwidth                                         ///< [in,out] Will contain the total number of bytes read from and written
+                                                                            ///< to memory, as well as the current maximum bandwidth.
     );
 
 #if !defined(__GNUC__)
@@ -4306,14 +4677,14 @@ zesMemoryGetBandwidth(
 /// @brief Static information about a Performance Factor domain
 typedef struct _zes_perf_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t onSubdevice;                          ///< [out] True if this Performance Factor affects accelerators located on
-                                                    ///< a sub-device
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    zes_engine_type_flags_t engines;                ///< [out] Bitfield of accelerator engine types that are affected by this
-                                                    ///< Performance Factor.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t onSubdevice;                                                  ///< [out] True if this Performance Factor affects accelerators located on
+                                                                            ///< a sub-device
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    zes_engine_type_flags_t engines;                                        ///< [out] Bitfield of accelerator engine types that are affected by this
+                                                                            ///< Performance Factor.
 
 } zes_perf_properties_t;
 
@@ -4338,18 +4709,18 @@ typedef struct _zes_perf_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumPerformanceFactorDomains(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_perf_handle_t* phPerf                       ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_perf_handle_t* phPerf                                               ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4371,9 +4742,9 @@ zesDeviceEnumPerformanceFactorDomains(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPerformanceFactorGetProperties(
-    zes_perf_handle_t hPerf,                        ///< [in] Handle for the Performance Factor domain.
-    zes_perf_properties_t* pProperties              ///< [in,out] Will contain information about the specified Performance
-                                                    ///< Factor domain.
+    zes_perf_handle_t hPerf,                                                ///< [in] Handle for the Performance Factor domain.
+    zes_perf_properties_t* pProperties                                      ///< [in,out] Will contain information about the specified Performance
+                                                                            ///< Factor domain.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4395,9 +4766,9 @@ zesPerformanceFactorGetProperties(
 ///         + `nullptr == pFactor`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPerformanceFactorGetConfig(
-    zes_perf_handle_t hPerf,                        ///< [in] Handle for the Performance Factor domain.
-    double* pFactor                                 ///< [in,out] Will contain the actual Performance Factor being used by the
-                                                    ///< hardware (may not be the same as the requested Performance Factor).
+    zes_perf_handle_t hPerf,                                                ///< [in] Handle for the Performance Factor domain.
+    double* pFactor                                                         ///< [in,out] Will contain the actual Performance Factor being used by the
+                                                                            ///< hardware (may not be the same as the requested Performance Factor).
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4422,8 +4793,8 @@ zesPerformanceFactorGetConfig(
 ///         + `nullptr == hPerf`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPerformanceFactorSetConfig(
-    zes_perf_handle_t hPerf,                        ///< [in] Handle for the Performance Factor domain.
-    double factor                                   ///< [in] The new Performance Factor.
+    zes_perf_handle_t hPerf,                                                ///< [in] Handle for the Performance Factor domain.
+    double factor                                                           ///< [in] The new Performance Factor.
     );
 
 #if !defined(__GNUC__)
@@ -4437,10 +4808,12 @@ zesPerformanceFactorSetConfig(
 /// @brief Power Domain
 typedef enum _zes_power_domain_t
 {
-    ZES_POWER_DOMAIN_UNKNOWN = 0,                   ///< The PUnit power domain level cannot be determined.
-    ZES_POWER_DOMAIN_CARD = 1,                      ///< The PUnit power domain is a card-level power domain.
-    ZES_POWER_DOMAIN_PACKAGE = 2,                   ///< The PUnit power domain is a package-level power domain.
-    ZES_POWER_DOMAIN_STACK = 3,                     ///< The PUnit power domain is a stack-level power domain.
+    ZES_POWER_DOMAIN_UNKNOWN = 0,                                           ///< The PUnit power domain level cannot be determined.
+    ZES_POWER_DOMAIN_CARD = 1,                                              ///< The PUnit power domain is a card-level power domain.
+    ZES_POWER_DOMAIN_PACKAGE = 2,                                           ///< The PUnit power domain is a package-level power domain.
+    ZES_POWER_DOMAIN_STACK = 3,                                             ///< The PUnit power domain is a stack-level power domain.
+    ZES_POWER_DOMAIN_MEMORY = 4,                                            ///< The PUnit power domain is a memory-level power domain.
+    ZES_POWER_DOMAIN_GPU = 5,                                               ///< The PUnit power domain is a GPU-level power domain.
     ZES_POWER_DOMAIN_FORCE_UINT32 = 0x7fffffff
 
 } zes_power_domain_t;
@@ -4449,18 +4822,18 @@ typedef enum _zes_power_domain_t
 /// @brief Power Level Type
 typedef enum _zes_power_level_t
 {
-    ZES_POWER_LEVEL_UNKNOWN = 0,                    ///< The PUnit power monitoring duration cannot be determined.
-    ZES_POWER_LEVEL_SUSTAINED = 1,                  ///< The PUnit determines effective power draw by computing a moving
-                                                    ///< average of the actual power draw over a time interval (longer than
-                                                    ///< BURST).
-    ZES_POWER_LEVEL_BURST = 2,                      ///< The PUnit determines effective power draw by computing a moving
-                                                    ///< average of the actual power draw over a time interval (longer than
-                                                    ///< PEAK).
-    ZES_POWER_LEVEL_PEAK = 3,                       ///< The PUnit determines effective power draw by computing a moving
-                                                    ///< average of the actual power draw over a very short time interval.
-    ZES_POWER_LEVEL_INSTANTANEOUS = 4,              ///< The PUnit predicts effective power draw using the current device
-                                                    ///< configuration (frequency, voltage, etc...) & throttles proactively to
-                                                    ///< stay within the specified limit.
+    ZES_POWER_LEVEL_UNKNOWN = 0,                                            ///< The PUnit power monitoring duration cannot be determined.
+    ZES_POWER_LEVEL_SUSTAINED = 1,                                          ///< The PUnit determines effective power draw by computing a moving
+                                                                            ///< average of the actual power draw over a time interval (longer than
+                                                                            ///< BURST).
+    ZES_POWER_LEVEL_BURST = 2,                                              ///< The PUnit determines effective power draw by computing a moving
+                                                                            ///< average of the actual power draw over a time interval (longer than
+                                                                            ///< PEAK).
+    ZES_POWER_LEVEL_PEAK = 3,                                               ///< The PUnit determines effective power draw by computing a moving
+                                                                            ///< average of the actual power draw over a very short time interval.
+    ZES_POWER_LEVEL_INSTANTANEOUS = 4,                                      ///< The PUnit predicts effective power draw using the current device
+                                                                            ///< configuration (frequency, voltage, etc...) & throttles proactively to
+                                                                            ///< stay within the specified limit.
     ZES_POWER_LEVEL_FORCE_UINT32 = 0x7fffffff
 
 } zes_power_level_t;
@@ -4469,10 +4842,10 @@ typedef enum _zes_power_level_t
 /// @brief Power Source Type
 typedef enum _zes_power_source_t
 {
-    ZES_POWER_SOURCE_ANY = 0,                       ///< Limit active no matter whether the power source is mains powered or
-                                                    ///< battery powered.
-    ZES_POWER_SOURCE_MAINS = 1,                     ///< Limit active only when the device is mains powered.
-    ZES_POWER_SOURCE_BATTERY = 2,                   ///< Limit active only when the device is battery powered.
+    ZES_POWER_SOURCE_ANY = 0,                                               ///< Limit active no matter whether the power source is mains powered or
+                                                                            ///< battery powered.
+    ZES_POWER_SOURCE_MAINS = 1,                                             ///< Limit active only when the device is mains powered.
+    ZES_POWER_SOURCE_BATTERY = 2,                                           ///< Limit active only when the device is battery powered.
     ZES_POWER_SOURCE_FORCE_UINT32 = 0x7fffffff
 
 } zes_power_source_t;
@@ -4481,9 +4854,9 @@ typedef enum _zes_power_source_t
 /// @brief Limit Unit
 typedef enum _zes_limit_unit_t
 {
-    ZES_LIMIT_UNIT_UNKNOWN = 0,                     ///< The PUnit power monitoring unit cannot be determined.
-    ZES_LIMIT_UNIT_CURRENT = 1,                     ///< The limit is specified in milliamperes of current drawn.
-    ZES_LIMIT_UNIT_POWER = 2,                       ///< The limit is specified in milliwatts of power generated.
+    ZES_LIMIT_UNIT_UNKNOWN = 0,                                             ///< The PUnit power monitoring unit cannot be determined.
+    ZES_LIMIT_UNIT_CURRENT = 1,                                             ///< The limit is specified in milliamperes of current drawn.
+    ZES_LIMIT_UNIT_POWER = 2,                                               ///< The limit is specified in milliwatts of power generated.
     ZES_LIMIT_UNIT_FORCE_UINT32 = 0x7fffffff
 
 } zes_limit_unit_t;
@@ -4492,22 +4865,22 @@ typedef enum _zes_limit_unit_t
 /// @brief Properties related to device power settings
 typedef struct _zes_power_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t onSubdevice;                          ///< [out] True if this resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    ze_bool_t canControl;                           ///< [out] Software can change the power limits of this domain assuming the
-                                                    ///< user has permissions.
-    ze_bool_t isEnergyThresholdSupported;           ///< [out] Indicates if this power domain supports the energy threshold
-                                                    ///< event (::ZES_EVENT_TYPE_FLAG_ENERGY_THRESHOLD_CROSSED).
-    int32_t defaultLimit;                           ///< [out] (Deprecated) The factory default TDP power limit of the part in
-                                                    ///< milliwatts. A value of -1 means that this is not known.
-    int32_t minLimit;                               ///< [out] (Deprecated) The minimum power limit in milliwatts that can be
-                                                    ///< requested. A value of -1 means that this is not known.
-    int32_t maxLimit;                               ///< [out] (Deprecated) The maximum power limit in milliwatts that can be
-                                                    ///< requested. A value of -1 means that this is not known.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t onSubdevice;                                                  ///< [out] True if this resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    ze_bool_t canControl;                                                   ///< [out] Software can change the power limits of this domain assuming the
+                                                                            ///< user has permissions.
+    ze_bool_t isEnergyThresholdSupported;                                   ///< [out] Indicates if this power domain supports the energy threshold
+                                                                            ///< event (::ZES_EVENT_TYPE_FLAG_ENERGY_THRESHOLD_CROSSED).
+    int32_t defaultLimit;                                                   ///< [out] (Deprecated) The factory default TDP power limit of the part in
+                                                                            ///< milliwatts. A value of -1 means that this is not known.
+    int32_t minLimit;                                                       ///< [out] (Deprecated) The minimum power limit in milliwatts that can be
+                                                                            ///< requested. A value of -1 means that this is not known.
+    int32_t maxLimit;                                                       ///< [out] (Deprecated) The maximum power limit in milliwatts that can be
+                                                                            ///< requested. A value of -1 means that this is not known.
 
 } zes_power_properties_t;
 
@@ -4520,14 +4893,14 @@ typedef struct _zes_power_properties_t
 ///       s1.timestamp)
 typedef struct _zes_power_energy_counter_t
 {
-    uint64_t energy;                                ///< [out] The monotonic energy counter in microjoules.
-    uint64_t timestamp;                             ///< [out] Microsecond timestamp when energy was captured.
-                                                    ///< This timestamp should only be used to calculate delta time between
-                                                    ///< snapshots of this structure.
-                                                    ///< Never take the delta of this timestamp with the timestamp from a
-                                                    ///< different structure since they are not guaranteed to have the same base.
-                                                    ///< The absolute value of the timestamp is only valid during within the
-                                                    ///< application and may be different on the next execution.
+    uint64_t energy;                                                        ///< [out] The monotonic energy counter in microjoules.
+    uint64_t timestamp;                                                     ///< [out] Microsecond timestamp when energy was captured.
+                                                                            ///< This timestamp should only be used to calculate delta time between
+                                                                            ///< snapshots of this structure.
+                                                                            ///< Never take the delta of this timestamp with the timestamp from a
+                                                                            ///< different structure since they are not guaranteed to have the same base.
+                                                                            ///< The absolute value of the timestamp is only valid within the
+                                                                            ///< application and may be different on the next execution.
 
 } zes_power_energy_counter_t;
 
@@ -4538,11 +4911,12 @@ typedef struct _zes_power_energy_counter_t
 ///     - The power controller (Punit) will throttle the operating frequency if
 ///       the power averaged over a window (typically seconds) exceeds this
 ///       limit.
+///     - [DEPRECATED] No longer supported.
 typedef struct _zes_power_sustained_limit_t
 {
-    ze_bool_t enabled;                              ///< [in,out] indicates if the limit is enabled (true) or ignored (false)
-    int32_t power;                                  ///< [in,out] power limit in milliwatts
-    int32_t interval;                               ///< [in,out] power averaging window (Tau) in milliseconds
+    ze_bool_t enabled;                                                      ///< [in,out] indicates if the limit is enabled (true) or ignored (false)
+    int32_t power;                                                          ///< [in,out] power limit in milliwatts
+    int32_t interval;                                                       ///< [in,out] power averaging window (Tau) in milliseconds
 
 } zes_power_sustained_limit_t;
 
@@ -4555,10 +4929,11 @@ typedef struct _zes_power_sustained_limit_t
 ///       limit known as PL2. Typically PL2 > PL1 so that it permits the
 ///       frequency to burst higher for short periods than would be otherwise
 ///       permitted by PL1.
+///     - [DEPRECATED] No longer supported.
 typedef struct _zes_power_burst_limit_t
 {
-    ze_bool_t enabled;                              ///< [in,out] indicates if the limit is enabled (true) or ignored (false)
-    int32_t power;                                  ///< [in,out] power limit in milliwatts
+    ze_bool_t enabled;                                                      ///< [in,out] indicates if the limit is enabled (true) or ignored (false)
+    int32_t power;                                                          ///< [in,out] power limit in milliwatts
 
 } zes_power_burst_limit_t;
 
@@ -4575,12 +4950,13 @@ typedef struct _zes_power_burst_limit_t
 ///       power controller will throttle the device frequencies down to min. It
 ///       is thus better to tune the PL4 value in order to avoid such
 ///       excursions.
+///     - [DEPRECATED] No longer supported.
 typedef struct _zes_power_peak_limit_t
 {
-    int32_t powerAC;                                ///< [in,out] power limit in milliwatts for the AC power source.
-    int32_t powerDC;                                ///< [in,out] power limit in milliwatts for the DC power source. On input,
-                                                    ///< this is ignored if the product does not have a battery. On output,
-                                                    ///< this will be -1 if the product does not have a battery.
+    int32_t powerAC;                                                        ///< [in,out] power limit in milliwatts for the AC power source.
+    int32_t powerDC;                                                        ///< [in,out] power limit in milliwatts for the DC power source. On input,
+                                                                            ///< this is ignored if the product does not have a battery. On output,
+                                                                            ///< this will be -1 if the product does not have a battery.
 
 } zes_power_peak_limit_t;
 
@@ -4591,11 +4967,11 @@ typedef struct _zes_power_peak_limit_t
 ///     - .
 typedef struct _zes_energy_threshold_t
 {
-    ze_bool_t enable;                               ///< [in,out] Indicates if the energy threshold is enabled.
-    double threshold;                               ///< [in,out] The energy threshold in Joules. Will be 0.0 if no threshold
-                                                    ///< has been set.
-    uint32_t processId;                             ///< [in,out] The host process ID that set the energy threshold. Will be
-                                                    ///< 0xFFFFFFFF if no threshold has been set.
+    ze_bool_t enable;                                                       ///< [in,out] Indicates if the energy threshold is enabled.
+    double threshold;                                                       ///< [in,out] The energy threshold in Joules. Will be 0.0 if no threshold
+                                                                            ///< has been set.
+    uint32_t processId;                                                     ///< [in,out] The host process ID that set the energy threshold. Will be
+                                                                            ///< 0xFFFFFFFF if no threshold has been set.
 
 } zes_energy_threshold_t;
 
@@ -4618,18 +4994,18 @@ typedef struct _zes_energy_threshold_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumPowerDomains(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_pwr_handle_t* phPower                       ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_pwr_handle_t* phPower                                               ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4638,6 +5014,7 @@ zesDeviceEnumPowerDomains(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -4653,8 +5030,8 @@ zesDeviceEnumPowerDomains(
 ///         + The device does not provide access to card level power controls or telemetry. An invalid power domain handle will be returned in phPower.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceGetCardPowerDomain(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    zes_pwr_handle_t* phPower                       ///< [in,out] power domain handle for the entire PCIe card.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    zes_pwr_handle_t* phPower                                               ///< [in,out] power domain handle for the entire PCIe card.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4676,8 +5053,8 @@ zesDeviceGetCardPowerDomain(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPowerGetProperties(
-    zes_pwr_handle_t hPower,                        ///< [in] Handle for the component.
-    zes_power_properties_t* pProperties             ///< [in,out] Structure that will contain property data.
+    zes_pwr_handle_t hPower,                                                ///< [in] Handle for the component.
+    zes_power_properties_t* pProperties                                     ///< [in,out] Structure that will contain property data.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4699,9 +5076,9 @@ zesPowerGetProperties(
 ///         + `nullptr == pEnergy`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPowerGetEnergyCounter(
-    zes_pwr_handle_t hPower,                        ///< [in] Handle for the component.
-    zes_power_energy_counter_t* pEnergy             ///< [in,out] Will contain the latest snapshot of the energy counter and
-                                                    ///< timestamp when the last counter value was measured.
+    zes_pwr_handle_t hPower,                                                ///< [in] Handle for the component.
+    zes_power_energy_counter_t* pEnergy                                     ///< [in,out] Will contain the latest snapshot of the energy counter and
+                                                                            ///< timestamp when the last counter value was measured.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4710,8 +5087,7 @@ zesPowerGetEnergyCounter(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
-///     - Note: This function is deprecated and replaced by
-///       ::zesPowerGetLimitsExt.
+///     - [DEPRECATED] Use ::zesPowerGetLimitsExt.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -4723,13 +5099,13 @@ zesPowerGetEnergyCounter(
 ///         + `nullptr == hPower`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPowerGetLimits(
-    zes_pwr_handle_t hPower,                        ///< [in] Handle for the component.
-    zes_power_sustained_limit_t* pSustained,        ///< [in,out][optional] The sustained power limit. If this is null, the
-                                                    ///< current sustained power limits will not be returned.
-    zes_power_burst_limit_t* pBurst,                ///< [in,out][optional] The burst power limit. If this is null, the current
-                                                    ///< peak power limits will not be returned.
-    zes_power_peak_limit_t* pPeak                   ///< [in,out][optional] The peak power limit. If this is null, the peak
-                                                    ///< power limits will not be returned.
+    zes_pwr_handle_t hPower,                                                ///< [in] Handle for the component.
+    zes_power_sustained_limit_t* pSustained,                                ///< [in,out][optional] The sustained power limit. If this is null, the
+                                                                            ///< current sustained power limits will not be returned.
+    zes_power_burst_limit_t* pBurst,                                        ///< [in,out][optional] The burst power limit. If this is null, the current
+                                                                            ///< peak power limits will not be returned.
+    zes_power_peak_limit_t* pPeak                                           ///< [in,out][optional] The peak power limit. If this is null, the peak
+                                                                            ///< power limits will not be returned.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4738,8 +5114,7 @@ zesPowerGetLimits(
 /// @details
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
-///     - Note: This function is deprecated and replaced by
-///       ::zesPowerSetLimitsExt.
+///     - [DEPRECATED] Use ::zesPowerSetLimitsExt.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -4755,13 +5130,13 @@ zesPowerGetLimits(
 ///         + The device is in use, meaning that the GPU is under Over clocking, applying power limits under overclocking is not supported.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPowerSetLimits(
-    zes_pwr_handle_t hPower,                        ///< [in] Handle for the component.
-    const zes_power_sustained_limit_t* pSustained,  ///< [in][optional] The sustained power limit. If this is null, no changes
-                                                    ///< will be made to the sustained power limits.
-    const zes_power_burst_limit_t* pBurst,          ///< [in][optional] The burst power limit. If this is null, no changes will
-                                                    ///< be made to the burst power limits.
-    const zes_power_peak_limit_t* pPeak             ///< [in][optional] The peak power limit. If this is null, no changes will
-                                                    ///< be made to the peak power limits.
+    zes_pwr_handle_t hPower,                                                ///< [in] Handle for the component.
+    const zes_power_sustained_limit_t* pSustained,                          ///< [in][optional] The sustained power limit. If this is null, no changes
+                                                                            ///< will be made to the sustained power limits.
+    const zes_power_burst_limit_t* pBurst,                                  ///< [in][optional] The burst power limit. If this is null, no changes will
+                                                                            ///< be made to the burst power limits.
+    const zes_power_peak_limit_t* pPeak                                     ///< [in][optional] The peak power limit. If this is null, no changes will
+                                                                            ///< be made to the peak power limits.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4782,14 +5157,14 @@ zesPowerSetLimits(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == pThreshold`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Energy threshold not supported on this power domain (check ::zes_power_properties_t.isEnergyThresholdSupported).
+///         + Energy threshold not supported on this power domain (check the `isEnergyThresholdSupported` member of ::zes_power_properties_t).
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to request this feature.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPowerGetEnergyThreshold(
-    zes_pwr_handle_t hPower,                        ///< [in] Handle for the component.
-    zes_energy_threshold_t* pThreshold              ///< [in,out] Returns information about the energy threshold setting -
-                                                    ///< enabled/energy threshold/process ID.
+    zes_pwr_handle_t hPower,                                                ///< [in] Handle for the component.
+    zes_energy_threshold_t* pThreshold                                      ///< [in,out] Returns information about the energy threshold setting -
+                                                                            ///< enabled/energy threshold/process ID.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4822,15 +5197,15 @@ zesPowerGetEnergyThreshold(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
 ///         + `nullptr == hPower`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Energy threshold not supported on this power domain (check ::zes_power_properties_t.isEnergyThresholdSupported).
+///         + Energy threshold not supported on this power domain (check the `isEnergyThresholdSupported` member of ::zes_power_properties_t).
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to request this feature.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
 ///         + Another running process has set the energy threshold.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPowerSetEnergyThreshold(
-    zes_pwr_handle_t hPower,                        ///< [in] Handle for the component.
-    double threshold                                ///< [in] The energy threshold to be set in joules.
+    zes_pwr_handle_t hPower,                                                ///< [in] Handle for the component.
+    double threshold                                                        ///< [in] The energy threshold to be set in joules.
     );
 
 #if !defined(__GNUC__)
@@ -4844,11 +5219,11 @@ zesPowerSetEnergyThreshold(
 /// @brief PSU voltage status
 typedef enum _zes_psu_voltage_status_t
 {
-    ZES_PSU_VOLTAGE_STATUS_UNKNOWN = 0,             ///< The status of the power supply voltage controllers cannot be
-                                                    ///< determined
-    ZES_PSU_VOLTAGE_STATUS_NORMAL = 1,              ///< No unusual voltages have been detected
-    ZES_PSU_VOLTAGE_STATUS_OVER = 2,                ///< Over-voltage has occurred
-    ZES_PSU_VOLTAGE_STATUS_UNDER = 3,               ///< Under-voltage has occurred
+    ZES_PSU_VOLTAGE_STATUS_UNKNOWN = 0,                                     ///< The status of the power supply voltage controllers cannot be
+                                                                            ///< determined
+    ZES_PSU_VOLTAGE_STATUS_NORMAL = 1,                                      ///< No unusual voltages have been detected
+    ZES_PSU_VOLTAGE_STATUS_OVER = 2,                                        ///< Over-voltage has occurred
+    ZES_PSU_VOLTAGE_STATUS_UNDER = 3,                                       ///< Under-voltage has occurred
     ZES_PSU_VOLTAGE_STATUS_FORCE_UINT32 = 0x7fffffff
 
 } zes_psu_voltage_status_t;
@@ -4857,16 +5232,16 @@ typedef enum _zes_psu_voltage_status_t
 /// @brief Static properties of the power supply
 typedef struct _zes_psu_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t onSubdevice;                          ///< [out] True if the resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    ze_bool_t haveFan;                              ///< [out] True if the power supply has a fan
-    int32_t ampLimit;                               ///< [out] The maximum electrical current in milliamperes that can be
-                                                    ///< drawn. A value of -1 indicates that this property cannot be
-                                                    ///< determined.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    ze_bool_t haveFan;                                                      ///< [out] True if the power supply has a fan
+    int32_t ampLimit;                                                       ///< [out] The maximum electrical current in milliamperes that can be
+                                                                            ///< drawn. A value of -1 indicates that this property cannot be
+                                                                            ///< determined.
 
 } zes_psu_properties_t;
 
@@ -4874,15 +5249,15 @@ typedef struct _zes_psu_properties_t
 /// @brief Dynamic state of the power supply
 typedef struct _zes_psu_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_psu_voltage_status_t voltStatus;            ///< [out] The current PSU voltage status
-    ze_bool_t fanFailed;                            ///< [out] Indicates if the fan has failed
-    int32_t temperature;                            ///< [out] Read the current heatsink temperature in degrees Celsius. A
-                                                    ///< value of -1 indicates that this property cannot be determined.
-    int32_t current;                                ///< [out] The amps being drawn in milliamperes. A value of -1 indicates
-                                                    ///< that this property cannot be determined.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_psu_voltage_status_t voltStatus;                                    ///< [out] The current PSU voltage status
+    ze_bool_t fanFailed;                                                    ///< [out] Indicates if the fan has failed
+    int32_t temperature;                                                    ///< [out] Read the current heatsink temperature in degrees Celsius. A
+                                                                            ///< value of -1 indicates that this property cannot be determined.
+    int32_t current;                                                        ///< [out] The amps being drawn in milliamperes. A value of -1 indicates
+                                                                            ///< that this property cannot be determined.
 
 } zes_psu_state_t;
 
@@ -4905,18 +5280,18 @@ typedef struct _zes_psu_state_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumPsus(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_psu_handle_t* phPsu                         ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_psu_handle_t* phPsu                                                 ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4938,8 +5313,8 @@ zesDeviceEnumPsus(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPsuGetProperties(
-    zes_psu_handle_t hPsu,                          ///< [in] Handle for the component.
-    zes_psu_properties_t* pProperties               ///< [in,out] Will contain the properties of the power supply.
+    zes_psu_handle_t hPsu,                                                  ///< [in] Handle for the component.
+    zes_psu_properties_t* pProperties                                       ///< [in,out] Will contain the properties of the power supply.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -4961,8 +5336,8 @@ zesPsuGetProperties(
 ///         + `nullptr == pState`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPsuGetState(
-    zes_psu_handle_t hPsu,                          ///< [in] Handle for the component.
-    zes_psu_state_t* pState                         ///< [in,out] Will contain the current state of the power supply.
+    zes_psu_handle_t hPsu,                                                  ///< [in] Handle for the component.
+    zes_psu_state_t* pState                                                 ///< [in,out] Will contain the current state of the power supply.
     );
 
 #if !defined(__GNUC__)
@@ -4976,8 +5351,8 @@ zesPsuGetState(
 /// @brief RAS error type
 typedef enum _zes_ras_error_type_t
 {
-    ZES_RAS_ERROR_TYPE_CORRECTABLE = 0,             ///< Errors were corrected by hardware
-    ZES_RAS_ERROR_TYPE_UNCORRECTABLE = 1,           ///< Error were not corrected
+    ZES_RAS_ERROR_TYPE_CORRECTABLE = 0,                                     ///< Errors were corrected by hardware
+    ZES_RAS_ERROR_TYPE_UNCORRECTABLE = 1,                                   ///< Error were not corrected
     ZES_RAS_ERROR_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } zes_ras_error_type_t;
@@ -4986,17 +5361,17 @@ typedef enum _zes_ras_error_type_t
 /// @brief RAS error categories
 typedef enum _zes_ras_error_cat_t
 {
-    ZES_RAS_ERROR_CAT_RESET = 0,                    ///< The number of accelerator engine resets attempted by the driver
-    ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS = 1,       ///< The number of hardware exceptions generated by the way workloads have
-                                                    ///< programmed the hardware
-    ZES_RAS_ERROR_CAT_DRIVER_ERRORS = 2,            ///< The number of low level driver communication errors have occurred
-    ZES_RAS_ERROR_CAT_COMPUTE_ERRORS = 3,           ///< The number of errors that have occurred in the compute accelerator
-                                                    ///< hardware
-    ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS = 4,       ///< The number of errors that have occurred in the fixed-function
-                                                    ///< accelerator hardware
-    ZES_RAS_ERROR_CAT_CACHE_ERRORS = 5,             ///< The number of errors that have occurred in caches (L1/L3/register
-                                                    ///< file/shared local memory/sampler)
-    ZES_RAS_ERROR_CAT_DISPLAY_ERRORS = 6,           ///< The number of errors that have occurred in the display
+    ZES_RAS_ERROR_CAT_RESET = 0,                                            ///< The number of accelerator engine resets attempted by the driver
+    ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS = 1,                               ///< The number of hardware exceptions generated by the way workloads have
+                                                                            ///< programmed the hardware
+    ZES_RAS_ERROR_CAT_DRIVER_ERRORS = 2,                                    ///< The number of low level driver communication errors have occurred
+    ZES_RAS_ERROR_CAT_COMPUTE_ERRORS = 3,                                   ///< The number of errors that have occurred in the compute accelerator
+                                                                            ///< hardware
+    ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS = 4,                               ///< The number of errors that have occurred in the fixed-function
+                                                                            ///< accelerator hardware
+    ZES_RAS_ERROR_CAT_CACHE_ERRORS = 5,                                     ///< The number of errors that have occurred in caches (L1/L3/register
+                                                                            ///< file/shared local memory/sampler)
+    ZES_RAS_ERROR_CAT_DISPLAY_ERRORS = 6,                                   ///< The number of errors that have occurred in the display
     ZES_RAS_ERROR_CAT_FORCE_UINT32 = 0x7fffffff
 
 } zes_ras_error_cat_t;
@@ -5011,13 +5386,13 @@ typedef enum _zes_ras_error_cat_t
 /// @brief RAS properties
 typedef struct _zes_ras_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_ras_error_type_t type;                      ///< [out] The type of RAS error
-    ze_bool_t onSubdevice;                          ///< [out] True if the resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_ras_error_type_t type;                                              ///< [out] The type of RAS error
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
 
 } zes_ras_properties_t;
 
@@ -5025,10 +5400,10 @@ typedef struct _zes_ras_properties_t
 /// @brief RAS error details
 typedef struct _zes_ras_state_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint64_t category[ZES_MAX_RAS_ERROR_CATEGORY_COUNT];///< [in][out] Breakdown of error by category
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint64_t category[ZES_MAX_RAS_ERROR_CATEGORY_COUNT];                    ///< [in][out] Breakdown of error by category
 
 } zes_ras_state_t;
 
@@ -5049,15 +5424,15 @@ typedef struct _zes_ras_state_t
 ///       specified in detailedThresholds, a RAS event is triggered.
 typedef struct _zes_ras_config_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint64_t totalThreshold;                        ///< [in,out] If the total RAS errors exceeds this threshold, the event
-                                                    ///< will be triggered. A value of 0ULL disables triggering the event based
-                                                    ///< on the total counter.
-    zes_ras_state_t detailedThresholds;             ///< [in,out] If the RAS errors for each category exceed the threshold for
-                                                    ///< that category, the event will be triggered. A value of 0ULL will
-                                                    ///< disable an event being triggered for that category.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint64_t totalThreshold;                                                ///< [in,out] If the total RAS errors exceeds this threshold, the event
+                                                                            ///< will be triggered. A value of 0ULL disables triggering the event based
+                                                                            ///< on the total counter.
+    zes_ras_state_t detailedThresholds;                                     ///< [in,out] If the RAS errors for each category exceed the threshold for
+                                                                            ///< that category, the event will be triggered. A value of 0ULL will
+                                                                            ///< disable an event being triggered for that category.
 
 } zes_ras_config_t;
 
@@ -5090,18 +5465,18 @@ typedef struct _zes_ras_config_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumRasErrorSets(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_ras_handle_t* phRas                         ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_ras_handle_t* phRas                                                 ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5125,8 +5500,8 @@ zesDeviceEnumRasErrorSets(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesRasGetProperties(
-    zes_ras_handle_t hRas,                          ///< [in] Handle for the component.
-    zes_ras_properties_t* pProperties               ///< [in,out] Structure describing RAS properties
+    zes_ras_handle_t hRas,                                                  ///< [in] Handle for the component.
+    zes_ras_properties_t* pProperties                                       ///< [in,out] Structure describing RAS properties
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5157,9 +5532,9 @@ zesRasGetProperties(
 ///         + `nullptr == pConfig`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesRasGetConfig(
-    zes_ras_handle_t hRas,                          ///< [in] Handle for the component.
-    zes_ras_config_t* pConfig                       ///< [in,out] Will be populed with the current RAS configuration -
-                                                    ///< thresholds used to trigger events
+    zes_ras_handle_t hRas,                                                  ///< [in] Handle for the component.
+    zes_ras_config_t* pConfig                                               ///< [in,out] Will be populed with the current RAS configuration -
+                                                                            ///< thresholds used to trigger events
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5196,8 +5571,8 @@ zesRasGetConfig(
 ///         + Don't have permissions to set thresholds.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesRasSetConfig(
-    zes_ras_handle_t hRas,                          ///< [in] Handle for the component.
-    const zes_ras_config_t* pConfig                 ///< [in] Change the RAS configuration - thresholds used to trigger events
+    zes_ras_handle_t hRas,                                                  ///< [in] Handle for the component.
+    const zes_ras_config_t* pConfig                                         ///< [in] Change the RAS configuration - thresholds used to trigger events
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5224,9 +5599,9 @@ zesRasSetConfig(
 ///         + Don't have permissions to clear error counters.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesRasGetState(
-    zes_ras_handle_t hRas,                          ///< [in] Handle for the component.
-    ze_bool_t clear,                                ///< [in] Set to 1 to clear the counters of this type
-    zes_ras_state_t* pState                         ///< [in,out] Breakdown of where errors have occurred
+    zes_ras_handle_t hRas,                                                  ///< [in] Handle for the component.
+    ze_bool_t clear,                                                        ///< [in] Set to 1 to clear the counters of this type
+    zes_ras_state_t* pState                                                 ///< [in,out] Breakdown of where errors have occurred
     );
 
 #if !defined(__GNUC__)
@@ -5240,21 +5615,18 @@ zesRasGetState(
 /// @brief Scheduler mode
 typedef enum _zes_sched_mode_t
 {
-    ZES_SCHED_MODE_TIMEOUT = 0,                     ///< Multiple applications or contexts are submitting work to the hardware.
-                                                    ///< When higher priority work arrives, the scheduler attempts to pause the
-                                                    ///< current executing work within some timeout interval, then submits the
-                                                    ///< other work.
-    ZES_SCHED_MODE_TIMESLICE = 1,                   ///< The scheduler attempts to fairly timeslice hardware execution time
-                                                    ///< between multiple contexts submitting work to the hardware
-                                                    ///< concurrently.
-    ZES_SCHED_MODE_EXCLUSIVE = 2,                   ///< Any application or context can run indefinitely on the hardware
-                                                    ///< without being preempted or terminated. All pending work for other
-                                                    ///< contexts must wait until the running context completes with no further
-                                                    ///< submitted work.
-    ZES_SCHED_MODE_COMPUTE_UNIT_DEBUG = 3,          ///< This is a special mode that must ben enabled when debugging an
-                                                    ///< application that uses this device e.g. using the Level0 Debug API. It
-                                                    ///< has the effect of disabling any timeouts on workload execution time
-                                                    ///< and will change workload scheduling to ensure debug accuracy.
+    ZES_SCHED_MODE_TIMEOUT = 0,                                             ///< Multiple applications or contexts are submitting work to the hardware.
+                                                                            ///< When higher priority work arrives, the scheduler attempts to pause the
+                                                                            ///< current executing work within some timeout interval, then submits the
+                                                                            ///< other work.
+    ZES_SCHED_MODE_TIMESLICE = 1,                                           ///< The scheduler attempts to fairly timeslice hardware execution time
+                                                                            ///< between multiple contexts submitting work to the hardware
+                                                                            ///< concurrently.
+    ZES_SCHED_MODE_EXCLUSIVE = 2,                                           ///< Any application or context can run indefinitely on the hardware
+                                                                            ///< without being preempted or terminated. All pending work for other
+                                                                            ///< contexts must wait until the running context completes with no further
+                                                                            ///< submitted work.
+    ZES_SCHED_MODE_COMPUTE_UNIT_DEBUG = 3,                                  ///< [DEPRECATED] No longer supported.
     ZES_SCHED_MODE_FORCE_UINT32 = 0x7fffffff
 
 } zes_sched_mode_t;
@@ -5263,19 +5635,19 @@ typedef enum _zes_sched_mode_t
 /// @brief Properties related to scheduler component
 typedef struct _zes_sched_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t onSubdevice;                          ///< [out] True if this resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    ze_bool_t canControl;                           ///< [out] Software can change the scheduler component configuration
-                                                    ///< assuming the user has permissions.
-    zes_engine_type_flags_t engines;                ///< [out] Bitfield of accelerator engine types that are managed by this
-                                                    ///< scheduler component. Note that there can be more than one scheduler
-                                                    ///< component for the same type of accelerator engine.
-    uint32_t supportedModes;                        ///< [out] Bitfield of scheduler modes that can be configured for this
-                                                    ///< scheduler component (bitfield of 1<<::zes_sched_mode_t).
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t onSubdevice;                                                  ///< [out] True if this resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    ze_bool_t canControl;                                                   ///< [out] Software can change the scheduler component configuration
+                                                                            ///< assuming the user has permissions.
+    zes_engine_type_flags_t engines;                                        ///< [out] Bitfield of accelerator engine types that are managed by this
+                                                                            ///< scheduler component. Note that there can be more than one scheduler
+                                                                            ///< component for the same type of accelerator engine.
+    uint32_t supportedModes;                                                ///< [out] Bitfield of scheduler modes that can be configured for this
+                                                                            ///< scheduler component (bitfield of 1<<::zes_sched_mode_t).
 
 } zes_sched_properties_t;
 
@@ -5289,16 +5661,16 @@ typedef struct _zes_sched_properties_t
 /// @brief Configuration for timeout scheduler mode (::ZES_SCHED_MODE_TIMEOUT)
 typedef struct _zes_sched_timeout_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint64_t watchdogTimeout;                       ///< [in,out] The maximum time in microseconds that the scheduler will wait
-                                                    ///< for a batch of work submitted to a hardware engine to complete or to
-                                                    ///< be preempted so as to run another context.
-                                                    ///< If this time is exceeded, the hardware engine is reset and the context terminated.
-                                                    ///< If set to ::ZES_SCHED_WATCHDOG_DISABLE, a running workload can run as
-                                                    ///< long as it wants without being terminated, but preemption attempts to
-                                                    ///< run other contexts are permitted but not enforced.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint64_t watchdogTimeout;                                               ///< [in,out] The maximum time in microseconds that the scheduler will wait
+                                                                            ///< for a batch of work submitted to a hardware engine to complete or to
+                                                                            ///< be preempted so as to run another context.
+                                                                            ///< If this time is exceeded, the hardware engine is reset and the context terminated.
+                                                                            ///< If set to ::ZES_SCHED_WATCHDOG_DISABLE, a running workload can run as
+                                                                            ///< long as it wants without being terminated, but preemption attempts to
+                                                                            ///< run other contexts are permitted but not enforced.
 
 } zes_sched_timeout_properties_t;
 
@@ -5307,15 +5679,15 @@ typedef struct _zes_sched_timeout_properties_t
 ///        (::ZES_SCHED_MODE_TIMESLICE)
 typedef struct _zes_sched_timeslice_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    uint64_t interval;                              ///< [in,out] The average interval in microseconds that a submission for a
-                                                    ///< context will run on a hardware engine before being preempted out to
-                                                    ///< run a pending submission for another context.
-    uint64_t yieldTimeout;                          ///< [in,out] The maximum time in microseconds that the scheduler will wait
-                                                    ///< to preempt a workload running on an engine before deciding to reset
-                                                    ///< the hardware engine and terminating the associated context.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint64_t interval;                                                      ///< [in,out] The average interval in microseconds that a submission for a
+                                                                            ///< context will run on a hardware engine before being preempted out to
+                                                                            ///< run a pending submission for another context.
+    uint64_t yieldTimeout;                                                  ///< [in,out] The maximum time in microseconds that the scheduler will wait
+                                                                            ///< to preempt a workload running on an engine before deciding to reset
+                                                                            ///< the hardware engine and terminating the associated context.
 
 } zes_sched_timeslice_properties_t;
 
@@ -5327,8 +5699,8 @@ typedef struct _zes_sched_timeslice_properties_t
 ///       or more accelerator engines.
 ///     - If an application wishes to change the scheduler behavior for all
 ///       accelerator engines of a specific type (e.g. compute), it should
-///       select all the handles where the structure member
-///       ::zes_sched_properties_t.engines contains that type.
+///       select all the handles where the `engines` member of
+///       ::zes_sched_properties_t contains that type.
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
 /// 
@@ -5344,18 +5716,18 @@ typedef struct _zes_sched_timeslice_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumSchedulers(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_sched_handle_t* phScheduler                 ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_sched_handle_t* phScheduler                                         ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5377,8 +5749,8 @@ zesDeviceEnumSchedulers(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesSchedulerGetProperties(
-    zes_sched_handle_t hScheduler,                  ///< [in] Handle for the component.
-    zes_sched_properties_t* pProperties             ///< [in,out] Structure that will contain property data.
+    zes_sched_handle_t hScheduler,                                          ///< [in] Handle for the component.
+    zes_sched_properties_t* pProperties                                     ///< [in,out] Structure that will contain property data.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5402,8 +5774,8 @@ zesSchedulerGetProperties(
 ///         + This scheduler component does not support scheduler modes.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesSchedulerGetCurrentMode(
-    zes_sched_handle_t hScheduler,                  ///< [in] Sysman handle for the component.
-    zes_sched_mode_t* pMode                         ///< [in,out] Will contain the current scheduler mode.
+    zes_sched_handle_t hScheduler,                                          ///< [in] Sysman handle for the component.
+    zes_sched_mode_t* pMode                                                 ///< [in,out] Will contain the current scheduler mode.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5427,10 +5799,10 @@ zesSchedulerGetCurrentMode(
 ///         + This scheduler component does not support scheduler modes.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesSchedulerGetTimeoutModeProperties(
-    zes_sched_handle_t hScheduler,                  ///< [in] Sysman handle for the component.
-    ze_bool_t getDefaults,                          ///< [in] If TRUE, the driver will return the system default properties for
-                                                    ///< this mode, otherwise it will return the current properties.
-    zes_sched_timeout_properties_t* pConfig         ///< [in,out] Will contain the current parameters for this mode.
+    zes_sched_handle_t hScheduler,                                          ///< [in] Sysman handle for the component.
+    ze_bool_t getDefaults,                                                  ///< [in] If TRUE, the driver will return the system default properties for
+                                                                            ///< this mode, otherwise it will return the current properties.
+    zes_sched_timeout_properties_t* pConfig                                 ///< [in,out] Will contain the current parameters for this mode.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5454,10 +5826,10 @@ zesSchedulerGetTimeoutModeProperties(
 ///         + This scheduler component does not support scheduler modes.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesSchedulerGetTimesliceModeProperties(
-    zes_sched_handle_t hScheduler,                  ///< [in] Sysman handle for the component.
-    ze_bool_t getDefaults,                          ///< [in] If TRUE, the driver will return the system default properties for
-                                                    ///< this mode, otherwise it will return the current properties.
-    zes_sched_timeslice_properties_t* pConfig       ///< [in,out] Will contain the current parameters for this mode.
+    zes_sched_handle_t hScheduler,                                          ///< [in] Sysman handle for the component.
+    ze_bool_t getDefaults,                                                  ///< [in] If TRUE, the driver will return the system default properties for
+                                                                            ///< this mode, otherwise it will return the current properties.
+    zes_sched_timeslice_properties_t* pConfig                               ///< [in,out] Will contain the current parameters for this mode.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5489,10 +5861,10 @@ zesSchedulerGetTimesliceModeProperties(
 ///         + User does not have permissions to make this modification.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesSchedulerSetTimeoutMode(
-    zes_sched_handle_t hScheduler,                  ///< [in] Sysman handle for the component.
-    zes_sched_timeout_properties_t* pProperties,    ///< [in] The properties to use when configurating this mode.
-    ze_bool_t* pNeedReload                          ///< [in,out] Will be set to TRUE if a device driver reload is needed to
-                                                    ///< apply the new scheduler mode.
+    zes_sched_handle_t hScheduler,                                          ///< [in] Sysman handle for the component.
+    zes_sched_timeout_properties_t* pProperties,                            ///< [in] The properties to use when configuring this mode.
+    ze_bool_t* pNeedReload                                                  ///< [in,out] Will be set to TRUE if a device driver reload is needed to
+                                                                            ///< apply the new scheduler mode.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5523,10 +5895,10 @@ zesSchedulerSetTimeoutMode(
 ///         + User does not have permissions to make this modification.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesSchedulerSetTimesliceMode(
-    zes_sched_handle_t hScheduler,                  ///< [in] Sysman handle for the component.
-    zes_sched_timeslice_properties_t* pProperties,  ///< [in] The properties to use when configurating this mode.
-    ze_bool_t* pNeedReload                          ///< [in,out] Will be set to TRUE if a device driver reload is needed to
-                                                    ///< apply the new scheduler mode.
+    zes_sched_handle_t hScheduler,                                          ///< [in] Sysman handle for the component.
+    zes_sched_timeslice_properties_t* pProperties,                          ///< [in] The properties to use when configuring this mode.
+    ze_bool_t* pNeedReload                                                  ///< [in,out] Will be set to TRUE if a device driver reload is needed to
+                                                                            ///< apply the new scheduler mode.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5556,9 +5928,9 @@ zesSchedulerSetTimesliceMode(
 ///         + User does not have permissions to make this modification.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesSchedulerSetExclusiveMode(
-    zes_sched_handle_t hScheduler,                  ///< [in] Sysman handle for the component.
-    ze_bool_t* pNeedReload                          ///< [in,out] Will be set to TRUE if a device driver reload is needed to
-                                                    ///< apply the new scheduler mode.
+    zes_sched_handle_t hScheduler,                                          ///< [in] Sysman handle for the component.
+    ze_bool_t* pNeedReload                                                  ///< [in,out] Will be set to TRUE if a device driver reload is needed to
+                                                                            ///< apply the new scheduler mode.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5572,6 +5944,7 @@ zesSchedulerSetExclusiveMode(
 ///       without enforcing any scheduler fairness policies.
 ///     - The application may call this function from simultaneous threads.
 ///     - The implementation of this function should be lock-free.
+///     - [DEPRECATED] No longer supported.
 /// 
 /// @returns
 ///     - ::ZE_RESULT_SUCCESS
@@ -5589,9 +5962,9 @@ zesSchedulerSetExclusiveMode(
 ///         + User does not have permissions to make this modification.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesSchedulerSetComputeUnitDebugMode(
-    zes_sched_handle_t hScheduler,                  ///< [in] Sysman handle for the component.
-    ze_bool_t* pNeedReload                          ///< [in,out] Will be set to TRUE if a device driver reload is needed to
-                                                    ///< apply the new scheduler mode.
+    zes_sched_handle_t hScheduler,                                          ///< [in] Sysman handle for the component.
+    ze_bool_t* pNeedReload                                                  ///< [in,out] Will be set to TRUE if a device driver reload is needed to
+                                                                            ///< apply the new scheduler mode.
     );
 
 #if !defined(__GNUC__)
@@ -5605,7 +5978,7 @@ zesSchedulerSetComputeUnitDebugMode(
 /// @brief Standby hardware components
 typedef enum _zes_standby_type_t
 {
-    ZES_STANDBY_TYPE_GLOBAL = 0,                    ///< Control the overall standby policy of the device/sub-device
+    ZES_STANDBY_TYPE_GLOBAL = 0,                                            ///< Control the overall standby policy of the device/sub-device
     ZES_STANDBY_TYPE_FORCE_UINT32 = 0x7fffffff
 
 } zes_standby_type_t;
@@ -5614,13 +5987,13 @@ typedef enum _zes_standby_type_t
 /// @brief Standby hardware component properties
 typedef struct _zes_standby_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_standby_type_t type;                        ///< [out] Which standby hardware component this controls
-    ze_bool_t onSubdevice;                          ///< [out] True if the resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_standby_type_t type;                                                ///< [out] Which standby hardware component this controls
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
 
 } zes_standby_properties_t;
 
@@ -5628,9 +6001,9 @@ typedef struct _zes_standby_properties_t
 /// @brief Standby promotion modes
 typedef enum _zes_standby_promo_mode_t
 {
-    ZES_STANDBY_PROMO_MODE_DEFAULT = 0,             ///< Best compromise between performance and energy savings.
-    ZES_STANDBY_PROMO_MODE_NEVER = 1,               ///< The device/component will never shutdown. This can improve performance
-                                                    ///< but uses more energy.
+    ZES_STANDBY_PROMO_MODE_DEFAULT = 0,                                     ///< Best compromise between performance and energy savings.
+    ZES_STANDBY_PROMO_MODE_NEVER = 1,                                       ///< The device/component will never shutdown. This can improve performance
+                                                                            ///< but uses more energy.
     ZES_STANDBY_PROMO_MODE_FORCE_UINT32 = 0x7fffffff
 
 } zes_standby_promo_mode_t;
@@ -5654,18 +6027,18 @@ typedef enum _zes_standby_promo_mode_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumStandbyDomains(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_standby_handle_t* phStandby                 ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_standby_handle_t* phStandby                                         ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5687,8 +6060,8 @@ zesDeviceEnumStandbyDomains(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesStandbyGetProperties(
-    zes_standby_handle_t hStandby,                  ///< [in] Handle for the component.
-    zes_standby_properties_t* pProperties           ///< [in,out] Will contain the standby hardware properties.
+    zes_standby_handle_t hStandby,                                          ///< [in] Handle for the component.
+    zes_standby_properties_t* pProperties                                   ///< [in,out] Will contain the standby hardware properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5710,8 +6083,8 @@ zesStandbyGetProperties(
 ///         + `nullptr == pMode`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesStandbyGetMode(
-    zes_standby_handle_t hStandby,                  ///< [in] Handle for the component.
-    zes_standby_promo_mode_t* pMode                 ///< [in,out] Will contain the current standby mode.
+    zes_standby_handle_t hStandby,                                          ///< [in] Handle for the component.
+    zes_standby_promo_mode_t* pMode                                         ///< [in,out] Will contain the current standby mode.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5735,8 +6108,8 @@ zesStandbyGetMode(
 ///         + User does not have permissions to make these modifications.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesStandbySetMode(
-    zes_standby_handle_t hStandby,                  ///< [in] Handle for the component.
-    zes_standby_promo_mode_t mode                   ///< [in] New standby mode.
+    zes_standby_handle_t hStandby,                                          ///< [in] Handle for the component.
+    zes_standby_promo_mode_t mode                                           ///< [in] New standby mode.
     );
 
 #if !defined(__GNUC__)
@@ -5750,12 +6123,14 @@ zesStandbySetMode(
 /// @brief Temperature sensors
 typedef enum _zes_temp_sensors_t
 {
-    ZES_TEMP_SENSORS_GLOBAL = 0,                    ///< The maximum temperature across all device sensors
-    ZES_TEMP_SENSORS_GPU = 1,                       ///< The maximum temperature across all sensors in the GPU
-    ZES_TEMP_SENSORS_MEMORY = 2,                    ///< The maximum temperature across all sensors in the local memory
-    ZES_TEMP_SENSORS_GLOBAL_MIN = 3,                ///< The minimum temperature across all device sensors
-    ZES_TEMP_SENSORS_GPU_MIN = 4,                   ///< The minimum temperature across all sensors in the GPU
-    ZES_TEMP_SENSORS_MEMORY_MIN = 5,                ///< The minimum temperature across all sensors in the local device memory
+    ZES_TEMP_SENSORS_GLOBAL = 0,                                            ///< The maximum temperature across all device sensors
+    ZES_TEMP_SENSORS_GPU = 1,                                               ///< The maximum temperature across all sensors in the GPU
+    ZES_TEMP_SENSORS_MEMORY = 2,                                            ///< The maximum temperature across all sensors in the local memory
+    ZES_TEMP_SENSORS_GLOBAL_MIN = 3,                                        ///< The minimum temperature across all device sensors
+    ZES_TEMP_SENSORS_GPU_MIN = 4,                                           ///< The minimum temperature across all sensors in the GPU
+    ZES_TEMP_SENSORS_MEMORY_MIN = 5,                                        ///< The minimum temperature across all sensors in the local device memory
+    ZES_TEMP_SENSORS_GPU_BOARD = 6,                                         ///< The maximum temperature across all sensors in the GPU Board
+    ZES_TEMP_SENSORS_GPU_BOARD_MIN = 7,                                     ///< The minimum temperature across all sensors in the GPU Board
     ZES_TEMP_SENSORS_FORCE_UINT32 = 0x7fffffff
 
 } zes_temp_sensors_t;
@@ -5764,21 +6139,21 @@ typedef enum _zes_temp_sensors_t
 /// @brief Temperature sensor properties
 typedef struct _zes_temp_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_temp_sensors_t type;                        ///< [out] Which part of the device the temperature sensor measures
-    ze_bool_t onSubdevice;                          ///< [out] True if the resource is located on a sub-device; false means
-                                                    ///< that the resource is on the device of the calling Sysman handle
-    uint32_t subdeviceId;                           ///< [out] If onSubdevice is true, this gives the ID of the sub-device
-    double maxTemperature;                          ///< [out] Will contain the maximum temperature for the specific device in
-                                                    ///< degrees Celsius.
-    ze_bool_t isCriticalTempSupported;              ///< [out] Indicates if the critical temperature event
-                                                    ///< ::ZES_EVENT_TYPE_FLAG_TEMP_CRITICAL is supported
-    ze_bool_t isThreshold1Supported;                ///< [out] Indicates if the temperature threshold 1 event
-                                                    ///< ::ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD1 is supported
-    ze_bool_t isThreshold2Supported;                ///< [out] Indicates if the temperature threshold 2 event
-                                                    ///< ::ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD2 is supported
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_temp_sensors_t type;                                                ///< [out] Which part of the device the temperature sensor measures
+    ze_bool_t onSubdevice;                                                  ///< [out] True if the resource is located on a sub-device; false means
+                                                                            ///< that the resource is on the device of the calling Sysman handle
+    uint32_t subdeviceId;                                                   ///< [out] If onSubdevice is true, this gives the ID of the sub-device
+    double maxTemperature;                                                  ///< [out] Will contain the maximum temperature for the specific device in
+                                                                            ///< degrees Celsius.
+    ze_bool_t isCriticalTempSupported;                                      ///< [out] Indicates if the critical temperature event
+                                                                            ///< ::ZES_EVENT_TYPE_FLAG_TEMP_CRITICAL is supported
+    ze_bool_t isThreshold1Supported;                                        ///< [out] Indicates if the temperature threshold 1 event
+                                                                            ///< ::ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD1 is supported
+    ze_bool_t isThreshold2Supported;                                        ///< [out] Indicates if the temperature threshold 2 event
+                                                                            ///< ::ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD2 is supported
 
 } zes_temp_properties_t;
 
@@ -5786,11 +6161,11 @@ typedef struct _zes_temp_properties_t
 /// @brief Temperature sensor threshold
 typedef struct _zes_temp_threshold_t
 {
-    ze_bool_t enableLowToHigh;                      ///< [in,out] Trigger an event when the temperature crosses from below the
-                                                    ///< threshold to above.
-    ze_bool_t enableHighToLow;                      ///< [in,out] Trigger an event when the temperature crosses from above the
-                                                    ///< threshold to below.
-    double threshold;                               ///< [in,out] The threshold in degrees Celsius.
+    ze_bool_t enableLowToHigh;                                              ///< [in,out] Trigger an event when the temperature crosses from below the
+                                                                            ///< threshold to above.
+    ze_bool_t enableHighToLow;                                              ///< [in,out] Trigger an event when the temperature crosses from above the
+                                                                            ///< threshold to below.
+    double threshold;                                                       ///< [in,out] The threshold in degrees Celsius.
 
 } zes_temp_threshold_t;
 
@@ -5799,17 +6174,17 @@ typedef struct _zes_temp_threshold_t
 ///        trigger conditions.
 typedef struct _zes_temp_config_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    ze_bool_t enableCritical;                       ///< [in,out] Indicates if event ::ZES_EVENT_TYPE_FLAG_TEMP_CRITICAL should
-                                                    ///< be triggered by the driver.
-    zes_temp_threshold_t threshold1;                ///< [in,out] Configuration controlling if and when event
-                                                    ///< ::ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD1 should be triggered by the
-                                                    ///< driver.
-    zes_temp_threshold_t threshold2;                ///< [in,out] Configuration controlling if and when event
-                                                    ///< ::ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD2 should be triggered by the
-                                                    ///< driver.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    ze_bool_t enableCritical;                                               ///< [in,out] Indicates if event ::ZES_EVENT_TYPE_FLAG_TEMP_CRITICAL should
+                                                                            ///< be triggered by the driver.
+    zes_temp_threshold_t threshold1;                                        ///< [in,out] Configuration controlling if and when event
+                                                                            ///< ::ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD1 should be triggered by the
+                                                                            ///< driver.
+    zes_temp_threshold_t threshold2;                                        ///< [in,out] Configuration controlling if and when event
+                                                                            ///< ::ZES_EVENT_TYPE_FLAG_TEMP_THRESHOLD2 should be triggered by the
+                                                                            ///< driver.
 
 } zes_temp_config_t;
 
@@ -5832,18 +6207,18 @@ typedef struct _zes_temp_config_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesDeviceEnumTemperatureSensors(
-    zes_device_handle_t hDevice,                    ///< [in] Sysman handle of the device.
-    uint32_t* pCount,                               ///< [in,out] pointer to the number of components of this type.
-                                                    ///< if count is zero, then the driver shall update the value with the
-                                                    ///< total number of components of this type that are available.
-                                                    ///< if count is greater than the number of components of this type that
-                                                    ///< are available, then the driver shall update the value with the correct
-                                                    ///< number of components.
-    zes_temp_handle_t* phTemperature                ///< [in,out][optional][range(0, *pCount)] array of handle of components of
-                                                    ///< this type.
-                                                    ///< if count is less than the number of components of this type that are
-                                                    ///< available, then the driver shall only retrieve that number of
-                                                    ///< component handles.
+    zes_device_handle_t hDevice,                                            ///< [in] Sysman handle of the device.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of components of this type.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of components of this type that are available.
+                                                                            ///< if count is greater than the number of components of this type that
+                                                                            ///< are available, then the driver shall update the value with the correct
+                                                                            ///< number of components.
+    zes_temp_handle_t* phTemperature                                        ///< [in,out][optional][range(0, *pCount)] array of handle of components of
+                                                                            ///< this type.
+                                                                            ///< if count is less than the number of components of this type that are
+                                                                            ///< available, then the driver shall only retrieve that number of
+                                                                            ///< component handles.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5865,8 +6240,8 @@ zesDeviceEnumTemperatureSensors(
 ///         + `nullptr == pProperties`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesTemperatureGetProperties(
-    zes_temp_handle_t hTemperature,                 ///< [in] Handle for the component.
-    zes_temp_properties_t* pProperties              ///< [in,out] Will contain the temperature sensor properties.
+    zes_temp_handle_t hTemperature,                                         ///< [in] Handle for the component.
+    zes_temp_properties_t* pProperties                                      ///< [in,out] Will contain the temperature sensor properties.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5888,14 +6263,14 @@ zesTemperatureGetProperties(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == pConfig`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Temperature thresholds are not supported on this temperature sensor. Generally this is only supported for temperature sensor ::ZES_TEMP_SENSORS_GLOBAL
-///         + One or both of the thresholds is not supported - check ::zes_temp_properties_t.isThreshold1Supported and ::zes_temp_properties_t.isThreshold2Supported
+///         + Temperature thresholds are not supported on this temperature sensor. Generally this is only supported for temperature sensor ::ZES_TEMP_SENSORS_GLOBAL.
+///         + One or both of the thresholds is not supported. Check the `isThreshold1Supported` and `isThreshold2Supported` members of ::zes_temp_properties_t.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to request this feature.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesTemperatureGetConfig(
-    zes_temp_handle_t hTemperature,                 ///< [in] Handle for the component.
-    zes_temp_config_t* pConfig                      ///< [in,out] Returns current configuration.
+    zes_temp_handle_t hTemperature,                                         ///< [in] Handle for the component.
+    zes_temp_config_t* pConfig                                              ///< [in,out] Returns current configuration.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5929,9 +6304,9 @@ zesTemperatureGetConfig(
 ///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
 ///         + `nullptr == pConfig`
 ///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE
-///         + Temperature thresholds are not supported on this temperature sensor. Generally they are only supported for temperature sensor ::ZES_TEMP_SENSORS_GLOBAL
-///         + Enabling the critical temperature event is not supported - check ::zes_temp_properties_t.isCriticalTempSupported
-///         + One or both of the thresholds is not supported - check ::zes_temp_properties_t.isThreshold1Supported and ::zes_temp_properties_t.isThreshold2Supported
+///         + Temperature thresholds are not supported on this temperature sensor. Generally they are only supported for temperature sensor ::ZES_TEMP_SENSORS_GLOBAL.
+///         + Enabling the critical temperature event is not supported. Check the `isCriticalTempSupported` member of ::zes_temp_properties_t.
+///         + One or both of the thresholds is not supported. Check the `isThreshold1Supported` and `isThreshold2Supported` members of ::zes_temp_properties_t.
 ///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
 ///         + User does not have permissions to request this feature.
 ///     - ::ZE_RESULT_ERROR_NOT_AVAILABLE
@@ -5940,8 +6315,8 @@ zesTemperatureGetConfig(
 ///         + One or both the thresholds is above TjMax (see ::zesFrequencyOcGetTjMax()). Temperature thresholds must be below this value.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesTemperatureSetConfig(
-    zes_temp_handle_t hTemperature,                 ///< [in] Handle for the component.
-    const zes_temp_config_t* pConfig                ///< [in] New configuration.
+    zes_temp_handle_t hTemperature,                                         ///< [in] Handle for the component.
+    const zes_temp_config_t* pConfig                                        ///< [in] New configuration.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -5963,9 +6338,9 @@ zesTemperatureSetConfig(
 ///         + `nullptr == pTemperature`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesTemperatureGetState(
-    zes_temp_handle_t hTemperature,                 ///< [in] Handle for the component.
-    double* pTemperature                            ///< [in,out] Will contain the temperature read from the specified sensor
-                                                    ///< in degrees Celsius.
+    zes_temp_handle_t hTemperature,                                         ///< [in] Handle for the component.
+    double* pTemperature                                                    ///< [in,out] Will contain the temperature read from the specified sensor
+                                                                            ///< in degrees Celsius.
     );
 
 #if !defined(__GNUC__)
@@ -5985,8 +6360,8 @@ zesTemperatureGetState(
 /// @brief Power Limits Extension Version(s)
 typedef enum _zes_power_limits_ext_version_t
 {
-    ZES_POWER_LIMITS_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ), ///< version 1.0
-    ZES_POWER_LIMITS_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ), ///< latest known version
+    ZES_POWER_LIMITS_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),             ///< version 1.0
+    ZES_POWER_LIMITS_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),         ///< latest known version
     ZES_POWER_LIMITS_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
 
 } zes_power_limits_ext_version_t;
@@ -5995,27 +6370,27 @@ typedef enum _zes_power_limits_ext_version_t
 /// @brief Device power/current limit descriptor.
 typedef struct _zes_power_limit_ext_desc_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    const void* pNext;                              ///< [in][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_power_level_t level;                        ///< [in,out] duration type over which the power draw is measured, i.e.
-                                                    ///< sustained, burst, peak, or critical.
-    zes_power_source_t source;                      ///< [out] source of power used by the system, i.e. AC or DC.
-    zes_limit_unit_t limitUnit;                     ///< [out] unit used for specifying limit, i.e. current units (milliamps)
-                                                    ///< or power units (milliwatts).
-    ze_bool_t enabledStateLocked;                   ///< [out] indicates if the power limit state (enabled/ignored) can be set
-                                                    ///< (false) or is locked (true).
-    ze_bool_t enabled;                              ///< [in,out] indicates if the limit is enabled (true) or ignored (false).
-                                                    ///< If enabledStateIsLocked is True, this value is ignored.
-    ze_bool_t intervalValueLocked;                  ///< [out] indicates if the interval can be modified (false) or is fixed
-                                                    ///< (true).
-    int32_t interval;                               ///< [in,out] power averaging window in milliseconds. If
-                                                    ///< intervalValueLocked is true, this value is ignored.
-    ze_bool_t limitValueLocked;                     ///< [out] indicates if the limit can be set (false) or if the limit is
-                                                    ///< fixed (true).
-    int32_t limit;                                  ///< [in,out] limit value. If limitValueLocked is true, this value is
-                                                    ///< ignored. The value should be provided in the unit specified by
-                                                    ///< limitUnit.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_power_level_t level;                                                ///< [in,out] duration type over which the power draw is measured, i.e.
+                                                                            ///< sustained, burst, peak, or critical.
+    zes_power_source_t source;                                              ///< [out] source of power used by the system, i.e. AC or DC.
+    zes_limit_unit_t limitUnit;                                             ///< [out] unit used for specifying limit, i.e. current units (milliamps)
+                                                                            ///< or power units (milliwatts).
+    ze_bool_t enabledStateLocked;                                           ///< [out] indicates if the power limit state (enabled/ignored) can be set
+                                                                            ///< (false) or is locked (true).
+    ze_bool_t enabled;                                                      ///< [in,out] indicates if the limit is enabled (true) or ignored (false).
+                                                                            ///< If enabledStateIsLocked is True, this value is ignored.
+    ze_bool_t intervalValueLocked;                                          ///< [out] indicates if the interval can be modified (false) or is fixed
+                                                                            ///< (true).
+    int32_t interval;                                                       ///< [in,out] power averaging window in milliseconds. If
+                                                                            ///< intervalValueLocked is true, this value is ignored.
+    ze_bool_t limitValueLocked;                                             ///< [out] indicates if the limit can be set (false) or if the limit is
+                                                                            ///< fixed (true).
+    int32_t limit;                                                          ///< [in,out] limit value. If limitValueLocked is true, this value is
+                                                                            ///< ignored. The value should be provided in the unit specified by
+                                                                            ///< limitUnit.
 
 } zes_power_limit_ext_desc_t;
 
@@ -6031,11 +6406,11 @@ typedef struct _zes_power_limit_ext_desc_t
 ///       package-level v/s stack-level & the factory default power limits.
 typedef struct _zes_power_ext_properties_t
 {
-    zes_structure_type_t stype;                     ///< [in] type of this structure
-    void* pNext;                                    ///< [in,out][optional] must be null or a pointer to an extension-specific
-                                                    ///< structure (i.e. contains sType and pNext).
-    zes_power_domain_t domain;                      ///< [out] domain that the power limit belongs to.
-    zes_power_limit_ext_desc_t* defaultLimit;       ///< [out] the factory default limit of the part.
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_power_domain_t domain;                                              ///< [out] domain that the power limit belongs to.
+    zes_power_limit_ext_desc_t* defaultLimit;                               ///< [out] the factory default limit of the part.
 
 } zes_power_ext_properties_t;
 
@@ -6060,16 +6435,16 @@ typedef struct _zes_power_ext_properties_t
 ///         + `nullptr == pCount`
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPowerGetLimitsExt(
-    zes_pwr_handle_t hPower,                        ///< [in] Power domain handle instance.
-    uint32_t* pCount,                               ///< [in,out] Pointer to the number of power limit descriptors. If count is
-                                                    ///< zero, then the driver shall update the value with the total number of
-                                                    ///< components of this type that are available. If count is greater than
-                                                    ///< the number of components of this type that are available, then the
-                                                    ///< driver shall update the value with the correct number of components.
-    zes_power_limit_ext_desc_t* pSustained          ///< [in,out][optional][range(0, *pCount)] Array of query results for power
-                                                    ///< limit descriptors. If count is less than the number of components of
-                                                    ///< this type that are available, then the driver shall only retrieve that
-                                                    ///< number of components.
+    zes_pwr_handle_t hPower,                                                ///< [in] Power domain handle instance.
+    uint32_t* pCount,                                                       ///< [in,out] Pointer to the number of power limit descriptors. If count is
+                                                                            ///< zero, then the driver shall update the value with the total number of
+                                                                            ///< components of this type that are available. If count is greater than
+                                                                            ///< the number of components of this type that are available, then the
+                                                                            ///< driver shall update the value with the correct number of components.
+    zes_power_limit_ext_desc_t* pSustained                                  ///< [in,out][optional][range(0, *pCount)] Array of query results for power
+                                                                            ///< limit descriptors. If count is less than the number of components of
+                                                                            ///< this type that are available, then the driver shall only retrieve that
+                                                                            ///< number of components.
     );
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -6103,11 +6478,331 @@ zesPowerGetLimitsExt(
 ///         + The device is in use, meaning that the GPU is under Over clocking, applying power limits under overclocking is not supported.
 ZE_APIEXPORT ze_result_t ZE_APICALL
 zesPowerSetLimitsExt(
-    zes_pwr_handle_t hPower,                        ///< [in] Handle for the component.
-    uint32_t* pCount,                               ///< [in] Pointer to the number of power limit descriptors.
-    zes_power_limit_ext_desc_t* pSustained          ///< [in][optional][range(0, *pCount)] Array of power limit descriptors.
+    zes_pwr_handle_t hPower,                                                ///< [in] Handle for the component.
+    uint32_t* pCount,                                                       ///< [in] Pointer to the number of power limit descriptors.
+    zes_power_limit_ext_desc_t* pSustained                                  ///< [in][optional][range(0, *pCount)] Array of power limit descriptors.
+    );
+
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Level-Zero Sysman Extension APIs for Engine Activity
+#if !defined(__GNUC__)
+#pragma region engineActivity
+#endif
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZES_ENGINE_ACTIVITY_EXT_NAME
+/// @brief Engine Activity Extension Name
+#define ZES_ENGINE_ACTIVITY_EXT_NAME  "ZES_extension_engine_activity"
+#endif // ZES_ENGINE_ACTIVITY_EXT_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Engine Activity Extension Version(s)
+typedef enum _zes_engine_activity_ext_version_t
+{
+    ZES_ENGINE_ACTIVITY_EXT_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),          ///< version 1.0
+    ZES_ENGINE_ACTIVITY_EXT_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),      ///< latest known version
+    ZES_ENGINE_ACTIVITY_EXT_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} zes_engine_activity_ext_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extension properties related to Engine Groups
+/// 
+/// @details
+///     - This structure may be passed to ::zesEngineGetProperties by having the
+///       pNext member of ::zes_engine_properties_t point at this struct.
+///     - Used for SRIOV per Virtual Function device utilization by
+///       ::zes_engine_group_t
+typedef struct _zes_engine_ext_properties_t
+{
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t countOfVirtualFunctionInstance;                                ///< [out] Number of Virtual Function(VF) instances associated with engine
+                                                                            ///< to monitor the utilization of hardware across all Virtual Function
+                                                                            ///< from a Physical Function (PF) instance.
+                                                                            ///< These VF-by-VF views should provide engine group and individual engine
+                                                                            ///< level granularity.
+                                                                            ///< This count represents the number of VF instances that are actively
+                                                                            ///< using the resource represented by the engine handle.
+
+} zes_engine_ext_properties_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Get activity stats for Physical Function (PF) and each Virtual
+///        Function (VF) associated with engine group.
+/// 
+/// @details
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hEngine`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pCount`
+///     - ::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE - "Engine activity extension is not supported in the environment."
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesEngineGetActivityExt(
+    zes_engine_handle_t hEngine,                                            ///< [in] Handle for the component.
+    uint32_t* pCount,                                                       ///< [in,out] Pointer to the number of VF engine stats descriptors.
+                                                                            ///<  - if count is zero, the driver shall update the value with the total
+                                                                            ///< number of engine stats available.
+                                                                            ///<  - if count is greater than the total number of engine stats
+                                                                            ///< available, the driver shall update the value with the correct number
+                                                                            ///< of engine stats available.
+                                                                            ///<  - The count returned is the sum of number of VF instances currently
+                                                                            ///< available and the PF instance.
+    zes_engine_stats_t* pStats                                              ///< [in,out][optional][range(0, *pCount)] array of engine group activity counters.
+                                                                            ///<  - if count is less than the total number of engine stats available,
+                                                                            ///< then driver shall only retrieve that number of stats.
+                                                                            ///<  - the implementation shall populate the vector with engine stat for
+                                                                            ///< PF at index 0 of the vector followed by user provided pCount-1 number
+                                                                            ///< of VF engine stats.
+    );
+
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Level-Zero Sysman Extension APIs for RAS Get State and Clear State
+#if !defined(__GNUC__)
+#pragma region rasState
+#endif
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZES_RAS_GET_STATE_EXP_NAME
+/// @brief RAS Get State Extension Name
+#define ZES_RAS_GET_STATE_EXP_NAME  "ZES_extension_ras_state"
+#endif // ZES_RAS_GET_STATE_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief RAS Get State Extension Version(s)
+typedef enum _zes_ras_state_exp_version_t
+{
+    ZES_RAS_STATE_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),                ///< version 1.0
+    ZES_RAS_STATE_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),            ///< latest known version
+    ZES_RAS_STATE_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} zes_ras_state_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief RAS error categories
+typedef enum _zes_ras_error_category_exp_t
+{
+    ZES_RAS_ERROR_CATEGORY_EXP_RESET = 0,                                   ///< The number of accelerator engine resets attempted by the driver
+    ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS = 1,                      ///< The number of hardware exceptions generated by the way workloads have
+                                                                            ///< programmed the hardware
+    ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS = 2,                           ///< The number of low level driver communication errors have occurred
+    ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS = 3,                          ///< The number of errors that have occurred in the compute accelerator
+                                                                            ///< hardware
+    ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS = 4,                      ///< The number of errors that have occurred in the fixed-function
+                                                                            ///< accelerator hardware
+    ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS = 5,                            ///< The number of errors that have occurred in caches (L1/L3/register
+                                                                            ///< file/shared local memory/sampler)
+    ZES_RAS_ERROR_CATEGORY_EXP_DISPLAY_ERRORS = 6,                          ///< The number of errors that have occurred in the display
+    ZES_RAS_ERROR_CATEGORY_EXP_MEMORY_ERRORS = 7,                           ///< The number of errors that have occurred in Memory
+    ZES_RAS_ERROR_CATEGORY_EXP_SCALE_ERRORS = 8,                            ///< The number of errors that have occurred in Scale Fabric
+    ZES_RAS_ERROR_CATEGORY_EXP_L3FABRIC_ERRORS = 9,                         ///< The number of errors that have occurred in L3 Fabric
+    ZES_RAS_ERROR_CATEGORY_EXP_FORCE_UINT32 = 0x7fffffff
+
+} zes_ras_error_category_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extension structure for providing RAS error counters for different
+///        error sets
+typedef struct _zes_ras_state_exp_t
+{
+    zes_ras_error_category_exp_t category;                                  ///< [out] category for which error counter is provided.
+    uint64_t errorCounter;                                                  ///< [out] Current value of RAS counter for specific error category.
+
+} zes_ras_state_exp_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ras Get State
+/// 
+/// @details
+///     - This function retrieves error counters for different RAS error
+///       categories.
+///     - The application may call this function from simultaneous threads.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hRas`
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `nullptr == pCount`
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesRasGetStateExp(
+    zes_ras_handle_t hRas,                                                  ///< [in] Handle for the component.
+    uint32_t* pCount,                                                       ///< [in,out] pointer to the number of RAS state structures that can be retrieved.
+                                                                            ///< if count is zero, then the driver shall update the value with the
+                                                                            ///< total number of error categories for which state can be retrieved.
+                                                                            ///< if count is greater than the number of RAS states available, then the
+                                                                            ///< driver shall update the value with the correct number of RAS states available.
+    zes_ras_state_exp_t* pState                                             ///< [in,out][optional][range(0, *pCount)] array of query results for RAS
+                                                                            ///< error states for different categories.
+                                                                            ///< if count is less than the number of RAS states available, then driver
+                                                                            ///< shall only retrieve that number of RAS states.
+    );
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Ras Clear State
+/// 
+/// @details
+///     - This function clears error counters for a RAS error category.
+///     - Clearing errors will affect other threads/applications - the counter
+///       values will start from zero.
+///     - Clearing errors requires write permissions.
+///     - The application should not call this function from simultaneous
+///       threads.
+///     - The implementation of this function should be lock-free.
+/// 
+/// @returns
+///     - ::ZE_RESULT_SUCCESS
+///     - ::ZE_RESULT_ERROR_UNINITIALIZED
+///     - ::ZE_RESULT_ERROR_DEVICE_LOST
+///     - ::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
+///     - ::ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+///     - ::ZE_RESULT_ERROR_INVALID_NULL_HANDLE
+///         + `nullptr == hRas`
+///     - ::ZE_RESULT_ERROR_INVALID_ENUMERATION
+///         + `::ZES_RAS_ERROR_CATEGORY_EXP_L3FABRIC_ERRORS < category`
+///     - ::ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS
+///         + Don't have permissions to clear error counters.
+ZE_APIEXPORT ze_result_t ZE_APICALL
+zesRasClearStateExp(
+    zes_ras_handle_t hRas,                                                  ///< [in] Handle for the component.
+    zes_ras_error_category_exp_t category                                   ///< [in] category for which error counter is to be cleared.
     );
 
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Level-Zero Sysman Extension APIs for Memory State
+#if !defined(__GNUC__)
+#pragma region memPageOfflineState
+#endif
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZES_MEM_PAGE_OFFLINE_STATE_EXP_NAME
+/// @brief Memory State Extension Name
+#define ZES_MEM_PAGE_OFFLINE_STATE_EXP_NAME  "ZES_extension_mem_state"
+#endif // ZES_MEM_PAGE_OFFLINE_STATE_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Memory State Extension Version(s)
+typedef enum _zes_mem_page_offline_state_exp_version_t
+{
+    ZES_MEM_PAGE_OFFLINE_STATE_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),   ///< version 1.0
+    ZES_MEM_PAGE_OFFLINE_STATE_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),   ///< latest known version
+    ZES_MEM_PAGE_OFFLINE_STATE_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} zes_mem_page_offline_state_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extension properties for Memory State
+/// 
+/// @details
+///     - This structure may be returned from ::zesMemoryGetState via the
+///       `pNext` member of ::zes_mem_state_t
+///     - These additional parameters get Memory Page Offline Metrics
+typedef struct _zes_mem_page_offline_state_exp_t
+{
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    const void* pNext;                                                      ///< [in][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    uint32_t memoryPageOffline;                                             ///< [out] Returns the number of Memory Pages Offline
+    uint32_t maxMemoryPageOffline;                                          ///< [out] Returns the Allowed Memory Pages Offline
+
+} zes_mem_page_offline_state_exp_t;
+
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Level-Zero Sysman Extension APIs for Memory Timestamp Valid Bits
+#if !defined(__GNUC__)
+#pragma region memoryTimestampValidBits
+#endif
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZES_MEMORY_TIMESTAMP_VALID_BITS_EXP_NAME
+/// @brief Memory Timestamp Valid Bits Extension Name
+#define ZES_MEMORY_TIMESTAMP_VALID_BITS_EXP_NAME  "ZES_extension_mem_timestamp_valid_bits"
+#endif // ZES_MEMORY_TIMESTAMP_VALID_BITS_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Memory Timestamp Valid Bits Extension Version(s)
+typedef enum _zes_mem_timestamp_bits_exp_version_t
+{
+    ZES_MEM_TIMESTAMP_BITS_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),       ///< version 1.0
+    ZES_MEM_TIMESTAMP_BITS_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),   ///< latest known version
+    ZES_MEM_TIMESTAMP_BITS_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} zes_mem_timestamp_bits_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extension properties for reporting valid bit count for memory
+///        timestamp value
+/// 
+/// @details
+///     - This structure may be returned from ::zesMemoryGetProperties via the
+///       `pNext` member of ::zes_mem_properties_t.
+///     - Used for denoting number of valid bits in the timestamp value returned
+///       in ::zes_mem_bandwidth_t.
+typedef struct _zes_mem_timestamp_bits_exp_t
+{
+    uint32_t memoryTimestampValidBits;                                      ///< [out] Returns the number of valid bits in the timestamp values
+
+} zes_mem_timestamp_bits_exp_t;
+
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Level-Zero Sysman Extension APIs for Power Domain Properties
+#if !defined(__GNUC__)
+#pragma region powerDomainProperties
+#endif
+///////////////////////////////////////////////////////////////////////////////
+#ifndef ZES_POWER_DOMAIN_PROPERTIES_EXP_NAME
+/// @brief Power Domain Properties Name
+#define ZES_POWER_DOMAIN_PROPERTIES_EXP_NAME  "ZES_extension_power_domain_properties"
+#endif // ZES_POWER_DOMAIN_PROPERTIES_EXP_NAME
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Power Domain Properties Extension Version(s)
+typedef enum _zes_power_domain_properties_exp_version_t
+{
+    ZES_POWER_DOMAIN_PROPERTIES_EXP_VERSION_1_0 = ZE_MAKE_VERSION( 1, 0 ),  ///< version 1.0
+    ZES_POWER_DOMAIN_PROPERTIES_EXP_VERSION_CURRENT = ZE_MAKE_VERSION( 1, 0 ),  ///< latest known version
+    ZES_POWER_DOMAIN_PROPERTIES_EXP_VERSION_FORCE_UINT32 = 0x7fffffff
+
+} zes_power_domain_properties_exp_version_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Extension structure for providing power domain information associated
+///        with a power handle
+/// 
+/// @details
+///     - This structure may be returned from ::zesPowerGetProperties via the
+///       `pNext` member of ::zes_power_properties_t.
+///     - Used for associating a power handle with a power domain.
+typedef struct _zes_power_domain_exp_properties_t
+{
+    zes_structure_type_t stype;                                             ///< [in] type of this structure
+    void* pNext;                                                            ///< [in,out][optional] must be null or a pointer to an extension-specific
+                                                                            ///< structure (i.e. contains stype and pNext).
+    zes_power_domain_t powerDomain;                                         ///< [out] Power domain associated with the power handle.
+
+} zes_power_domain_exp_properties_t;
+
 #if !defined(__GNUC__)
 #pragma endregion
 #endif
diff --git a/deps/mpi/bin/hydra_bstrap_proxy b/deps/mpi/bin/hydra_bstrap_proxy
index b99341f56..340699afc 100755
Binary files a/deps/mpi/bin/hydra_bstrap_proxy and b/deps/mpi/bin/hydra_bstrap_proxy differ
diff --git a/deps/mpi/bin/hydra_nameserver b/deps/mpi/bin/hydra_nameserver
index 3eb0844b5..fa04c95fd 100755
Binary files a/deps/mpi/bin/hydra_nameserver and b/deps/mpi/bin/hydra_nameserver differ
diff --git a/deps/mpi/bin/hydra_pmi_proxy b/deps/mpi/bin/hydra_pmi_proxy
index 595fa9cf4..bc4178c9e 100755
Binary files a/deps/mpi/bin/hydra_pmi_proxy and b/deps/mpi/bin/hydra_pmi_proxy differ
diff --git a/deps/mpi/bin/mpiexec b/deps/mpi/bin/mpiexec
index 607c67dba..87e9c764c 100755
Binary files a/deps/mpi/bin/mpiexec and b/deps/mpi/bin/mpiexec differ
diff --git a/deps/mpi/bin/mpiexec.hydra b/deps/mpi/bin/mpiexec.hydra
index 607c67dba..87e9c764c 100755
Binary files a/deps/mpi/bin/mpiexec.hydra and b/deps/mpi/bin/mpiexec.hydra differ
diff --git a/deps/mpi/bin/mpigcc b/deps/mpi/bin/mpigcc
index 17290ae1b..418fd9cdd 100755
--- a/deps/mpi/bin/mpigcc
+++ b/deps/mpi/bin/mpigcc
@@ -126,7 +126,7 @@ CC="gcc"
 MPICH_VERSION="3.4a2"
 CFLAGS=""
 CPPFLAGS=""
-MPIVERSION="2021.11"
+MPIVERSION="2021.12"
 MPILIBNAME="mpi"                           
 
 
diff --git a/deps/mpi/bin/mpigxx b/deps/mpi/bin/mpigxx
index 36504441f..fe1567b3d 100755
--- a/deps/mpi/bin/mpigxx
+++ b/deps/mpi/bin/mpigxx
@@ -123,7 +123,7 @@ fi
 CXX="g++"
 MPICH_VERSION="3.4a2"
 CXXFLAGS=""
-MPIVERSION="2021.11"
+MPIVERSION="2021.12"
 MPILIBNAME="mpi"
 MPICXXLIBNAME="mpicxx"
 
diff --git a/deps/mpi/include/mpi.h b/deps/mpi/include/mpi.h
index 3b2004544..aab83f9d6 100644
--- a/deps/mpi/include/mpi.h
+++ b/deps/mpi/include/mpi.h
@@ -599,8 +599,8 @@ typedef int (MPI_Delete_function) ( MPI_Comm, int, void *, void * );
  * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So,
  * 2019.0.0b0 will have the numeric version 20190000100.
  */
-#define I_MPI_VERSION "2021.11.0"
-#define I_MPI_NUMVERSION 20211100300
+#define I_MPI_VERSION "2021.12.0"
+#define I_MPI_NUMVERSION 20211200300
 
 /* for the datatype decoders */
 enum MPIR_Combiner_enum {
@@ -1016,6 +1016,16 @@ typedef int (MPI_Datarep_extent_function)(MPI_Datatype datatype, MPI_Aint *,
                       void *);
 #define MPI_CONVERSION_FN_NULL ((MPI_Datarep_conversion_function *)0)
 
+typedef int (MPI_Datarep_conversion_function_c)(void *, MPI_Datatype, MPI_Count,
+             void *, MPI_Offset, void *);
+#define MPI_CONVERSION_FN_NULL_C ((MPI_Datarep_conversion_function_c *)0)
+
+typedef struct {
+    void **storage_stack;
+} QMPI_Context;
+
+#define QMPI_MAX_TOOL_NAME_LENGTH 256
+
 /* 
    For systems that may need to add additional definitions to support
    different declaration styles and options (e.g., different calling 
@@ -1130,58 +1140,246 @@ int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype, void *outbuf
 int MPI_Unpack(const void *inbuf, int insize, int *position, void *outbuf, int outcount,
                MPI_Datatype datatype, MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, int *size) MPICH_API_PUBLIC;
-int MPI_Barrier(MPI_Comm comm) MPICH_API_PUBLIC;
-int MPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
-              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-               int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
-               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root,
-                MPI_Comm comm)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Scatterv(const void *sendbuf, const int *sendcounts, const int *displs,
-                 MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
-                 int root, MPI_Comm comm)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
+int MPI_Op_create(MPI_User_function *user_fn, int commute, MPI_Op *op) MPICH_API_PUBLIC;
+int MPI_Op_free(MPI_Op *op) MPICH_API_PUBLIC;
 int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                   int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Allgather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                       int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                       MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                   const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPI_Comm comm)
+                   const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+                   MPI_Comm comm)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Allgatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                        const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+                        MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                  MPI_Comm comm)
+                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Allreduce_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                       MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
                  int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Alltoallv(const void *sendbuf, const int *sendcounts, const int *sdispls,
-                  MPI_Datatype sendtype, void *recvbuf, const int *recvcounts,
-                  const int *rdispls, MPI_Datatype recvtype, MPI_Comm comm)
+int MPI_Alltoall_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                      int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                      MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                  MPI_Datatype sendtype, void *recvbuf, const int recvcounts[], const int rdispls[],
+                  MPI_Datatype recvtype, MPI_Comm comm)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int MPI_Alltoallv_init(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                       MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                       const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                       MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
 int MPI_Alltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[],
                   const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
-                  const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
-int MPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-               MPI_Op op, MPI_Comm comm)
+                  const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm)
+                  MPICH_API_PUBLIC;
+int MPI_Alltoallw_init(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                       const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                       const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
+                       MPI_Info info, MPI_Request *request) MPICH_API_PUBLIC;
+int MPI_Barrier(MPI_Comm comm) MPICH_API_PUBLIC;
+int MPI_Barrier_init(MPI_Comm comm, MPI_Info info, MPI_Request *request) MPICH_API_PUBLIC;
+int MPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Bcast_init(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm,
+                   MPI_Info info, MPI_Request *request)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+               MPI_Comm comm)
                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-               MPI_Op op, int root, MPI_Comm comm)
+int MPI_Exscan_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                    MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+               int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
+               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Gather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                    int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Info info,
+                    MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root,
+                MPI_Comm comm)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Gatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                     const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root,
+                     MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                   int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                    const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+                    MPI_Comm comm, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                   MPI_Comm comm, MPI_Request *request)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                  int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
+                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                   MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                   const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                   const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                   const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
+                   MPI_Request *request) MPICH_API_PUBLIC;
+int MPI_Ibarrier(MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
+int MPI_Ibcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm,
+               MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                MPI_Comm comm, MPI_Request *request)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
+                MPI_Request *request)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                 const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root,
+                 MPI_Comm comm, MPI_Request *request)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                            void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                            MPI_Request *request)
+                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                             void *recvbuf, const int recvcounts[], const int displs[],
+                             MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
+                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                           int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                           MPI_Request *request)
+                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                            MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                            const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
+                            MPI_Request *request)
+                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[],
+                            const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                            const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
+                            MPI_Request *request) MPICH_API_PUBLIC;
+int MPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                int root, MPI_Comm comm, MPI_Request *request)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[],
+                        MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
+                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm,
+                              MPI_Request *request)
+                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+              MPI_Comm comm, MPI_Request *request)
+              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                 int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
+                 MPI_Request *request)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[],
+                  MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                  int root, MPI_Comm comm, MPI_Request *request)
+                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
+int MPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                           int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
+                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Neighbor_allgather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                                MPI_Info info, MPI_Request *request)
+                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                            void *recvbuf, const int recvcounts[], const int displs[],
+                            MPI_Datatype recvtype, MPI_Comm comm)
+                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Neighbor_allgatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                 void *recvbuf, const int recvcounts[], const int displs[],
+                                 MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                                 MPI_Request *request)
+                                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                          int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
+                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoall_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                               void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                               MPI_Info info, MPI_Request *request)
+                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                           MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                           const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm)
+                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoallv_init(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                                MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                                const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
+                                MPI_Info info, MPI_Request *request)
+                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[],
+                           const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                           const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm)
+                           MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoallw_init(const void *sendbuf, const int sendcounts[],
+                                const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+                                void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[],
+                                const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Info info,
+                                MPI_Request *request) MPICH_API_PUBLIC;
+int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+               int root, MPI_Comm comm)
                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Op_create(MPI_User_function *user_fn, int commute, MPI_Op *op) MPICH_API_PUBLIC;
-int MPI_Op_free(MPI_Op *op) MPICH_API_PUBLIC;
-int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                  MPI_Op op, MPI_Comm comm)
-                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                    int root, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_local(const void *inbuf, void *inoutbuf, int count, MPI_Datatype datatype,
+                     MPI_Op op)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Reduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[],
                        MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
+                             MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_scatter_block_init(const void *sendbuf, void *recvbuf, int recvcount,
+                                  MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                                  MPI_Request *request)
+                                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_scatter_init(const void *sendbuf, void *recvbuf, const int recvcounts[],
+                            MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                            MPI_Request *request)
+                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
              MPI_Comm comm)
              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Group_size(MPI_Group group, int *size) MPICH_API_PUBLIC;
-int MPI_Group_rank(MPI_Group group, int *rank) MPICH_API_PUBLIC;
+int MPI_Scan_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                  MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Scatter_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                     int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Info info,
+                     MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Scatterv(const void *sendbuf, const int sendcounts[], const int displs[],
+                 MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                 int root, MPI_Comm comm)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
+int MPI_Scatterv_init(const void *sendbuf, const int sendcounts[], const int displs[],
+                      MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                      int root, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
+
+IMPI_DEVICE_EXPORT int MPI_Group_size(MPI_Group group, int *size) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Group_rank(MPI_Group group, int *rank) MPICH_API_PUBLIC;
 int MPI_Group_translate_ranks(MPI_Group group1, int n, const int ranks1[], MPI_Group group2,
                               int ranks2[]) MPICH_API_PUBLIC;
 int MPI_Group_compare(MPI_Group group1, MPI_Group group2, int *result) MPICH_API_PUBLIC;
@@ -1193,7 +1391,7 @@ int MPI_Group_incl(MPI_Group group, int n, const int ranks[], MPI_Group *newgrou
 int MPI_Group_excl(MPI_Group group, int n, const int ranks[], MPI_Group *newgroup) MPICH_API_PUBLIC;
 int MPI_Group_range_incl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup) MPICH_API_PUBLIC;
 int MPI_Group_range_excl(MPI_Group group, int n, int ranges[][3], MPI_Group *newgroup) MPICH_API_PUBLIC;
-int MPI_Group_free(MPI_Group *group) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Group_free(MPI_Group *group) MPICH_API_PUBLIC;
 int MPI_Comm_size(MPI_Comm comm, int *size) MPICH_API_PUBLIC;
 int MPI_Comm_rank(MPI_Comm comm, int *rank) MPICH_API_PUBLIC;
 int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int *result) MPICH_API_PUBLIC;
@@ -1291,13 +1489,21 @@ int MPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info, MPI_
                    MPI_Win *win) MPICH_API_PUBLIC;
 IMPI_DEVICE_EXPORT int MPI_Win_fence(int assert, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_free(MPI_Win *win) MPICH_API_PUBLIC;
-int MPI_Win_get_group(MPI_Win win, MPI_Group *group) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Win_get_group(MPI_Win win, MPI_Group *group) MPICH_API_PUBLIC;
 IMPI_DEVICE_EXPORT int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_post(MPI_Group group, int assert, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_start(MPI_Group group, int assert, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_test(MPI_Win win, int *flag) MPICH_API_PUBLIC;
 IMPI_DEVICE_EXPORT int MPI_Win_unlock(int rank, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_wait(MPI_Win win) MPICH_API_PUBLIC;
+int MPI_Win_allocate_c(MPI_Aint size, MPI_Aint disp_unit, MPI_Info info, MPI_Comm comm,
+                       void *baseptr, MPI_Win *win) MPICH_API_PUBLIC;
+int MPI_Win_allocate_shared_c(MPI_Aint size, MPI_Aint disp_unit, MPI_Info info, MPI_Comm comm,
+                              void *baseptr, MPI_Win *win) MPICH_API_PUBLIC;
+int MPI_Win_create_c(void *base, MPI_Aint size, MPI_Aint disp_unit, MPI_Info info, MPI_Comm comm,
+                     MPI_Win *win) MPICH_API_PUBLIC;
+int MPI_Win_shared_query_c(MPI_Win win, int rank, MPI_Aint *size, MPI_Aint *disp_unit,
+                           void *baseptr) MPICH_API_PUBLIC;
 
 /* MPI-3 One-Sided Communication Routines */
 int MPI_Win_allocate(MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, void *baseptr,
@@ -1480,14 +1686,7 @@ int MPI_Type_create_f90_integer(int range, MPI_Datatype *newtype) MPICH_API_PUBL
 int MPI_Type_create_f90_real(int precision, int range, MPI_Datatype *newtype) MPICH_API_PUBLIC;
 int MPI_Type_create_f90_complex(int precision, int range, MPI_Datatype *newtype) MPICH_API_PUBLIC;
 
-int MPI_Reduce_local(const void *inbuf, void *inoutbuf, int count, MPI_Datatype datatype,
-                     MPI_Op op)
-                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Op_commutative(MPI_Op op, int *commute) MPICH_API_PUBLIC;
-int MPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
-                             MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Dist_graph_create_adjacent(MPI_Comm comm_old, int indegree, const int sources[],
                                    const int sourceweights[], int outdegree,
                                    const int destinations[], const int destweights[],
@@ -1510,112 +1709,6 @@ int MPI_Mrecv(void *buf, int count, MPI_Datatype datatype, MPI_Message *message,
 
 /* Nonblocking collectives */
 int MPI_Comm_idup(MPI_Comm comm, MPI_Comm *newcomm, MPI_Request *request) MPICH_API_PUBLIC;
-int MPI_Ibarrier(MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
-int MPI_Ibcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm,
-               MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
-                MPI_Request *request)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                 const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root,
-                 MPI_Comm comm, MPI_Request *request)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int MPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                 int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
-                 MPI_Request *request)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[],
-                  MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
-                  int root, MPI_Comm comm, MPI_Request *request)
-                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
-int MPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                   int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                    const int recvcounts[], const int displs[], MPI_Datatype recvtype,
-                    MPI_Comm comm, MPI_Request *request)
-                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int MPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                  int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
-                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                   MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
-                   const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
-                   MPI_Request *request)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int MPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                   const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
-                   const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
-                   MPI_Request *request) MPICH_API_PUBLIC;
-int MPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                MPI_Op op, int root, MPI_Comm comm, MPI_Request *request)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                   MPI_Op op, MPI_Comm comm, MPI_Request *request)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[],
-                        MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request)
-                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
-                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm,
-                              MPI_Request *request)
-                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4)
-                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
-              MPI_Comm comm, MPI_Request *request)
-              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                MPI_Op op, MPI_Comm comm, MPI_Request *request)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-
-/* Neighborhood collectives */
-int MPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                            void *recvbuf, int recvcount, MPI_Datatype recvtype,
-                            MPI_Comm comm, MPI_Request *request)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                             void *recvbuf, const int recvcounts[], const int displs[],
-                             MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int MPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                           void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
-                           MPI_Request *request)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                            MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
-                            const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
-                            MPI_Request *request)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int MPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[],
-                            const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
-                            void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[],
-                            const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
-int MPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                           void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                            void *recvbuf, const int recvcounts[], const int displs[],
-                            MPI_Datatype recvtype, MPI_Comm comm)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int MPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                          void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
-                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                           MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
-                           const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int MPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[],
-                           const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
-                           const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
 
 /* Shared memory */
 int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm) MPICH_API_PUBLIC;
@@ -1795,56 +1888,8 @@ int PMPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype, void *outbu
 int PMPI_Unpack(const void *inbuf, int insize, int *position, void *outbuf, int outcount,
                 MPI_Datatype datatype, MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, int *size) MPICH_API_PUBLIC;
-int PMPI_Barrier(MPI_Comm comm) MPICH_API_PUBLIC;
-int PMPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
-               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int PMPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                 const int *recvcounts, const int *displs, MPI_Datatype recvtype, int root,
-                 MPI_Comm comm)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int PMPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                 int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Scatterv(const void *sendbuf, const int *sendcounts, const int *displs,
-                  MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
-                  int root, MPI_Comm comm)
-                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
-int PMPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                   int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                    const int *recvcounts, const int *displs, MPI_Datatype recvtype, MPI_Comm comm)
-                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int PMPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                  int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
-                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Alltoallv(const void *sendbuf, const int *sendcounts, const int *sdispls,
-                   MPI_Datatype sendtype, void *recvbuf, const int *recvcounts,
-                   const int *rdispls, MPI_Datatype recvtype, MPI_Comm comm)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int PMPI_Alltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                   const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
-                   const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
-int PMPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                MPI_Op op, MPI_Comm comm)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                MPI_Op op, int root, MPI_Comm comm)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Op_create(MPI_User_function *user_fn, int commute, MPI_Op *op) MPICH_API_PUBLIC;
 int PMPI_Op_free(MPI_Op *op) MPICH_API_PUBLIC;
-int PMPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                   MPI_Op op, MPI_Comm comm)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Reduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[],
-                        MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
-                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
-              MPI_Comm comm)
-              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Group_size(MPI_Group group, int *size) MPICH_API_PUBLIC;
 int PMPI_Group_rank(MPI_Group group, int *rank) MPICH_API_PUBLIC;
 int PMPI_Group_translate_ranks(MPI_Group group1, int n, const int ranks1[], MPI_Group group2,
@@ -2173,118 +2218,8 @@ int PMPI_Mrecv(void *buf, int count, MPI_Datatype datatype, MPI_Message *message
 
 /* Nonblocking collectives */
 int PMPI_Comm_idup(MPI_Comm comm, MPI_Comm *newcomm, MPI_Request *request) MPICH_API_PUBLIC;
-int PMPI_Ibarrier(MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
-int PMPI_Ibcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm,
-                MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int PMPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                 int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
-                 MPI_Request *request)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                  const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root,
-                  MPI_Comm comm, MPI_Request *request)
-                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int PMPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                  int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
-                  MPI_Request *request)
-                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[],
-                   MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
-                   int root, MPI_Comm comm, MPI_Request *request)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
-int PMPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                    int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
-                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                     const int recvcounts[], const int displs[], MPI_Datatype recvtype,
-                     MPI_Comm comm, MPI_Request *request)
-                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int PMPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
-                   int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                    MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
-                    const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
-                    MPI_Request *request)
-                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int PMPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                    const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
-                    const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
-                    MPI_Request *request) MPICH_API_PUBLIC;
-int PMPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                 MPI_Op op, int root, MPI_Comm comm, MPI_Request *request)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                    MPI_Op op, MPI_Comm comm, MPI_Request *request)
-                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[],
-                         MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request)
-                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4)
-                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
-                               MPI_Datatype datatype, MPI_Op op, MPI_Comm comm,
-                               MPI_Request *request)
-                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4)
-                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
-               MPI_Comm comm, MPI_Request *request)
-               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
-                 MPI_Op op, MPI_Comm comm, MPI_Request *request)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-
-/* Neighborhood collectives */
-int PMPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                             void *recvbuf, int recvcount, MPI_Datatype recvtype,
-                             MPI_Comm comm, MPI_Request *request)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                              void *recvbuf, const int recvcounts[], const int displs[],
-                              MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
-                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int PMPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                            void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
-                            MPI_Request *request)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                             MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
-                             const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
-                             MPI_Request *request)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int PMPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[],
-                             const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
-                             void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[],
-                             const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
-int PMPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                            void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                             void *recvbuf, const int recvcounts[], const int displs[],
-                             MPI_Datatype recvtype, MPI_Comm comm)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int PMPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
-                           void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
-                            MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
-                            const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4)
-                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int PMPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[],
-                            const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
-                            const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
-                            MPI_Comm comm) MPICH_API_PUBLIC;
-
 /* Shared memory */
 int PMPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info, MPI_Comm *newcomm) MPICH_API_PUBLIC;
-
 /* Noncollective communicator creation */
 int PMPI_Comm_create_group(MPI_Comm comm, MPI_Group group, int tag, MPI_Comm *newcomm) MPICH_API_PUBLIC;
 
@@ -2356,44 +2291,84 @@ int PMPIX_Comm_agree(MPI_Comm comm, int *flag) MPICH_API_PUBLIC;
 int MPI_Allgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                     MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm)
                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Allgather_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                         void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                         MPI_Info info, MPI_Request *request)
+                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Allgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                     const MPI_Count *recvcounts, const MPI_Aint *displs, MPI_Datatype recvtype, MPI_Comm comm)
+                     const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype,
+                     MPI_Comm comm)
                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Allgatherv_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                          void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
+                          MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                          MPI_Request *request)
+                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int MPI_Allreduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                     MPI_Op op, MPI_Comm comm)
                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int MPI_Reduce_scatter_c(const void *sendbuf, void *recvbuf, const MPI_Count recvcounts[],
-                         MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+int MPI_Allreduce_init_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
+                         MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request)
                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Alltoall_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                    MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Alltoall_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                        void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                        MPI_Info info, MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Alltoallv_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint sdispls[],
                     MPI_Datatype sendtype, void *recvbuf, const MPI_Count recvcounts[],
                     const MPI_Aint rdispls[], MPI_Datatype recvtype, MPI_Comm comm)
                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int MPI_Alltoallv_init_c(const void *sendbuf, const MPI_Count sendcounts[],
+                         const MPI_Aint sdispls[], MPI_Datatype sendtype, void *recvbuf,
+                         const MPI_Count recvcounts[], const MPI_Aint rdispls[],
+                         MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
 int MPI_Alltoallw_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint sdispls[],
                     const MPI_Datatype sendtypes[], void *recvbuf, const MPI_Count recvcounts[],
-                    const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
+                    const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm)
+                    MPICH_API_PUBLIC;
+int MPI_Alltoallw_init_c(const void *sendbuf, const MPI_Count sendcounts[],
+                         const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf,
+                         const MPI_Count recvcounts[], const MPI_Aint rdispls[],
+                         const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Info info,
+                         MPI_Request *request) MPICH_API_PUBLIC;
 int MPI_Bcast_c(void *buffer, MPI_Count count, MPI_Datatype datatype, int root, MPI_Comm comm)
     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Bcast_init_c(void *buffer, MPI_Count count, MPI_Datatype datatype, int root, MPI_Comm comm,
+                     MPI_Info info, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
 int MPI_Exscan_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                  MPI_Op op, MPI_Comm comm)
                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Exscan_init_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
+                      MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Gather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                  MPI_Count recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Gather_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                      void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, int root,
+                      MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Gatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                  const MPI_Count *recvcounts, const MPI_Aint *displs, MPI_Datatype recvtype, int root,
-                  MPI_Comm comm)
+                  const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype,
+                  int root, MPI_Comm comm)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Gatherv_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                       void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
+                       MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Info info,
+                       MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int MPI_Iallgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                      MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm,
                      MPI_Request *request)
                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Iallgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                      const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype,
-                      MPI_Comm comm, MPI_Request *request)
+int MPI_Iallgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                      void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
+                      MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int MPI_Iallreduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                      MPI_Op op, MPI_Comm comm, MPI_Request *request)
@@ -2421,8 +2396,8 @@ int MPI_Igather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtyp
                   MPI_Request *request)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Igatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                   const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype, int root,
-                   MPI_Comm comm, MPI_Request *request)
+                   const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype,
+                   int root, MPI_Comm comm, MPI_Request *request)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int MPI_Ineighbor_allgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                               void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
@@ -2431,8 +2406,7 @@ int MPI_Ineighbor_allgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Data
 int MPI_Ineighbor_allgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                                void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
                                MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
-                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int MPI_Ineighbor_alltoall_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                              void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
                              MPI_Comm comm, MPI_Request *request)
@@ -2445,7 +2419,8 @@ int MPI_Ineighbor_alltoallv_c(const void *sendbuf, const MPI_Count sendcounts[],
 int MPI_Ineighbor_alltoallw_c(const void *sendbuf, const MPI_Count sendcounts[],
                               const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
                               void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint rdispls[],
-                              const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
+                              const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request)
+                              MPICH_API_PUBLIC;
 int MPI_Ireduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                   MPI_Op op, int root, MPI_Comm comm, MPI_Request *request)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
@@ -2464,254 +2439,733 @@ int MPI_Iscatter_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendty
                    MPI_Request *request)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Iscatterv_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint displs[],
-                    MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
-                    int root, MPI_Comm comm, MPI_Request *request)
+                    MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount,
+                    MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request)
                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
 int MPI_Neighbor_allgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                              void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
                              MPI_Comm comm)
                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Neighbor_allgather_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
+                                  MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Neighbor_allgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                               void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
                               MPI_Datatype recvtype, MPI_Comm comm)
-                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int MPI_Neighbor_allgatherv_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, const MPI_Count recvcounts[],
+                                   const MPI_Aint displs[], MPI_Datatype recvtype, MPI_Comm comm,
+                                   MPI_Info info, MPI_Request *request)
+                                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int MPI_Neighbor_alltoall_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                             void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
                             MPI_Comm comm)
                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoall_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                                 void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
+                                 MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int MPI_Neighbor_alltoallv_c(const void *sendbuf, const MPI_Count sendcounts[],
                              const MPI_Aint sdispls[], MPI_Datatype sendtype, void *recvbuf,
                              const MPI_Count recvcounts[], const MPI_Aint rdispls[],
                              MPI_Datatype recvtype, MPI_Comm comm)
                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int MPI_Neighbor_alltoallw_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint sdispls[],
-                             const MPI_Datatype sendtypes[], void *recvbuf, const MPI_Count recvcounts[],
-                             const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoallv_init_c(const void *sendbuf, const MPI_Count sendcounts[],
+                                  const MPI_Aint sdispls[], MPI_Datatype sendtype, void *recvbuf,
+                                  const MPI_Count recvcounts[], const MPI_Aint rdispls[],
+                                  MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                                  MPI_Request *request)
+                                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoallw_c(const void *sendbuf, const MPI_Count sendcounts[],
+                             const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+                             void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint rdispls[],
+                             const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
+int MPI_Neighbor_alltoallw_init_c(const void *sendbuf, const MPI_Count sendcounts[],
+                                  const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+                                  void *recvbuf, const MPI_Count recvcounts[],
+                                  const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
+                                  MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                                  MPICH_API_PUBLIC;
 int MPI_Reduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                  MPI_Op op, int root, MPI_Comm comm)
                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_init_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
+                      MPI_Op op, int root, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_scatter_c(const void *sendbuf, void *recvbuf, const MPI_Count recvcounts[],
+                         MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Reduce_scatter_block_c(const void *sendbuf, void *recvbuf, MPI_Count recvcount,
                                MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_scatter_block_init_c(const void *sendbuf, void *recvbuf, MPI_Count recvcount,
+                                    MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                                    MPI_Request *request)
+                                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Reduce_scatter_init_c(const void *sendbuf, void *recvbuf, const MPI_Count recvcounts[],
+                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                              MPI_Request *request)
+                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Scan_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                MPI_Op op, MPI_Comm comm)
                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int MPI_Scan_init_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
+                    MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int MPI_Scatter_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                   MPI_Count recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Scatterv_c(const void *sendbuf, const MPI_Count *sendcounts, const MPI_Aint *displs,
+int MPI_Scatter_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                       void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, int root,
+                       MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Scatterv_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint displs[],
                    MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
                    int root, MPI_Comm comm)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
-
-
+int MPI_Scatterv_init_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint displs[],
+                        MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount,
+                        MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Info info,
+                        MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
+int MPI_Get_count_c(const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count)
+    MPICH_API_PUBLIC;
+int MPI_Get_elements_c(const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count)
+    MPICH_API_PUBLIC;
+int MPI_Pack_c(const void *inbuf, MPI_Count incount, MPI_Datatype datatype, void *outbuf,
+               MPI_Count outsize, MPI_Count *position, MPI_Comm comm) MPICH_API_PUBLIC;
+int MPI_Pack_external_c(const char *datarep, const void *inbuf, MPI_Count incount,
+                        MPI_Datatype datatype, void *outbuf, MPI_Count outsize,
+                        MPI_Count *position) MPICH_API_PUBLIC;
+int MPI_Pack_external_size_c(const char *datarep, MPI_Count incount, MPI_Datatype datatype,
+                             MPI_Count *size) MPICH_API_PUBLIC;
+int MPI_Pack_size_c(MPI_Count incount, MPI_Datatype datatype, MPI_Comm comm, MPI_Count *size)
+    MPICH_API_PUBLIC;
+int MPI_Status_set_elements_c(MPI_Status *status, MPI_Datatype datatype, MPI_Count count)
+    MPICH_API_PUBLIC;
+int MPI_Type_contiguous_c(MPI_Count count, MPI_Datatype oldtype, MPI_Datatype *newtype)
+    MPICH_API_PUBLIC;
+int MPI_Type_create_darray_c(int size, int rank, int ndims, const MPI_Count array_of_gsizes[],
+                             const int array_of_distribs[], const int array_of_dargs[],
+                             const int array_of_psizes[], int order, MPI_Datatype oldtype,
+                             MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Type_create_hindexed_c(MPI_Count count, const MPI_Count array_of_blocklengths[],
+                               const MPI_Count array_of_displacements[], MPI_Datatype oldtype,
+                               MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Type_create_hindexed_block_c(MPI_Count count, MPI_Count blocklength,
+                                     const MPI_Count array_of_displacements[], MPI_Datatype oldtype,
+                                     MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Type_create_hvector_c(MPI_Count count, MPI_Count blocklength, MPI_Count stride,
+                              MPI_Datatype oldtype, MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Type_create_indexed_block_c(MPI_Count count, MPI_Count blocklength,
+                                    const MPI_Count array_of_displacements[], MPI_Datatype oldtype,
+                                    MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Type_create_resized_c(MPI_Datatype oldtype, MPI_Count lb, MPI_Count extent,
+                              MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Type_create_struct_c(MPI_Count count, const MPI_Count array_of_blocklengths[],
+                             const MPI_Count array_of_displacements[],
+                             const MPI_Datatype array_of_types[], MPI_Datatype *newtype)
+                             MPICH_API_PUBLIC;
+int MPI_Type_create_subarray_c(int ndims, const MPI_Count array_of_sizes[],
+                               const MPI_Count array_of_subsizes[],
+                               const MPI_Count array_of_starts[], int order, MPI_Datatype oldtype,
+                               MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Type_get_contents_c(MPI_Datatype datatype, MPI_Count max_integers, MPI_Count max_addresses,
+                            MPI_Count max_large_counts, MPI_Count max_datatypes,
+                            int array_of_integers[], MPI_Aint array_of_addresses[],
+                            MPI_Count array_of_large_counts[], MPI_Datatype array_of_datatypes[])
+                            MPICH_API_PUBLIC;
+int MPI_Type_get_envelope_c(MPI_Datatype datatype, MPI_Count *num_integers,
+                            MPI_Count *num_addresses, MPI_Count *num_large_counts,
+                            MPI_Count *num_datatypes, int *combiner) MPICH_API_PUBLIC;
+int MPI_Type_get_extent_c(MPI_Datatype datatype, MPI_Count *lb, MPI_Count *extent)
+    MPICH_API_PUBLIC;
+int MPI_Type_get_true_extent_c(MPI_Datatype datatype, MPI_Count *true_lb, MPI_Count *true_extent)
+    MPICH_API_PUBLIC;
+int MPI_Type_indexed_c(MPI_Count count, const MPI_Count array_of_blocklengths[],
+                       const MPI_Count array_of_displacements[], MPI_Datatype oldtype,
+                       MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Type_size_c(MPI_Datatype datatype, MPI_Count *size) MPICH_API_PUBLIC;
+int MPI_Type_vector_c(MPI_Count count, MPI_Count blocklength, MPI_Count stride,
+                      MPI_Datatype oldtype, MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int MPI_Unpack_c(const void *inbuf, MPI_Count insize, MPI_Count *position, void *outbuf,
+                 MPI_Count outcount, MPI_Datatype datatype, MPI_Comm comm) MPICH_API_PUBLIC;
+int MPI_Unpack_external_c(const char datarep[], const void *inbuf, MPI_Count insize,
+                          MPI_Count *position, void *outbuf, MPI_Count outcount,
+                          MPI_Datatype datatype) MPICH_API_PUBLIC;
+int MPI_Bsend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Bsend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                     MPI_Comm comm, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Buffer_attach_c(void *buffer, MPI_Count size) MPICH_API_PUBLIC;
+int MPI_Buffer_detach_c(void *buffer_addr, MPI_Count *size) MPICH_API_PUBLIC;
+int MPI_Ibsend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                 MPI_Comm comm, MPI_Request *request)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Imrecv_c(void *buf, MPI_Count count, MPI_Datatype datatype, MPI_Message *message,
+                 MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Irecv_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, int tag,
+                MPI_Comm comm, MPI_Request *request)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Irsend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                 MPI_Comm comm, MPI_Request *request)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Isend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                MPI_Comm comm, MPI_Request *request)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Issend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                 MPI_Comm comm, MPI_Request *request)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Mrecv_c(void *buf, MPI_Count count, MPI_Datatype datatype, MPI_Message *message,
+                MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Recv_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, int tag,
+               MPI_Comm comm, MPI_Status *status)
+               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Recv_init_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, int tag,
+                    MPI_Comm comm, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Rsend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Rsend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                     MPI_Comm comm, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Send_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+               MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Send_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                    MPI_Comm comm, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Sendrecv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, int dest,
+                   int sendtag, void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
+                   int source, int recvtag, MPI_Comm comm, MPI_Status *status)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(6,8) MPICH_API_PUBLIC;
+int MPI_Sendrecv_replace_c(void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int sendtag,
+                           int source, int recvtag, MPI_Comm comm, MPI_Status *status)
+                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Ssend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Ssend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
+                     MPI_Comm comm, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Accumulate_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+                     int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+                     MPI_Datatype target_datatype, MPI_Op op, MPI_Win win)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Get_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+              int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+              MPI_Datatype target_datatype, MPI_Win win)
+              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Get_accumulate_c(const void *origin_addr, MPI_Count origin_count,
+                         MPI_Datatype origin_datatype, void *result_addr, MPI_Count result_count,
+                         MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
+                         MPI_Count target_count, MPI_Datatype target_datatype, MPI_Op op,
+                         MPI_Win win)
+                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+              int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+              MPI_Datatype target_datatype, MPI_Win win)
+              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Raccumulate_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+                      int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+                      MPI_Datatype target_datatype, MPI_Op op, MPI_Win win, MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Rget_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+               int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+               MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request)
+               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int MPI_Rget_accumulate_c(const void *origin_addr, MPI_Count origin_count,
+                          MPI_Datatype origin_datatype, void *result_addr, MPI_Count result_count,
+                          MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
+                          MPI_Count target_count, MPI_Datatype target_datatype, MPI_Op op,
+                          MPI_Win win, MPI_Request *request)
+                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int MPI_Rput_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+               int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+               MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request)
+               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                   int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Allgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                      MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm)
                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Allgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                      const MPI_Count *recvcounts, const MPI_Aint *displs, MPI_Datatype recvtype, MPI_Comm comm)
+int PMPI_Allgather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                        int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                        MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Allgather_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                          void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                          MPI_Info info, MPI_Request *request)
+                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                    const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+                    MPI_Comm comm)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Allgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                      void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
+                      MPI_Datatype recvtype, MPI_Comm comm)
                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Allgatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                         const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+                         MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Allgatherv_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                           void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
+                           MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                           MPI_Request *request)
+                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                   MPI_Comm comm)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Allreduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
-                    MPI_Op op, MPI_Comm comm)
-                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
-int PMPI_Reduce_scatter_c(const void *sendbuf, void *recvbuf, const MPI_Count recvcounts[],
-                          MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+                     MPI_Op op, MPI_Comm comm)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Allreduce_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                        MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Allreduce_init_c(const void *sendbuf, void *recvbuf, MPI_Count count,
+                          MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                          MPI_Request *request)
                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                  int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
+                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Alltoall_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                     MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm)
                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Alltoall_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                       int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                       MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Alltoall_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                         void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                         MPI_Info info, MPI_Request *request)
+                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                   MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                   const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
 int PMPI_Alltoallv_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint sdispls[],
                      MPI_Datatype sendtype, void *recvbuf, const MPI_Count recvcounts[],
                      const MPI_Aint rdispls[], MPI_Datatype recvtype, MPI_Comm comm)
                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int PMPI_Alltoallv_init(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                        MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                        const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                        MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int PMPI_Alltoallv_init_c(const void *sendbuf, const MPI_Count sendcounts[],
+                          const MPI_Aint sdispls[], MPI_Datatype sendtype, void *recvbuf,
+                          const MPI_Count recvcounts[], const MPI_Aint rdispls[],
+                          MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                          MPI_Request *request)
+                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int PMPI_Alltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                   const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                   const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm)
+                   MPICH_API_PUBLIC;
 int PMPI_Alltoallw_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint sdispls[],
                      const MPI_Datatype sendtypes[], void *recvbuf, const MPI_Count recvcounts[],
-                     const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
+                     const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm)
+                     MPICH_API_PUBLIC;
+int PMPI_Alltoallw_init(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                        const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                        const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
+                        MPI_Info info, MPI_Request *request) MPICH_API_PUBLIC;
+int PMPI_Alltoallw_init_c(const void *sendbuf, const MPI_Count sendcounts[],
+                          const MPI_Aint sdispls[], const MPI_Datatype sendtypes[], void *recvbuf,
+                          const MPI_Count recvcounts[], const MPI_Aint rdispls[],
+                          const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Info info,
+                          MPI_Request *request) MPICH_API_PUBLIC;
+int PMPI_Barrier(MPI_Comm comm) MPICH_API_PUBLIC;
+int PMPI_Barrier_init(MPI_Comm comm, MPI_Info info, MPI_Request *request) MPICH_API_PUBLIC;
+int PMPI_Bcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
 int PMPI_Bcast_c(void *buffer, MPI_Count count, MPI_Datatype datatype, int root, MPI_Comm comm)
     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Bcast_init(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm,
+                    MPI_Info info, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Bcast_init_c(void *buffer, MPI_Count count, MPI_Datatype datatype, int root, MPI_Comm comm,
+                      MPI_Info info, MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Exscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                MPI_Comm comm)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Exscan_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                   MPI_Op op, MPI_Comm comm)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Exscan_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                     MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Exscan_init_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
+                       MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Gather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Gather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                   MPI_Count recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Gather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                     int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Info info,
+                     MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Gather_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                       void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, int root,
+                       MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Gatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                 const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root,
+                 MPI_Comm comm)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int PMPI_Gatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                   const MPI_Count *recvcounts, const MPI_Aint *displs, MPI_Datatype recvtype, int root,
-                   MPI_Comm comm)
+                   const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype,
+                   int root, MPI_Comm comm)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
-int PMPI_Iallgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                      MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+int PMPI_Gatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                      const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root,
+                      MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Gatherv_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                        void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
+                        MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Info info,
+                        MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Iallgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                    int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Iallgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                      void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm,
                       MPI_Request *request)
                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Iallgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                       const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype,
-                       MPI_Comm comm, MPI_Request *request)
+int PMPI_Iallgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                     const int recvcounts[], const int displs[], MPI_Datatype recvtype,
+                     MPI_Comm comm, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Iallgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                       void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
+                       MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                    MPI_Comm comm, MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Iallreduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                       MPI_Op op, MPI_Comm comm, MPI_Request *request)
                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Ialltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                   int recvcount, MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Ialltoall_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                      MPI_Count recvcount, MPI_Datatype recvtype, MPI_Comm comm,
                      MPI_Request *request)
                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Ialltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                    MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                    const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
+                    MPI_Request *request)
+                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
 int PMPI_Ialltoallv_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint sdispls[],
                       MPI_Datatype sendtype, void *recvbuf, const MPI_Count recvcounts[],
                       const MPI_Aint rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
                       MPI_Request *request)
                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int PMPI_Ialltoallw(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                    const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                    const int rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
+                    MPI_Request *request) MPICH_API_PUBLIC;
 int PMPI_Ialltoallw_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint sdispls[],
-                     const MPI_Datatype sendtypes[], void *recvbuf, const MPI_Count recvcounts[],
-                     const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
-                     MPI_Request *request) MPICH_API_PUBLIC;
+                      const MPI_Datatype sendtypes[], void *recvbuf, const MPI_Count recvcounts[],
+                      const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm,
+                      MPI_Request *request) MPICH_API_PUBLIC;
+int PMPI_Ibarrier(MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
+int PMPI_Ibcast(void *buffer, int count, MPI_Datatype datatype, int root, MPI_Comm comm,
+                MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
 int PMPI_Ibcast_c(void *buffer, MPI_Count count, MPI_Datatype datatype, int root, MPI_Comm comm,
                   MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                 MPI_Comm comm, MPI_Request *request)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Iexscan_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                    MPI_Op op, MPI_Comm comm, MPI_Request *request)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Igather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                 int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
+                 MPI_Request *request)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Igather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                    MPI_Count recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
                    MPI_Request *request)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Igatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                  const int recvcounts[], const int displs[], MPI_Datatype recvtype, int root,
+                  MPI_Comm comm, MPI_Request *request)
+                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int PMPI_Igatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
-                    const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype, int root,
-                    MPI_Comm comm, MPI_Request *request)
+                    const MPI_Count recvcounts[], const MPI_Aint displs[], MPI_Datatype recvtype,
+                    int root, MPI_Comm comm, MPI_Request *request)
                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Ineighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                             void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                             MPI_Request *request)
+                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Ineighbor_allgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                                void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
                                MPI_Comm comm, MPI_Request *request)
                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Ineighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                              void *recvbuf, const int recvcounts[], const int displs[],
+                              MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
+                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int PMPI_Ineighbor_allgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
-                                void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
-                                MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
-                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+                                void *recvbuf, const MPI_Count recvcounts[],
+                                const MPI_Aint displs[], MPI_Datatype recvtype, MPI_Comm comm,
+                                MPI_Request *request)
+                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Ineighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                            void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                            MPI_Request *request)
+                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Ineighbor_alltoall_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                               void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
                               MPI_Comm comm, MPI_Request *request)
                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Ineighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                             MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                             const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
+                             MPI_Request *request)
+                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
 int PMPI_Ineighbor_alltoallv_c(const void *sendbuf, const MPI_Count sendcounts[],
                                const MPI_Aint sdispls[], MPI_Datatype sendtype, void *recvbuf,
                                const MPI_Count recvcounts[], const MPI_Aint rdispls[],
                                MPI_Datatype recvtype, MPI_Comm comm, MPI_Request *request)
                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int PMPI_Ineighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[],
+                             const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                             const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
+                             MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
 int PMPI_Ineighbor_alltoallw_c(const void *sendbuf, const MPI_Count sendcounts[],
                                const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
-                               void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint rdispls[],
-                               const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
+                               void *recvbuf, const MPI_Count recvcounts[],
+                               const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
+                               MPI_Comm comm, MPI_Request *request) MPICH_API_PUBLIC;
+int PMPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                 int root, MPI_Comm comm, MPI_Request *request)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Ireduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                    MPI_Op op, int root, MPI_Comm comm, MPI_Request *request)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[],
+                         MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request)
+                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Ireduce_scatter_c(const void *sendbuf, void *recvbuf, const MPI_Count recvcounts[],
                            MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Request *request)
                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Ireduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
+                               MPI_Datatype datatype, MPI_Op op, MPI_Comm comm,
+                               MPI_Request *request)
+                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Ireduce_scatter_block_c(const void *sendbuf, void *recvbuf, MPI_Count recvcount,
                                  MPI_Datatype datatype, MPI_Op op, MPI_Comm comm,
                                  MPI_Request *request)
                                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+               MPI_Comm comm, MPI_Request *request)
+               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Iscan_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                  MPI_Op op, MPI_Comm comm, MPI_Request *request)
                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Iscatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                  int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
+                  MPI_Request *request)
+                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Iscatter_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                     MPI_Count recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm,
                     MPI_Request *request)
                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Iscatterv(const void *sendbuf, const int sendcounts[], const int displs[],
+                   MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                   int root, MPI_Comm comm, MPI_Request *request)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
 int PMPI_Iscatterv_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint displs[],
-                     MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
-                     int root, MPI_Comm comm, MPI_Request *request)
+                     MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount,
+                     MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Request *request)
                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
+int PMPI_Neighbor_allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                            void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
+                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Neighbor_allgather_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                               void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
                               MPI_Comm comm)
                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Neighbor_allgather_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                 void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                                 MPI_Info info, MPI_Request *request)
+                                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Neighbor_allgather_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                                   void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
+                                   MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Neighbor_allgatherv(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                             void *recvbuf, const int recvcounts[], const int displs[],
+                             MPI_Datatype recvtype, MPI_Comm comm)
+                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
 int PMPI_Neighbor_allgatherv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                                void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint displs[],
                                MPI_Datatype recvtype, MPI_Comm comm)
-                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
-                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Neighbor_allgatherv_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, const int recvcounts[], const int displs[],
+                                  MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                                  MPI_Request *request)
+                                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Neighbor_allgatherv_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                                    void *recvbuf, const MPI_Count recvcounts[],
+                                    const MPI_Aint displs[], MPI_Datatype recvtype, MPI_Comm comm,
+                                    MPI_Info info, MPI_Request *request)
+                                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,7) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoall(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                           int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
+                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Neighbor_alltoall_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
                              void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
                              MPI_Comm comm)
                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoall_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                                void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm,
+                                MPI_Info info, MPI_Request *request)
+                                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoall_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                                  void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
+                                  MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoallv(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                            MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                            const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm)
+                            MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
 int PMPI_Neighbor_alltoallv_c(const void *sendbuf, const MPI_Count sendcounts[],
                               const MPI_Aint sdispls[], MPI_Datatype sendtype, void *recvbuf,
                               const MPI_Count recvcounts[], const MPI_Aint rdispls[],
                               MPI_Datatype recvtype, MPI_Comm comm)
                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
-int PMPI_Neighbor_alltoallw_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint sdispls[],
-                            const MPI_Datatype sendtypes[], void *recvbuf, const MPI_Count recvcounts[],
-                            const MPI_Aint rdispls[], const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoallv_init(const void *sendbuf, const int sendcounts[], const int sdispls[],
+                                 MPI_Datatype sendtype, void *recvbuf, const int recvcounts[],
+                                 const int rdispls[], MPI_Datatype recvtype, MPI_Comm comm,
+                                 MPI_Info info, MPI_Request *request)
+                                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoallv_init_c(const void *sendbuf, const MPI_Count sendcounts[],
+                                   const MPI_Aint sdispls[], MPI_Datatype sendtype, void *recvbuf,
+                                   const MPI_Count recvcounts[], const MPI_Aint rdispls[],
+                                   MPI_Datatype recvtype, MPI_Comm comm, MPI_Info info,
+                                   MPI_Request *request)
+                                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,8) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoallw(const void *sendbuf, const int sendcounts[], const MPI_Aint sdispls[],
+                            const MPI_Datatype sendtypes[], void *recvbuf, const int recvcounts[],
+                            const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
+                            MPI_Comm comm) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoallw_c(const void *sendbuf, const MPI_Count sendcounts[],
+                              const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+                              void *recvbuf, const MPI_Count recvcounts[], const MPI_Aint rdispls[],
+                              const MPI_Datatype recvtypes[], MPI_Comm comm) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoallw_init(const void *sendbuf, const int sendcounts[],
+                                 const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+                                 void *recvbuf, const int recvcounts[], const MPI_Aint rdispls[],
+                                 const MPI_Datatype recvtypes[], MPI_Comm comm, MPI_Info info,
+                                 MPI_Request *request) MPICH_API_PUBLIC;
+int PMPI_Neighbor_alltoallw_init_c(const void *sendbuf, const MPI_Count sendcounts[],
+                                   const MPI_Aint sdispls[], const MPI_Datatype sendtypes[],
+                                   void *recvbuf, const MPI_Count recvcounts[],
+                                   const MPI_Aint rdispls[], const MPI_Datatype recvtypes[],
+                                   MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                                   MPICH_API_PUBLIC;
+int PMPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                int root, MPI_Comm comm)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Reduce_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                   MPI_Op op, int root, MPI_Comm comm)
                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
+                     MPI_Op op, int root, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_init_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
+                       MPI_Op op, int root, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_local(const void *inbuf, void *inoutbuf, int count, MPI_Datatype datatype,
+                      MPI_Op op)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_scatter(const void *sendbuf, void *recvbuf, const int recvcounts[],
+                        MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_scatter_c(const void *sendbuf, void *recvbuf, const MPI_Count recvcounts[],
+                          MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_scatter_block(const void *sendbuf, void *recvbuf, int recvcount,
+                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+                              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Reduce_scatter_block_c(const void *sendbuf, void *recvbuf, MPI_Count recvcount,
                                 MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
                                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_scatter_block_init(const void *sendbuf, void *recvbuf, int recvcount,
+                                   MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                                   MPI_Request *request)
+                                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_scatter_block_init_c(const void *sendbuf, void *recvbuf, MPI_Count recvcount,
+                                     MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                                     MPI_Request *request)
+                                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_scatter_init(const void *sendbuf, void *recvbuf, const int recvcounts[],
+                             MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                             MPI_Request *request)
+                             MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Reduce_scatter_init_c(const void *sendbuf, void *recvbuf, const MPI_Count recvcounts[],
+                               MPI_Datatype datatype, MPI_Op op, MPI_Comm comm, MPI_Info info,
+                               MPI_Request *request)
+                               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Scan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+              MPI_Comm comm)
+              MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
 int PMPI_Scan_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
                 MPI_Op op, MPI_Comm comm)
                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Scan_init(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
+                   MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Scan_init_c(const void *sendbuf, void *recvbuf, MPI_Count count, MPI_Datatype datatype,
+                     MPI_Op op, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) MPICH_API_PUBLIC;
+int PMPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                 int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
+                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
 int PMPI_Scatter_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, void *recvbuf,
                    MPI_Count recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int PMPI_Scatterv_c(const void *sendbuf, const MPI_Count *sendcounts, const MPI_Aint *displs,
-                    MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
-                    int root, MPI_Comm comm)
+int PMPI_Scatter_init(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf,
+                      int recvcount, MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Info info,
+                      MPI_Request *request)
+                      MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Scatter_init_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype,
+                        void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype, int root,
+                        MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                        MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Scatterv(const void *sendbuf, const int sendcounts[], const int displs[],
+                  MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                  int root, MPI_Comm comm)
+                  MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
+int PMPI_Scatterv_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint displs[],
+                    MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount,
+                    MPI_Datatype recvtype, int root, MPI_Comm comm)
                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
-
-int MPI_Bsend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Bsend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                     MPI_Comm comm, MPI_Request *request)
-                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Buffer_attach_c(void *buffer, MPI_Count size) MPICH_API_PUBLIC;
-int MPI_Buffer_detach_c(void *buffer_addr, MPI_Count *size) MPICH_API_PUBLIC;
-int MPI_Ibsend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                 MPI_Comm comm, MPI_Request *request)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Imrecv_c(void *buf, MPI_Count count, MPI_Datatype datatype, MPI_Message *message,
-                 MPI_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Irecv_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, int tag,
-                MPI_Comm comm, MPI_Request *request)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Irsend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                 MPI_Comm comm, MPI_Request *request)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Isend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                MPI_Comm comm, MPI_Request *request)
-                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Issend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                 MPI_Comm comm, MPI_Request *request)
-                 MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Mrecv_c(void *buf, MPI_Count count, MPI_Datatype datatype, MPI_Message *message,
-                MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Recv_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, int tag,
-               MPI_Comm comm, MPI_Status *status)
-               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Recv_init_c(void *buf, MPI_Count count, MPI_Datatype datatype, int source, int tag,
-                    MPI_Comm comm, MPI_Request *request)
-                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Rsend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Rsend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                     MPI_Comm comm, MPI_Request *request)
-                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Send_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-               MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Send_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                    MPI_Comm comm, MPI_Request *request)
-                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Sendrecv_c(const void *sendbuf, MPI_Count sendcount, MPI_Datatype sendtype, int dest,
-                   int sendtag, void *recvbuf, MPI_Count recvcount, MPI_Datatype recvtype,
-                   int source, int recvtag, MPI_Comm comm, MPI_Status *status)
-                   MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(6,8) MPICH_API_PUBLIC;
-int MPI_Sendrecv_replace_c(void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int sendtag,
-                           int source, int recvtag, MPI_Comm comm, MPI_Status *status)
-                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Ssend_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Ssend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, int dest, int tag,
-                     MPI_Comm comm, MPI_Request *request)
-                     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Scatterv_init(const void *sendbuf, const int sendcounts[], const int displs[],
+                       MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                       int root, MPI_Comm comm, MPI_Info info, MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
+int PMPI_Scatterv_init_c(const void *sendbuf, const MPI_Count sendcounts[], const MPI_Aint displs[],
+                         MPI_Datatype sendtype, void *recvbuf, MPI_Count recvcount,
+                         MPI_Datatype recvtype, int root, MPI_Comm comm, MPI_Info info,
+                         MPI_Request *request)
+                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,4) MPICH_ATTR_POINTER_WITH_TYPE_TAG(5,7) MPICH_API_PUBLIC;
 
 int PMPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
     MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
@@ -2823,6 +3277,116 @@ int PMPI_Ssend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, i
                       MPI_Comm comm, MPI_Request *request)
                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
 
+int PMPI_Get_count_c(const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count)
+    MPICH_API_PUBLIC;
+int PMPI_Get_elements_c(const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count)
+    MPICH_API_PUBLIC;
+int PMPI_Get_elements_x(const MPI_Status *status, MPI_Datatype datatype, MPI_Count *count)
+    MPICH_API_PUBLIC;
+int PMPI_Pack_c(const void *inbuf, MPI_Count incount, MPI_Datatype datatype, void *outbuf,
+                MPI_Count outsize, MPI_Count *position, MPI_Comm comm) MPICH_API_PUBLIC;
+int PMPI_Pack_external_c(const char *datarep, const void *inbuf, MPI_Count incount,
+                         MPI_Datatype datatype, void *outbuf, MPI_Count outsize,
+                         MPI_Count *position) MPICH_API_PUBLIC;
+int PMPI_Pack_external_size_c(const char *datarep, MPI_Count incount, MPI_Datatype datatype,
+                              MPI_Count *size) MPICH_API_PUBLIC;
+int PMPI_Pack_size_c(MPI_Count incount, MPI_Datatype datatype, MPI_Comm comm, MPI_Count *size)
+    MPICH_API_PUBLIC;
+int PMPI_Status_set_elements_c(MPI_Status *status, MPI_Datatype datatype, MPI_Count count)
+    MPICH_API_PUBLIC;
+int PMPI_Type_contiguous_c(MPI_Count count, MPI_Datatype oldtype, MPI_Datatype *newtype)
+    MPICH_API_PUBLIC;
+int PMPI_Type_create_darray_c(int size, int rank, int ndims, const MPI_Count array_of_gsizes[],
+                              const int array_of_distribs[], const int array_of_dargs[],
+                              const int array_of_psizes[], int order, MPI_Datatype oldtype,
+                              MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int PMPI_Type_create_hindexed_c(MPI_Count count, const MPI_Count array_of_blocklengths[],
+                                const MPI_Count array_of_displacements[], MPI_Datatype oldtype,
+                                MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int PMPI_Type_create_hindexed_block_c(MPI_Count count, MPI_Count blocklength,
+                                      const MPI_Count array_of_displacements[],
+                                      MPI_Datatype oldtype, MPI_Datatype *newtype)
+                                      MPICH_API_PUBLIC;
+int PMPI_Type_create_hvector_c(MPI_Count count, MPI_Count blocklength, MPI_Count stride,
+                               MPI_Datatype oldtype, MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int PMPI_Type_create_indexed_block_c(MPI_Count count, MPI_Count blocklength,
+                                     const MPI_Count array_of_displacements[], MPI_Datatype oldtype,
+                                     MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int PMPI_Type_create_resized_c(MPI_Datatype oldtype, MPI_Count lb, MPI_Count extent,
+                               MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int PMPI_Type_create_struct_c(MPI_Count count, const MPI_Count array_of_blocklengths[],
+                              const MPI_Count array_of_displacements[],
+                              const MPI_Datatype array_of_types[], MPI_Datatype *newtype)
+                              MPICH_API_PUBLIC;
+int PMPI_Type_create_subarray_c(int ndims, const MPI_Count array_of_sizes[],
+                                const MPI_Count array_of_subsizes[],
+                                const MPI_Count array_of_starts[], int order, MPI_Datatype oldtype,
+                                MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int PMPI_Type_get_contents_c(MPI_Datatype datatype, MPI_Count max_integers, MPI_Count max_addresses,
+                             MPI_Count max_large_counts, MPI_Count max_datatypes,
+                             int array_of_integers[], MPI_Aint array_of_addresses[],
+                             MPI_Count array_of_large_counts[], MPI_Datatype array_of_datatypes[])
+                             MPICH_API_PUBLIC;
+int PMPI_Type_get_envelope_c(MPI_Datatype datatype, MPI_Count *num_integers,
+                             MPI_Count *num_addresses, MPI_Count *num_large_counts,
+                             MPI_Count *num_datatypes, int *combiner) MPICH_API_PUBLIC;
+int PMPI_Type_get_extent_c(MPI_Datatype datatype, MPI_Count *lb, MPI_Count *extent)
+    MPICH_API_PUBLIC;
+int PMPI_Type_get_true_extent_c(MPI_Datatype datatype, MPI_Count *true_lb, MPI_Count *true_extent)
+    MPICH_API_PUBLIC;
+int PMPI_Type_indexed_c(MPI_Count count, const MPI_Count array_of_blocklengths[],
+                        const MPI_Count array_of_displacements[], MPI_Datatype oldtype,
+                        MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int PMPI_Type_size_c(MPI_Datatype datatype, MPI_Count *size) MPICH_API_PUBLIC;
+int PMPI_Type_vector_c(MPI_Count count, MPI_Count blocklength, MPI_Count stride,
+                       MPI_Datatype oldtype, MPI_Datatype *newtype) MPICH_API_PUBLIC;
+int PMPI_Unpack_c(const void *inbuf, MPI_Count insize, MPI_Count *position, void *outbuf,
+                  MPI_Count outcount, MPI_Datatype datatype, MPI_Comm comm) MPICH_API_PUBLIC;
+int PMPI_Unpack_external_c(const char datarep[], const void *inbuf, MPI_Count insize,
+                           MPI_Count *position, void *outbuf, MPI_Count outcount,
+                           MPI_Datatype datatype) MPICH_API_PUBLIC;
+
+int PMPI_Win_allocate_c(MPI_Aint size, MPI_Aint disp_unit, MPI_Info info, MPI_Comm comm,
+                        void *baseptr, MPI_Win *win) MPICH_API_PUBLIC;
+int PMPI_Win_create_c(void *base, MPI_Aint size, MPI_Aint disp_unit, MPI_Info info, MPI_Comm comm,
+                      MPI_Win *win) MPICH_API_PUBLIC;
+int PMPI_Win_shared_query_c(MPI_Win win, int rank, MPI_Aint *size, MPI_Aint *disp_unit,
+                            void *baseptr) MPICH_API_PUBLIC;
+int PMPI_Win_allocate_shared_c(MPI_Aint size, MPI_Aint disp_unit, MPI_Info info, MPI_Comm comm,
+                               void *baseptr, MPI_Win *win) MPICH_API_PUBLIC;
+int PMPI_Get_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+               int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+               MPI_Datatype target_datatype, MPI_Win win)
+               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Get_accumulate_c(const void *origin_addr, MPI_Count origin_count,
+                          MPI_Datatype origin_datatype, void *result_addr, MPI_Count result_count,
+                          MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
+                          MPI_Count target_count, MPI_Datatype target_datatype, MPI_Op op,
+                          MPI_Win win)
+                          MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Put_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+               int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+               MPI_Datatype target_datatype, MPI_Win win)
+               MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Raccumulate_c(const void *origin_addr, MPI_Count origin_count,
+                       MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp,
+                       MPI_Count target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win,
+                       MPI_Request *request)
+                       MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Rget_c(void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+                int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+                MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
+int PMPI_Rget_accumulate_c(const void *origin_addr, MPI_Count origin_count,
+                           MPI_Datatype origin_datatype, void *result_addr, MPI_Count result_count,
+                           MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
+                           MPI_Count target_count, MPI_Datatype target_datatype, MPI_Op op,
+                           MPI_Win win, MPI_Request *request)
+                           MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
+int PMPI_Rput_c(const void *origin_addr, MPI_Count origin_count, MPI_Datatype origin_datatype,
+                int target_rank, MPI_Aint target_disp, MPI_Count target_count,
+                MPI_Datatype target_datatype, MPI_Win win, MPI_Request *request)
+                MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
 #endif  /* MPI_BUILD_PROFILING */
 /* End of MPI bindings */
 
@@ -2840,6 +3404,18 @@ int PMPI_Ssend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, i
 
 #include "mpio.h"
 
+/* GPU extensions */
+#define MPIX_GPU_SUPPORT_CUDA  (0)
+#define MPIX_GPU_SUPPORT_ZE    (1)
+#define MPIX_GPU_SUPPORT_DEVICE_INITIATED   (3)
+int MPIX_GPU_query_support(int gpu_type, int *is_supported) MPICH_API_PUBLIC;
+int MPIX_Query_cuda_support(void) MPICH_API_PUBLIC;
+int MPIX_Query_ze_support(void) MPICH_API_PUBLIC;
+#ifdef MPI_BUILD_PROFILING
+int PMPIX_GPU_query_support(int gpu_type, int *is_supported) MPICH_API_PUBLIC;
+int PMPIX_Query_cuda_support(void) MPICH_API_PUBLIC;
+int PMPIX_Query_ze_support(void) MPICH_API_PUBLIC;
+#endif
 #if defined(__cplusplus)
 }
 /* Add the C++ bindings */
diff --git a/deps/mpi/include/mpicxx.h b/deps/mpi/include/mpicxx.h
index 696ee8558..980df839c 100644
--- a/deps/mpi/include/mpicxx.h
+++ b/deps/mpi/include/mpicxx.h
@@ -287,7 +287,7 @@ class Datatype  {
     static Datatype Create_struct( int v1, int v2[], Aint v3[],  const Datatype v4[] ) 
     {
         Datatype v5;
-        MPI_Datatype *l4 = new MPI_Datatype[v1];
+        MPI_Datatype *l4 = new MPI_Datatype[(unsigned long)v1];
         { 
             int i4; 
             for (i4=0;i4<v1;i4++) {
@@ -376,7 +376,7 @@ class Datatype  {
     }
     virtual void Get_contents( int v2, int v3, int v4, int v5[], Aint v6[], Datatype v7[] ) const
     {
-        MPI_Datatype *l7 = new MPI_Datatype[v4];
+        MPI_Datatype *l7 = new MPI_Datatype[(unsigned long)v4];
         MPIX_CALLWORLD( MPI_Type_get_contents( (MPI_Datatype) the_real_datatype, v2, v3, v4, v5, v6, l7 ));
         { 
             int i7; 
@@ -417,7 +417,7 @@ class Datatype  {
     static Datatype Create_struct( int v1, const int v2[], const Aint v3[],  const Datatype v4[] ) 
     {
         Datatype v5;
-        MPI_Datatype *l4 = new MPI_Datatype[v1];
+        MPI_Datatype *l4 = new MPI_Datatype[(unsigned long)v1];
         { 
             int i4; 
             for (i4=0;i4<v1;i4++) {
@@ -849,7 +849,7 @@ class Request  {
     static bool Testany( int v1, Request v2[], int &v3, Status & v5 ) 
     {
         int v4;
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -869,7 +869,7 @@ class Request  {
     static bool Testany( int v1, Request v2[], int &v3 ) 
     {
         int v4;
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -889,8 +889,8 @@ class Request  {
     static int Waitsome( int v1, Request v2[], int v4[], Status v5[] ) 
     {
         int v3;
-        MPI_Request *l2 = new MPI_Request[v1];
-        MPI_Status *l5 = new MPI_Status[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
+        MPI_Status *l5 = new MPI_Status[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -917,7 +917,7 @@ class Request  {
     static int Waitsome( int v1, Request v2[], int v4[] ) 
     {
         int v3;
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -941,8 +941,8 @@ class Request  {
     static bool Testall( int v1, Request v2[], Status v4[] ) 
     {
         int v3;
-        MPI_Request *l2 = new MPI_Request[v1];
-        MPI_Status *l4 = new MPI_Status[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
+        MPI_Status *l4 = new MPI_Status[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -969,7 +969,7 @@ class Request  {
     static bool Testall( int v1, Request v2[] ) 
     {
         int v3;
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -997,8 +997,8 @@ class Request  {
     static int Testsome( int v1, Request v2[], int v4[], Status v5[] ) 
     {
         int v3;
-        MPI_Request *l2 = new MPI_Request[v1];
-        MPI_Status *l5 = new MPI_Status[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
+        MPI_Status *l5 = new MPI_Status[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -1025,7 +1025,7 @@ class Request  {
     static int Testsome( int v1, Request v2[], int v4[] ) 
     {
         int v3;
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -1044,8 +1044,8 @@ class Request  {
     }
     static void Waitall( int v1, Request v2[], Status v3[] ) 
     {
-        MPI_Request *l2 = new MPI_Request[v1];
-        MPI_Status *l3 = new MPI_Status[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
+        MPI_Status *l3 = new MPI_Status[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -1070,7 +1070,7 @@ class Request  {
     }
     static void Waitall( int v1, Request v2[] ) 
     {
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -1089,7 +1089,7 @@ class Request  {
     static int Waitany( int v1, Request v2[], Status & v4 ) 
     {
         int v3;
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -1109,7 +1109,7 @@ class Request  {
     static int Waitany( int v1, Request v2[] ) 
     {
         int v3;
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -1183,7 +1183,7 @@ class Prequest : public Request {
     }
     static void Startall( int v1, Prequest v2[] ) 
     {
-        MPI_Request *l2 = new MPI_Request[v1];
+        MPI_Request *l2 = new MPI_Request[(unsigned long)v1];
         { 
             int i2; 
             for (i2=0;i2<v1;i2++) {
@@ -1412,9 +1412,9 @@ class Comm  {
     {
         MPIX_CALLREF( this, MPI_Rsend( (const void *)v1, v2, (MPI_Datatype)(v3.the_real_datatype), v4, v5, (MPI_Comm) the_real_comm ));
     }
-    virtual void Gatherv( const void * v1, int v2, const Datatype &v3, void * v4, const int * v5, const int * v6, const Datatype &v7, int v8 ) const
+    virtual void Gatherv( const void * v1, int v2, const Datatype &v3, void * v4, const int v5[], const int v6[], const Datatype &v7, int v8 ) const
     {
-        MPIX_CALLREF( this, MPI_Gatherv( (const void *)v1, v2, (MPI_Datatype)(v3.the_real_datatype), v4, (const int *)v5, (const int *)v6, (MPI_Datatype)(v7.the_real_datatype), v8, (MPI_Comm) the_real_comm ));
+        MPIX_CALLREF( this, MPI_Gatherv( (const void *)v1, v2, (MPI_Datatype)(v3.the_real_datatype), v4, (const int  *)v5, (const int  *)v6, (MPI_Datatype)(v7.the_real_datatype), v8, (MPI_Comm) the_real_comm ));
     }
     virtual void Disconnect( void ) 
     {
@@ -1426,8 +1426,8 @@ class Comm  {
     }
     virtual void Alltoallw( const void * v1, const int v2[], const int v3[],  const Datatype v4[], void * v5, const int v6[], const int v7[],  const Datatype v8[] ) const
     {
-        MPI_Datatype *l4 = new MPI_Datatype[Get_size()];
-        MPI_Datatype *l8 = new MPI_Datatype[Get_size()];
+        MPI_Datatype *l4 = new MPI_Datatype[(unsigned long)Get_size()];
+        MPI_Datatype *l8 = new MPI_Datatype[(unsigned long)Get_size()];
         { 
             int i4; 
             for (i4=0;i4<Get_size();i4++) {
@@ -1472,9 +1472,9 @@ class Comm  {
         MPIX_CALLREF( this, MPI_Comm_set_name( (MPI_Comm) the_real_comm, (const char *)v2 ));
     }
     static Intercomm Get_parent( void ) ;
-    virtual void Alltoallv( const void * v1, const int * v2, const int * v3, const Datatype &v4, void * v5, const int * v6, const int * v7, const Datatype &v8 ) const
+    virtual void Alltoallv( const void * v1, const int v2[], const int v3[], const Datatype &v4, void * v5, const int v6[], const int v7[], const Datatype &v8 ) const
     {
-        MPIX_CALLREF( this, MPI_Alltoallv( (const void *)v1, (const int *)v2, (const int *)v3, (MPI_Datatype)(v4.the_real_datatype), v5, (const int *)v6, (const int *)v7, (MPI_Datatype)(v8.the_real_datatype), (MPI_Comm) the_real_comm ));
+        MPIX_CALLREF( this, MPI_Alltoallv( (const void *)v1, (const int  *)v2, (const int  *)v3, (MPI_Datatype)(v4.the_real_datatype), v5, (const int  *)v6, (const int  *)v7, (MPI_Datatype)(v8.the_real_datatype), (MPI_Comm) the_real_comm ));
     }
     virtual void Reduce_scatter( const void * v1, void * v2, const int v3[], const Datatype &v4, const Op &v5 ) const
     {
@@ -1504,17 +1504,17 @@ class Comm  {
     {
         MPIX_CALLREF( this, MPI_Comm_delete_attr( (MPI_Comm) the_real_comm, v2 ));
     }
-    virtual void Scatterv( const void * v1, const int * v2, const int * v3, const Datatype &v4, void * v5, int v6, const Datatype &v7, int v8 ) const
+    virtual void Scatterv( const void * v1, const int v2[], const int v3[], const Datatype &v4, void * v5, int v6, const Datatype &v7, int v8 ) const
     {
-        MPIX_CALLREF( this, MPI_Scatterv( (const void *)v1, (const int *)v2, (const int *)v3, (MPI_Datatype)(v4.the_real_datatype), v5, v6, (MPI_Datatype)(v7.the_real_datatype), v8, (MPI_Comm) the_real_comm ));
+        MPIX_CALLREF( this, MPI_Scatterv( (const void *)v1, (const int  *)v2, (const int  *)v3, (MPI_Datatype)(v4.the_real_datatype), v5, v6, (MPI_Datatype)(v7.the_real_datatype), v8, (MPI_Comm) the_real_comm ));
     }
     virtual void Get_name( char * v2, int &v3 ) const
     {
         MPIX_CALLREF( this, MPI_Comm_get_name( (MPI_Comm) the_real_comm, v2, &v3 ));
     }
-    virtual void Allgatherv( const void * v1, int v2, const Datatype &v3, void * v4, const int * v5, const int * v6, const Datatype &v7 ) const
+    virtual void Allgatherv( const void * v1, int v2, const Datatype &v3, void * v4, const int v5[], const int v6[], const Datatype &v7 ) const
     {
-        MPIX_CALLREF( this, MPI_Allgatherv( (const void *)v1, v2, (MPI_Datatype)(v3.the_real_datatype), v4, (const int *)v5, (const int *)v6, (MPI_Datatype)(v7.the_real_datatype), (MPI_Comm) the_real_comm ));
+        MPIX_CALLREF( this, MPI_Allgatherv( (const void *)v1, v2, (MPI_Datatype)(v3.the_real_datatype), v4, (const int  *)v5, (const int  *)v6, (MPI_Datatype)(v7.the_real_datatype), (MPI_Comm) the_real_comm ));
     }
     virtual Comm &Clone(void) const = 0;
     typedef int Copy_attr_function(const Comm& oldcomm, int comm_keyval, void* extra_state, void* attribute_val_in, void* attribute_val_out, bool& flag); 
@@ -1759,7 +1759,7 @@ Intercomm Spawn(const char* command, const char* argv[], int maxprocs, const MPI
 }
 Intercomm Spawn_multiple(int count, const char* array_of_commands[], const char** array_of_argv[], const int array_of_maxprocs[], const MPI::Info array_of_info[], int root) {
     Intercomm ic;
-    MPI_Info  *li = new MPI_Info [count];
+    MPI_Info  *li = new MPI_Info [(unsigned long)count];
     int i;
     for (i=0; i<count; i++) {
         li[i] = array_of_info[i].the_real_info;
@@ -1774,7 +1774,7 @@ Intercomm Spawn_multiple(int count, const char* array_of_commands[], const char*
 }
 Intercomm Spawn_multiple(int count, const char* array_of_commands[], const char** array_of_argv[], const int array_of_maxprocs[], const MPI::Info array_of_info[], int root, int array_of_errcodes[]) {
     Intercomm ic;
-    MPI_Info  *li = new MPI_Info [count];
+    MPI_Info  *li = new MPI_Info [(unsigned long)count];
     int i;
     for (i=0; i<count; i++) {
         li[i] = array_of_info[i].the_real_info;
@@ -2465,7 +2465,7 @@ class Cartcomm : public Intracomm {
     }
     virtual void Get_topo( int v2, int v3[], bool v4[], int v5[] ) const
     {
-        int *l4 = new int[v2];
+        int *l4 = new int[(unsigned long)v2];
         MPIX_CALLREF( this, MPI_Cart_get( (MPI_Comm) the_real_comm, v2, v3, l4, v5 ));
         { 
             int i4; 
@@ -2483,7 +2483,7 @@ class Cartcomm : public Intracomm {
     virtual int Map( int v2, const int v3[], const bool v4[] ) const
     {
         int v5;
-        int *l4 = new int[v2];
+        int *l4 = new int[(unsigned long)v2];
         { 
             int i4; 
             for (i4=0;i4<v2;i4++) {
@@ -2498,7 +2498,7 @@ class Cartcomm : public Intracomm {
     virtual Cartcomm Sub( const bool v2[] ) const
     {
         Cartcomm v3;
-        int *l2 = new int[10];
+        int *l2 = new int[(unsigned long)10];
         { 
             int i2; 
             for (i2=0;i2<10;i2++) {
diff --git a/deps/mpi/include/mpio.h b/deps/mpi/include/mpio.h
index b86ffc1d9..ea4b7bf64 100644
--- a/deps/mpi/include/mpio.h
+++ b/deps/mpi/include/mpio.h
@@ -121,6 +121,8 @@ typedef long long MPI_Offset;
 #define HAVE_MPI_DATAREP_FUNCTIONS
 typedef int (MPI_Datarep_conversion_function)(void *, MPI_Datatype, int, 
              void *, MPI_Offset, void *);
+typedef int (MPI_Datarep_conversion_function_c)(void *, MPI_Datatype, MPI_Count,
+                                                void *, MPI_Offset, void *);
 typedef int (MPI_Datarep_extent_function)(MPI_Datatype datatype, MPI_Aint *,
 					  void *);
 #endif
@@ -332,6 +334,80 @@ int MPI_File_iread_all(MPI_File fh, void *buf, int count, MPI_Datatype datatype,
 int MPI_File_iwrite_all(MPI_File fh, const void *buf, int count, MPI_Datatype datatype,
                          MPI_Request *request)
     MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+
+/* MPI 4.0 large count functions */
+int MPI_File_read_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_read_all_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_read_all_begin_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_read_at_c(MPI_File fh, MPI_Offset offset, void *buf, MPI_Count count, MPI_Datatype datatype,
+                     MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_read_at_all_c(MPI_File fh, MPI_Offset offset, void * buf, MPI_Count count,
+                         MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_read_at_all_begin_c(MPI_File fh, MPI_Offset offset, void *buf, MPI_Count count,
+                               MPI_Datatype datatype) MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_read_ordered_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype,
+                          MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_read_ordered_begin_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_read_shared_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype,
+                         MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_write_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                   MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_write_all_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                       MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_write_all_begin_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+
+int MPI_File_write_at_c(MPI_File fh, MPI_Offset offset, const void * buf, MPI_Count count,
+                      MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_write_at_all_c(MPI_File fh, MPI_Offset offset, const void *buf, MPI_Count count,
+                          MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_write_at_all_begin_c(MPI_File fh, MPI_Offset offset, const void *buf, MPI_Count count,
+                                MPI_Datatype datatype) MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_write_ordered_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                           MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_write_ordered_begin_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_write_shared_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                          MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_iread_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype, MPIO_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_iread_all_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype,
+                        MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_iread_at_c(MPI_File fh, MPI_Offset offset, void *buf, MPI_Count count, MPI_Datatype datatype,
+                      MPIO_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_iread_at_all_c(MPI_File fh, MPI_Offset offset, void *buf, MPI_Count count,
+                           MPI_Datatype datatype, MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_iread_shared_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype,
+                          MPIO_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_iwrite_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                    MPIO_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_iwrite_all_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                         MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int MPI_File_iwrite_at_c(MPI_File fh, MPI_Offset offset, const void *buf, MPI_Count count,
+                       MPI_Datatype datatype, MPIO_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_iwrite_at_all_c(MPI_File fh, MPI_Offset offset, const void *buf, MPI_Count count,
+                            MPI_Datatype datatype, MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int MPI_File_iwrite_shared_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                           MPIO_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+
+int MPI_File_get_type_extent_c(MPI_File fh, MPI_Datatype datatype, MPI_Count *extent) ROMIO_API_PUBLIC;
+
+int MPI_Register_datarep_c(const char *datarep, MPI_Datarep_conversion_function_c *read_conversion_fn,
+			 MPI_Datarep_conversion_function_c *write_conversion_fn,
+			 MPI_Datarep_extent_function *dtype_file_extent_fn, void *extra_state) ROMIO_API_PUBLIC;
+
 /* End Prototypes */
 
 #ifndef HAVE_MPI_DARRAY_SUBARRAY
@@ -562,6 +638,79 @@ int PMPI_File_iwrite_all(MPI_File fh, const void *buf, int count, MPI_Datatype d
                           MPI_Request *request)
     MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
 
+/* MPI 4.0 large count functions */
+int PMPI_File_read_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_read_all_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_read_all_begin_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_read_at_c(MPI_File fh, MPI_Offset offset, void *buf, MPI_Count count, MPI_Datatype datatype,
+                     MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_read_at_all_c(MPI_File fh, MPI_Offset offset, void * buf, MPI_Count count,
+                         MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_read_at_all_begin_c(MPI_File fh, MPI_Offset offset, void *buf, MPI_Count count,
+                               MPI_Datatype datatype) MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_read_ordered_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype,
+                          MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_read_ordered_begin_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_read_shared_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype,
+                         MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_write_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                   MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_write_all_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                       MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_write_all_begin_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+
+int PMPI_File_write_at_c(MPI_File fh, MPI_Offset offset, const void * buf, MPI_Count count,
+                      MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_write_at_all_c(MPI_File fh, MPI_Offset offset, const void *buf, MPI_Count count,
+                          MPI_Datatype datatype, MPI_Status *status)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_write_at_all_begin_c(MPI_File fh, MPI_Offset offset, const void *buf, MPI_Count count,
+                                MPI_Datatype datatype) MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_write_ordered_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                           MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_write_ordered_begin_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_write_shared_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                          MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_iread_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype, MPIO_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_iread_all_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype,
+                        MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_iread_at_c(MPI_File fh, MPI_Offset offset, void *buf, MPI_Count count, MPI_Datatype datatype,
+                      MPIO_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_iread_at_all_c(MPI_File fh, MPI_Offset offset, void *buf, MPI_Count count,
+                           MPI_Datatype datatype, MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_iread_shared_c(MPI_File fh, void *buf, MPI_Count count, MPI_Datatype datatype,
+                          MPIO_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_iwrite_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                    MPIO_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_iwrite_all_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                         MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+int PMPI_File_iwrite_at_c(MPI_File fh, MPI_Offset offset, const void *buf, MPI_Count count,
+                       MPI_Datatype datatype, MPIO_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_iwrite_at_all_c(MPI_File fh, MPI_Offset offset, const void *buf, MPI_Count count,
+                            MPI_Datatype datatype, MPI_Request *request)
+    MPICH_ATTR_POINTER_WITH_TYPE_TAG(3,5) ROMIO_API_PUBLIC;
+int PMPI_File_iwrite_shared_c(MPI_File fh, const void *buf, MPI_Count count, MPI_Datatype datatype,
+                           MPIO_Request *request) MPICH_ATTR_POINTER_WITH_TYPE_TAG(2,4) ROMIO_API_PUBLIC;
+
+int PMPI_File_get_type_extent_c(MPI_File fh, MPI_Datatype datatype, MPI_Count *extent) ROMIO_API_PUBLIC;
+
+int PMPI_Register_datarep_c(const char *datarep, MPI_Datarep_conversion_function_c *read_conversion_fn,
+			 MPI_Datarep_conversion_function_c *write_conversion_fn,
+			 MPI_Datarep_extent_function *dtype_file_extent_fn, void *extra_state) ROMIO_API_PUBLIC;
+
 #ifndef HAVE_MPI_DARRAY_SUBARRAY
 /* Section 4.14.4 */
 int PMPI_Type_create_subarray(int, int *, int *, int *, int, 
diff --git a/deps/mpi/lib/libmpi.so b/deps/mpi/lib/libmpi.so
index a728c1a42..789edaa4b 100755
Binary files a/deps/mpi/lib/libmpi.so and b/deps/mpi/lib/libmpi.so differ
diff --git a/deps/mpi/lib/libmpi.so.12 b/deps/mpi/lib/libmpi.so.12
index a728c1a42..789edaa4b 100755
Binary files a/deps/mpi/lib/libmpi.so.12 and b/deps/mpi/lib/libmpi.so.12 differ
diff --git a/deps/mpi/lib/libmpi.so.12.0 b/deps/mpi/lib/libmpi.so.12.0
index a728c1a42..789edaa4b 100755
Binary files a/deps/mpi/lib/libmpi.so.12.0 and b/deps/mpi/lib/libmpi.so.12.0 differ
diff --git a/deps/mpi/lib/libmpi.so.12.0.0 b/deps/mpi/lib/libmpi.so.12.0.0
index a728c1a42..789edaa4b 100755
Binary files a/deps/mpi/lib/libmpi.so.12.0.0 and b/deps/mpi/lib/libmpi.so.12.0.0 differ
diff --git a/deps/mpi/lib/libmpicxx.so b/deps/mpi/lib/libmpicxx.so
index 76103b784..17d45c739 100755
Binary files a/deps/mpi/lib/libmpicxx.so and b/deps/mpi/lib/libmpicxx.so differ
diff --git a/deps/mpi/lib/libmpicxx.so.12 b/deps/mpi/lib/libmpicxx.so.12
index 76103b784..17d45c739 100755
Binary files a/deps/mpi/lib/libmpicxx.so.12 and b/deps/mpi/lib/libmpicxx.so.12 differ
diff --git a/deps/mpi/lib/libmpicxx.so.12.0 b/deps/mpi/lib/libmpicxx.so.12.0
index 76103b784..17d45c739 100755
Binary files a/deps/mpi/lib/libmpicxx.so.12.0 and b/deps/mpi/lib/libmpicxx.so.12.0 differ
diff --git a/deps/mpi/lib/libmpicxx.so.12.0.0 b/deps/mpi/lib/libmpicxx.so.12.0.0
index 76103b784..17d45c739 100755
Binary files a/deps/mpi/lib/libmpicxx.so.12.0.0 and b/deps/mpi/lib/libmpicxx.so.12.0.0 differ
diff --git a/deps/mpi/lib/libmpifort.so b/deps/mpi/lib/libmpifort.so
index 26a93dc35..2c6bd829a 100755
Binary files a/deps/mpi/lib/libmpifort.so and b/deps/mpi/lib/libmpifort.so differ
diff --git a/deps/mpi/lib/libmpifort.so.12 b/deps/mpi/lib/libmpifort.so.12
index 26a93dc35..2c6bd829a 100755
Binary files a/deps/mpi/lib/libmpifort.so.12 and b/deps/mpi/lib/libmpifort.so.12 differ
diff --git a/deps/mpi/lib/libmpifort.so.12.0 b/deps/mpi/lib/libmpifort.so.12.0
index 26a93dc35..2c6bd829a 100755
Binary files a/deps/mpi/lib/libmpifort.so.12.0 and b/deps/mpi/lib/libmpifort.so.12.0 differ
diff --git a/deps/mpi/lib/libmpifort.so.12.0.0 b/deps/mpi/lib/libmpifort.so.12.0.0
index 26a93dc35..2c6bd829a 100755
Binary files a/deps/mpi/lib/libmpifort.so.12.0.0 and b/deps/mpi/lib/libmpifort.so.12.0.0 differ
diff --git a/deps/mpi/licensing/third-party-programs.txt b/deps/mpi/licensing/third-party-programs.txt
index 78202fa91..3aee344ee 100644
--- a/deps/mpi/licensing/third-party-programs.txt
+++ b/deps/mpi/licensing/third-party-programs.txt
@@ -1,16 +1,17 @@
-Intel(R) MPI Library 2021.11 Third Party Programs File
+Intel(R) MPI Library 2021.12 Third Party Programs File
 
 This file is the "third-party-programs.txt" file specified in the associated 
 Intel end user license agreement for the Intel software you are licensing.
 Third party programs and their corresponding required notices and/or license 
 terms are listed below.
 
--------------------------------------------------------------------------------
+=====================================================================================
 
 1. MPICH 
-
    Copyright Notice
    1998--2020, Argonne National Laboratory
+
+   ROMIO
    
   Mpich license
 
@@ -44,7 +45,7 @@ terms are listed below.
   product, or process disclosed, or represents that its use would not infringe
   privately owned rights.
   
--------------------------------------------------------------------------------
+=====================================================================================
 
 2. Open MPI 
 
@@ -184,7 +185,7 @@ terms are listed below.
   product, or process disclosed, or represents that its use would not infringe
   privately owned rights.
 
--------------------------------------------------------------------------------   
+=====================================================================================
 
 3. hwloc 
    
@@ -233,7 +234,7 @@ terms are listed below.
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   
--------------------------------------------------------------------------------  
+=====================================================================================
 
 4. Libfabric and OpenFabrics Interfaces (OFI)
 
@@ -268,85 +269,874 @@ terms are listed below.
   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
   POSSIBILITY OF SUCH DAMAGE.
   
--------------------------------------------------------------------------------  
-
-5. Intel® Distribution for Python*
-
-   Intel Simplified Software License (Version August 2021)
-
-  Use and Redistribution. You may use and redistribute the software (the
-  "Software"), without modification, provided the following conditions are met:
-
- * Redistributions must reproduce the above copyright notice and the following
-   terms of use in the Software and in the documentation and/or other materials
-   provided with the distribution.
- * Neither the name of Intel nor the names of its suppliers may be used to 
-   endorse or promote products derived from this Software without specific  
-   prior written permission.
- * No reverse engineering, decompilation, or disassembly of this Software is
-   permitted.
-
-  No other licenses. Except as provided in the preceding section, Intel grants no
-  licenses or other rights by implication, estoppel or otherwise to, patent,
-  copyright, trademark, trade name, service mark or other intellectual property
-  licenses or rights of Intel.
-
-  Third party software. The Software may contain Third Party Software. "Third
-  Party Software" is open source software, third party software, or other Intel
-  software that may be identified in the Software itself or in the files (if any)
-  listed in the "third-party-software.txt" or similarly named text file included
-  with the Software. Third Party Software, even if included with the distribution
-  of the Software, may be governed by separate license terms, including without
-  limitation, open source software license terms, third party software license
-  terms, and other Intel software license terms. Those separate license terms
-  solely govern your use of the Third Party Software, and nothing in this license
-  limits any rights under, or grants rights that supersede, the terms of the
-  applicable license terms.
-
-  DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
-  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE
-  DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS
-  WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE
-  THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND
-  ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT
-  INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE SOFTWARE.
-
-  LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT,
-  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-  No support. Intel may make changes to the Software, at any time without notice,
-  and is not obligated to support, update or provide training for the Software.
-
-  Termination. Your right to use the Software is terminated in the event of your
-  breach of this license.
-
-  Feedback. Should you provide Intel with comments, modifications, corrections,
-  enhancements or other input ("Feedback") related to the Software, Intel will be
-  free to use, disclose, reproduce, license or otherwise distribute or exploit the
-  Feedback in its sole discretion without any obligations or restrictions of any
-  kind, including without limitation, intellectual property rights or licensing
-  obligations.
-
-  Compliance with laws. You agree to comply with all relevant laws and regulations
-  governing your use, transfer, import or export (or prohibition thereof) of the
-  Software.
+=====================================================================================
+
+5. Python*
+
+Terms and conditions for accessing or otherwise using Python
+Python software and documentation are licensed under the PSF License Agreement.
+
+Starting with Python 3.8.6, examples, recipes, and other code in the documentation are dual licensed under the PSF License Agreement and the Zero-Clause BSD license.
+
+Some software incorporated into Python is under different licenses. The licenses are listed with code falling under that license. See Licenses and Acknowledgements for Incorporated Software for an incomplete list of these licenses.
+
+PSF LICENSE AGREEMENT FOR PYTHON 3.12.2
+1. This LICENSE AGREEMENT is between the Python Software Foundation ("PSF"), and
+   the Individual or Organization ("Licensee") accessing and otherwise using Python
+   3.12.2 software in source or binary form and its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, PSF hereby
+   grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
+   analyze, test, perform and/or display publicly, prepare derivative works,
+   distribute, and otherwise use Python 3.12.2 alone or in any derivative
+   version, provided, however, that PSF's License Agreement and PSF's notice of
+   copyright, i.e., "Copyright © 2001-2023 Python Software Foundation; All Rights
+   Reserved" are retained in Python 3.12.2 alone or in any derivative version
+   prepared by Licensee.
+
+3. In the event Licensee prepares a derivative work that is based on or
+   incorporates Python 3.12.2 or any part thereof, and wants to make the
+   derivative work available to others as provided herein, then Licensee hereby
+   agrees to include in any such work a brief summary of the changes made to Python
+   3.12.2.
+
+4. PSF is making Python 3.12.2 available to Licensee on an "AS IS" basis.
+   PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.  BY WAY OF
+   EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND DISCLAIMS ANY REPRESENTATION OR
+   WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE
+   USE OF PYTHON 3.12.2 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON 3.12.2
+   FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF
+   MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 3.12.2, OR ANY DERIVATIVE
+   THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material breach of
+   its terms and conditions.
+
+7. Nothing in this License Agreement shall be deemed to create any relationship
+   of agency, partnership, or joint venture between PSF and Licensee.  This License
+   Agreement does not grant permission to use PSF trademarks or trade name in a
+   trademark sense to endorse or promote products or services of Licensee, or any
+   third party.
+
+8. By copying, installing or otherwise using Python 3.12.2, Licensee agrees
+   to be bound by the terms and conditions of this License Agreement.
+BEOPEN.COM LICENSE AGREEMENT FOR PYTHON 2.0
+BEOPEN PYTHON OPEN SOURCE LICENSE AGREEMENT VERSION 1
+
+1. This LICENSE AGREEMENT is between BeOpen.com ("BeOpen"), having an office at
+   160 Saratoga Avenue, Santa Clara, CA 95051, and the Individual or Organization
+   ("Licensee") accessing and otherwise using this software in source or binary
+   form and its associated documentation ("the Software").
+
+2. Subject to the terms and conditions of this BeOpen Python License Agreement,
+   BeOpen hereby grants Licensee a non-exclusive, royalty-free, world-wide license
+   to reproduce, analyze, test, perform and/or display publicly, prepare derivative
+   works, distribute, and otherwise use the Software alone or in any derivative
+   version, provided, however, that the BeOpen Python License is retained in the
+   Software, alone or in any derivative version prepared by Licensee.
+
+3. BeOpen is making the Software available to Licensee on an "AS IS" basis.
+   BEOPEN MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.  BY WAY OF
+   EXAMPLE, BUT NOT LIMITATION, BEOPEN MAKES NO AND DISCLAIMS ANY REPRESENTATION OR
+   WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE
+   USE OF THE SOFTWARE WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
+
+4. BEOPEN SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF THE SOFTWARE FOR
+   ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF USING,
+   MODIFYING OR DISTRIBUTING THE SOFTWARE, OR ANY DERIVATIVE THEREOF, EVEN IF
+   ADVISED OF THE POSSIBILITY THEREOF.
+
+5. This License Agreement will automatically terminate upon a material breach of
+   its terms and conditions.
+
+6. This License Agreement shall be governed by and interpreted in all respects
+   by the law of the State of California, excluding conflict of law provisions.
+   Nothing in this License Agreement shall be deemed to create any relationship of
+   agency, partnership, or joint venture between BeOpen and Licensee.  This License
+   Agreement does not grant permission to use BeOpen trademarks or trade names in a
+   trademark sense to endorse or promote products or services of Licensee, or any
+   third party.  As an exception, the "BeOpen Python" logos available at
+   http://www.pythonlabs.com/logos.html may be used according to the permissions
+   granted on that web page.
+
+7. By copying, installing or otherwise using the software, Licensee agrees to be
+   bound by the terms and conditions of this License Agreement.
+CNRI LICENSE AGREEMENT FOR PYTHON 1.6.1
+1. This LICENSE AGREEMENT is between the Corporation for National Research
+   Initiatives, having an office at 1895 Preston White Drive, Reston, VA 20191
+   ("CNRI"), and the Individual or Organization ("Licensee") accessing and
+   otherwise using Python 1.6.1 software in source or binary form and its
+   associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, CNRI hereby
+   grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
+   analyze, test, perform and/or display publicly, prepare derivative works,
+   distribute, and otherwise use Python 1.6.1 alone or in any derivative version,
+   provided, however, that CNRI's License Agreement and CNRI's notice of copyright,
+   i.e., "Copyright © 1995-2001 Corporation for National Research Initiatives; All
+   Rights Reserved" are retained in Python 1.6.1 alone or in any derivative version
+   prepared by Licensee.  Alternately, in lieu of CNRI's License Agreement,
+   Licensee may substitute the following text (omitting the quotes): "Python 1.6.1
+   is made available subject to the terms and conditions in CNRI's License
+   Agreement.  This Agreement together with Python 1.6.1 may be located on the
+   internet using the following unique, persistent identifier (known as a handle):
+   1895.22/1013.  This Agreement may also be obtained from a proxy server on the
+   internet using the following URL: http://hdl.handle.net/1895.22/1013."
+
+3. In the event Licensee prepares a derivative work that is based on or
+   incorporates Python 1.6.1 or any part thereof, and wants to make the derivative
+   work available to others as provided herein, then Licensee hereby agrees to
+   include in any such work a brief summary of the changes made to Python 1.6.1.
+
+4. CNRI is making Python 1.6.1 available to Licensee on an "AS IS" basis.  CNRI
+   MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.  BY WAY OF EXAMPLE,
+   BUT NOT LIMITATION, CNRI MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY
+   OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF
+   PYTHON 1.6.1 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS.
+
+5. CNRI SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON 1.6.1 FOR
+   ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF
+   MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON 1.6.1, OR ANY DERIVATIVE
+   THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF.
+
+6. This License Agreement will automatically terminate upon a material breach of
+   its terms and conditions.
+
+7. This License Agreement shall be governed by the federal intellectual property
+   law of the United States, including without limitation the federal copyright
+   law, and, to the extent such U.S. federal law does not apply, by the law of the
+   Commonwealth of Virginia, excluding Virginia's conflict of law provisions.
+   Notwithstanding the foregoing, with regard to derivative works based on Python
+   1.6.1 that incorporate non-separable material that was previously distributed
+   under the GNU General Public License (GPL), the law of the Commonwealth of
+   Virginia shall govern this License Agreement only as to issues arising under or
+   with respect to Paragraphs 4, 5, and 7 of this License Agreement.  Nothing in
+   this License Agreement shall be deemed to create any relationship of agency,
+   partnership, or joint venture between CNRI and Licensee.  This License Agreement
+   does not grant permission to use CNRI trademarks or trade name in a trademark
+   sense to endorse or promote products or services of Licensee, or any third
+   party.
+
+8. By clicking on the "ACCEPT" button where indicated, or by copying, installing
+   or otherwise using Python 1.6.1, Licensee agrees to be bound by the terms and
+   conditions of this License Agreement.
+CWI LICENSE AGREEMENT FOR PYTHON 0.9.0 THROUGH 1.2
+Copyright © 1991 - 1995, Stichting Mathematisch Centrum Amsterdam, The
+Netherlands.  All rights reserved.
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted, provided that
+the above copyright notice appear in all copies and that both that copyright
+notice and this permission notice appear in supporting documentation, and that
+the name of Stichting Mathematisch Centrum or CWI not be used in advertising or
+publicity pertaining to distribution of the software without specific, written
+prior permission.
+
+STICHTING MATHEMATISCH CENTRUM DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+EVENT SHALL STICHTING MATHEMATISCH CENTRUM BE LIABLE FOR ANY SPECIAL, INDIRECT
+OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
+ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON 3.12.2 DOCUMENTATION
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted.
 
-  Governing law. All disputes will be governed by the laws of the United States of
-  America and the State of Delaware without reference to conflict of law
-  principles and subject to the exclusive jurisdiction of the state or federal
-  courts sitting in the State of Delaware, and each party agrees that it submits
-  to the personal jurisdiction and venue of those courts and waives any
-  objections. The United Nations Convention on Contracts for the International
-  Sale of Goods (1980) is specifically excluded and will not apply to the
-  Software.
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+Licenses and Acknowledgements for Incorporated Software
+This section is an incomplete, but growing list of licenses and acknowledgements for third-party software incorporated in the Python distribution.
+
+Mersenne Twister
+The _random C extension underlying the random module includes code based on a download from http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/emt19937ar.html. The following are the verbatim comments from the original code:
+
+A C-program for MT19937, with initialization improved 2002/1/26.
+Coded by Takuji Nishimura and Makoto Matsumoto.
+
+Before using, initialize the state by using init_genrand(seed)
+or init_by_array(init_key, key_length).
+
+Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ 1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+
+ 3. The names of its contributors may not be used to endorse or promote
+    products derived from this software without specific prior written
+    permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+Any feedback is very welcome.
+http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+Sockets
+The socket module uses the functions, getaddrinfo(), and getnameinfo(), which are coded in separate source files from the WIDE Project, https://www.wide.ad.jp/.
+
+Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. Neither the name of the project nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+Asynchronous socket services
+The test.support.asynchat and test.support.asyncore modules contain the following notice:
+
+Copyright 1996 by Sam Rushing
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and
+its documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appear in all
+copies and that both that copyright notice and this permission
+notice appear in supporting documentation, and that the name of Sam
+Rushing not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+SAM RUSHING DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN
+NO EVENT SHALL SAM RUSHING BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+Cookie management
+The http.cookies module contains the following notice:
+
+Copyright 2000 by Timothy O'Malley <timo@alum.mit.edu>
+
+               All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software
+and its documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appear in all
+copies and that both that copyright notice and this permission
+notice appear in supporting documentation, and that the name of
+Timothy O'Malley  not be used in advertising or publicity
+pertaining to distribution of the software without specific, written
+prior permission.
+
+Timothy O'Malley DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS, IN NO EVENT SHALL Timothy O'Malley BE LIABLE FOR
+ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+Execution tracing
+The trace module contains the following notice:
+
+portions copyright 2001, Autonomous Zones Industries, Inc., all rights...
+err...  reserved and offered to the public under the terms of the
+Python 2.2 license.
+Author: Zooko O'Whielacronx
+http://zooko.com/
+mailto:zooko@zooko.com
+
+Copyright 2000, Mojam Media, Inc., all rights reserved.
+Author: Skip Montanaro
+
+Copyright 1999, Bioreason, Inc., all rights reserved.
+Author: Andrew Dalke
+
+Copyright 1995-1997, Automatrix, Inc., all rights reserved.
+Author: Skip Montanaro
+
+Copyright 1991-1995, Stichting Mathematisch Centrum, all rights reserved.
+
+
+Permission to use, copy, modify, and distribute this Python software and
+its associated documentation for any purpose without fee is hereby
+granted, provided that the above copyright notice appears in all copies,
+and that both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of neither Automatrix,
+Bioreason or Mojam Media be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior permission.
+UUencode and UUdecode functions
+The uu module contains the following notice:
+
+Copyright 1994 by Lance Ellinghouse
+Cathedral City, California Republic, United States of America.
+                       All Rights Reserved
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Lance Ellinghouse
+not be used in advertising or publicity pertaining to distribution
+of the software without specific, written prior permission.
+LANCE ELLINGHOUSE DISCLAIMS ALL WARRANTIES WITH REGARD TO
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS, IN NO EVENT SHALL LANCE ELLINGHOUSE CENTRUM BE LIABLE
+FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Modified by Jack Jansen, CWI, July 1995:
+- Use binascii module to do the actual line-by-line conversion
+  between ascii and binary. This results in a 1000-fold speedup. The C
+  version is still 5 times faster, though.
+- Arguments more compliant with Python standard
+XML Remote Procedure Calls
+The xmlrpc.client module contains the following notice:
+
+    The XML-RPC client interface is
+
+Copyright (c) 1999-2002 by Secret Labs AB
+Copyright (c) 1999-2002 by Fredrik Lundh
+
+By obtaining, using, and/or copying this software and/or its
+associated documentation, you agree that you have read, understood,
+and will comply with the following terms and conditions:
+
+Permission to use, copy, modify, and distribute this software and
+its associated documentation for any purpose and without fee is
+hereby granted, provided that the above copyright notice appears in
+all copies, and that both that copyright notice and this permission
+notice appear in supporting documentation, and that the name of
+Secret Labs AB or the author not be used in advertising or publicity
+pertaining to distribution of the software without specific, written
+prior permission.
+
+SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+OF THIS SOFTWARE.
+test_epoll
+The test.test_epoll module contains the following notice:
+
+Copyright (c) 2001-2006 Twisted Matrix Laboratories.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+Select kqueue
+The select module contains the following notice for the kqueue interface:
+
+Copyright (c) 2000 Doug White, 2006 James Knight, 2007 Christian Heimes
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+SipHash24
+The file Python/pyhash.c contains Marek Majkowski’ implementation of Dan Bernstein’s SipHash24 algorithm. It contains the following note:
+
+<MIT License>
+Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
--------------------------------------------------------------------------------
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+</MIT License>
+
+Original location:
+   https://github.com/majek/csiphash/
+
+Solution inspired by code from:
+   Samuel Neves (supercop/crypto_auth/siphash24/little)
+   djb (supercop/crypto_auth/siphash24/little2)
+   Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
+strtod and dtoa
+The file Python/dtoa.c, which supplies C functions dtoa and strtod for conversion of C doubles to and from strings, is derived from the file of the same name by David M. Gay, currently available from https://web.archive.org/web/20220517033456/http://www.netlib.org/fp/dtoa.c. The original file, as retrieved on March 16, 2009, contains the following copyright and licensing notice:
+
+/****************************************************************
+ *
+ * The author of this software is David M. Gay.
+ *
+ * Copyright (c) 1991, 2000, 2001 by Lucent Technologies.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ *
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHOR NOR LUCENT MAKES ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ *
+ ***************************************************************/
+OpenSSL
+The modules hashlib, posix, ssl, crypt use the OpenSSL library for added performance if made available by the operating system. Additionally, the Windows and macOS installers for Python may include a copy of the OpenSSL libraries, so we include a copy of the OpenSSL license here. For the OpenSSL 3.0 release, and later releases derived from that, the Apache License v2 applies:
+
+                              Apache License
+                        Version 2.0, January 2004
+                     https://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+expat
+The pyexpat extension is built using an included copy of the expat sources unless the build is configured --with-system-expat:
+
+Copyright (c) 1998, 1999, 2000 Thai Open Source Software Center Ltd
+                               and Clark Cooper
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+libffi
+The _ctypes C extension underlying the ctypes module is built using an included copy of the libffi sources unless the build is configured --with-system-libffi:
+
+Copyright (c) 1996-2008  Red Hat, Inc and others.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+zlib
+The zlib extension is built using an included copy of the zlib sources if the zlib version found on the system is too old to be used for the build:
+
+Copyright (C) 1995-2011 Jean-loup Gailly and Mark Adler
+
+This software is provided 'as-is', without any express or implied
+warranty.  In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source distribution.
+
+Jean-loup Gailly        Mark Adler
+jloup@gzip.org          madler@alumni.caltech.edu
+cfuhash
+The implementation of the hash table used by the tracemalloc is based on the cfuhash project:
+
+Copyright (c) 2005 Don Owens
+All rights reserved.
+
+This code is released under the BSD license:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials provided
+    with the distribution.
+
+  * Neither the name of the author nor the names of its
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
+libmpdec
+The _decimal C extension underlying the decimal module is built using an included copy of the libmpdec library unless the build is configured --with-system-libmpdec:
+
+Copyright (c) 2008-2020 Stefan Krah. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+W3C C14N test suite
+The C14N 2.0 test suite in the test package (Lib/test/xmltestdata/c14n-20/) was retrieved from the W3C website at https://www.w3.org/TR/xml-c14n2-testcases/ and is distributed under the 3-clause BSD license:
+
+Copyright (c) 2013 W3C(R) (MIT, ERCIM, Keio, Beihang),
+All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+* Redistributions of works must retain the original copyright notice,
+  this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the original copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of the W3C nor the names of its contributors may be
+  used to endorse or promote products derived from this work without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+Audioop
+The audioop module uses the code base in g771.c file of the SoX project. https://sourceforge.net/projects/sox/files/sox/12.17.7/sox-12.17.7.tar.gz
+
+This source code is a product of Sun Microsystems, Inc. and is provided for unrestricted use. Users may copy or modify this source code without charge.
+
+SUN SOURCE CODE IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE WARRANTIES OF DESIGN, MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE.
+
+Sun source code is provided with no support and without any obligation on the part of Sun Microsystems, Inc. to assist in its use, correction, modification or enhancement.
+
+SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY THIS SOFTWARE OR ANY PART THEREOF.
+
+In no event will Sun Microsystems, Inc. be liable for any lost revenue or profits or other special, indirect and consequential damages, even if Sun has been advised of the possibility of such damages.
+
+Sun Microsystems, Inc. 2550 Garcia Avenue Mountain View, California 94043
+
+asyncio
+Parts of the asyncio module are incorporated from uvloop 0.16, which is distributed under the MIT license:
+
+Copyright (c) 2015-2021 MagicStack Inc.  http://magic.io
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+=====================================================================================
 
 6. uthash
 
@@ -373,7 +1163,7 @@ terms are listed below.
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   
--------------------------------------------------------------------------------  
+=====================================================================================
 
 7. json-c
    
@@ -400,7 +1190,7 @@ terms are listed below.
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
 
--------------------------------------------------------------------------------
+=====================================================================================
 
 8. zlib 
    
@@ -421,7 +1211,7 @@ terms are listed below.
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.
   
--------------------------------------------------------------------------------
+=====================================================================================
 
 9. Intel(R) MPI Benchmarks 
    Copyright (c) Intel Corporation.
@@ -474,7 +1264,7 @@ terms are listed below.
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
--------------------------------------------------------------------------------
+=====================================================================================
 
 10.  OpenPMIx
      Most files in this release are marked with the copyrights of the
@@ -573,7 +1363,7 @@ terms are listed below.
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
--------------------------------------------------------------------------------
+=====================================================================================
 
  11. libpciaccess
   (C) Copyright IBM Corporation 2006, 2007
@@ -674,39 +1464,13 @@ terms are listed below.
   FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
   OTHER DEALINGS IN THE SOFTWARE.
 
--------------------------------------------------------------------------------
-
-12. Intel(R) oneAPI Level Zero (Level Zero)
-
-MIT License
-
-Copyright (C) 2019-2021 Intel Corporation
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
--------------------------------------------------------------------------------
+=====================================================================================
+=====================================================================================
  
   The following third party programs have their own third party programs. These
   additional third party program files are as follows:
   1. Intel(R) MPI Benchmarks <install_dir>/mpi/latest/opt/mpi/benchmarks/imb/license/third-party-programs.txt
-  2. Intel(R) Distribution for Python* <install_dir>/intelpython/latest/licensing/third-party-programs.txt
   
--------------------------------------------------------------------------------
+=====================================================================================
   
 * Other names and brands may be claimed as the property of others.
diff --git a/deps/ofi/include/rdma/fi_ext_psm2.h b/deps/ofi/include/rdma/fi_ext_psm2.h
deleted file mode 100644
index 3a48d83e1..000000000
--- a/deps/ofi/include/rdma/fi_ext_psm2.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2020 Intel Corporation. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef FI_EXT_PSM2_H
-#define FI_EXT_PSM2_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Provider specific name for fi_set_val() / fi_get_val() */
-#define	FI_PSM2_DISCONNECT	(1U | FI_PROV_SPECIFIC)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* FI_EXT_PSM2_H */
diff --git a/deps/ofi/lib/libfabric.so b/deps/ofi/lib/libfabric.so
index 193603003..c2c5008cd 100755
Binary files a/deps/ofi/lib/libfabric.so and b/deps/ofi/lib/libfabric.so differ
diff --git a/deps/ofi/lib/libfabric.so.1 b/deps/ofi/lib/libfabric.so.1
index 193603003..c2c5008cd 100755
Binary files a/deps/ofi/lib/libfabric.so.1 and b/deps/ofi/lib/libfabric.so.1 differ
diff --git a/deps/ofi/lib/prov/libpsm3-fi.so b/deps/ofi/lib/prov/libpsm3-fi.so
index 6dfdb8c17..ecb5972e1 100755
Binary files a/deps/ofi/lib/prov/libpsm3-fi.so and b/deps/ofi/lib/prov/libpsm3-fi.so differ
diff --git a/deps/ofi/lib/prov/libpsmx2-fi.so b/deps/ofi/lib/prov/libpsmx2-fi.so
index d16692935..efc6d127f 100755
Binary files a/deps/ofi/lib/prov/libpsmx2-fi.so and b/deps/ofi/lib/prov/libpsmx2-fi.so differ
diff --git a/deps/ofi/lib/prov/librxm-fi.so b/deps/ofi/lib/prov/librxm-fi.so
index 2018314bd..02e6c4378 100755
Binary files a/deps/ofi/lib/prov/librxm-fi.so and b/deps/ofi/lib/prov/librxm-fi.so differ
diff --git a/deps/ofi/lib/prov/libshm-fi.so b/deps/ofi/lib/prov/libshm-fi.so
index 7db1a1c78..a3d692051 100755
Binary files a/deps/ofi/lib/prov/libshm-fi.so and b/deps/ofi/lib/prov/libshm-fi.so differ
diff --git a/deps/ofi/lib/prov/libtcp-fi.so b/deps/ofi/lib/prov/libtcp-fi.so
index bce823911..e0321d75c 100755
Binary files a/deps/ofi/lib/prov/libtcp-fi.so and b/deps/ofi/lib/prov/libtcp-fi.so differ
diff --git a/deps/ofi/lib/prov/libverbs-1.1-fi.so b/deps/ofi/lib/prov/libverbs-1.1-fi.so
old mode 100644
new mode 100755
index 2384ebc22..9b5517678
Binary files a/deps/ofi/lib/prov/libverbs-1.1-fi.so and b/deps/ofi/lib/prov/libverbs-1.1-fi.so differ
diff --git a/deps/ofi/lib/prov/libverbs-1.12-fi.so b/deps/ofi/lib/prov/libverbs-1.12-fi.so
old mode 100644
new mode 100755
index 30f87d3be..5b1d225ac
Binary files a/deps/ofi/lib/prov/libverbs-1.12-fi.so and b/deps/ofi/lib/prov/libverbs-1.12-fi.so differ
diff --git a/deps/pmix/include/pmix.h b/deps/pmix/include/pmix.h
index d66067643..bb14ef1ec 100644
--- a/deps/pmix/include/pmix.h
+++ b/deps/pmix/include/pmix.h
@@ -444,7 +444,7 @@ PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata,
                                       const pmix_info_t directives[], size_t ndirs,
                                       pmix_op_cbfunc_t cbfunc, void *cbdata);
 
-/* Request an allocation operation from the host resource manager.
+/* Request an allocation operation from the host scheduler.
  * Several broad categories are envisioned, including the ability to:
  *
  * - request allocation of additional resources, including memory,
@@ -1066,9 +1066,6 @@ PMIX_EXPORT pmix_status_t PMIx_Compute_distances_nb(pmix_topology_t *topo,
  */
 PMIX_EXPORT pmix_status_t PMIx_Load_topology(pmix_topology_t *topo);
 
-
-PMIX_EXPORT void PMIx_Topology_destruct(pmix_topology_t *topo);
-
 /* Get the PU binding bitmap from its string representation
  *
  * cpuset_string - string representation of the binding bitmap
@@ -1087,8 +1084,6 @@ PMIX_EXPORT pmix_status_t PMIx_Parse_cpuset_string(const char *cpuset_string,
 
 PMIX_EXPORT pmix_status_t PMIx_Get_cpuset(pmix_cpuset_t *cpuset, pmix_bind_envelope_t ref);
 
-PMIX_EXPORT void PMIx_Cpuset_destruct(pmix_cpuset_t *cpuset);
-
 /* Get the relative locality of two local processes given their locality strings.
  *
  * locality1 - String returned by the PMIx_server_generate_locality_string API
@@ -1123,8 +1118,18 @@ PMIX_EXPORT void PMIx_Progress(void);
  * - pmix_iof_channel_t  (PMIX_IOF_CHANNEL)
  * - pmix_job_state_t  (PMIX_JOB_STATE)
  * - pmix_proc_state_t  (PMIX_PROC_STATE)
+ * - attribute string value of provided name
+ * - attribute name corresponding to provided string
+ * - pmix_link_state_t (PMIX_LINK_STATE)
+ * - pmix_device_type_t (PMIX_DEVTYPE)
+ * - pmix_value_cmp_t (enum)
+ * - pmix_info_t (PMIX_INFO)
+ * - pmix_value_t (PMIX_VALUE)
+ * - pmix_info_directives_t (PMIX_INFO_DIRECTIVES)
+ * - pmix_app_t (PMIX_APP)
  */
 PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t status);
+PMIX_EXPORT pmix_status_t PMIx_Error_code(const char *errname);
 PMIX_EXPORT const char* PMIx_Proc_state_string(pmix_proc_state_t state);
 PMIX_EXPORT const char* PMIx_Scope_string(pmix_scope_t scope);
 PMIX_EXPORT const char* PMIx_Persistence_string(pmix_persistence_t persist);
@@ -1144,6 +1149,7 @@ PMIX_EXPORT const char* PMIx_Value_comparison_string(pmix_value_cmp_t cmp);
 PMIX_EXPORT char* PMIx_Info_string(const pmix_info_t *info);
 PMIX_EXPORT char* PMIx_Value_string(const pmix_value_t *value);
 PMIX_EXPORT char* PMIx_Info_directives_string(pmix_info_directives_t directives);
+PMIX_EXPORT char* PMIx_App_string(const pmix_app_t *app);
 
 /* Get the PMIx version string. Note that the provided string is
  * statically defined and must NOT be free'd  */
@@ -1156,6 +1162,29 @@ PMIX_EXPORT pmix_status_t PMIx_Store_internal(const pmix_proc_t *proc,
                                               const char key[], pmix_value_t *val);
 
 
+/* Compute and return the size (in bytes) of the data
+ * payload in a pmix_value_t structure. Returns:
+ *
+ * - PMIX_SUCCESS if the value could be computed
+ *
+ * - an appropriate error value (e.g., PMIX_ERR_UNKNOWN_DATA_TYPE
+ *   if the data type is unknown) if the value could not be computed.
+ */
+PMIX_EXPORT pmix_status_t PMIx_Value_get_size(const pmix_value_t *val,
+                                              size_t *size);
+
+/* Compute and return the size (in bytes) of the data
+ * payload in a pmix_info_t structure. Returns:
+ *
+ * - PMIX_SUCCESS if the value could be computed
+ *
+ * - an appropriate error value (e.g., PMIX_ERR_UNKNOWN_DATA_TYPE
+ *   if the data type is unknown) if the value could not be computed.
+ */
+PMIX_EXPORT pmix_status_t PMIx_Info_get_size(const pmix_info_t *val,
+                                             size_t *size);
+
+
 /******    DATA BUFFER PACK/UNPACK SUPPORT    ******/
 /**
  * Top-level interface function to pack one or more values into a
@@ -1394,7 +1423,7 @@ PMIX_EXPORT pmix_status_t PMIx_Data_copy_payload(pmix_data_buffer_t *dest,
  * @note This is a destructive operation. While the payload is
  * undisturbed, the function will clear the buffer's pointers to the
  * payload. Thus, the buffer and the payload are completely separated,
- * leaving the caller free to the buffer.
+ * leaving the caller free to release the buffer.
  *
  * @param buffer A pointer to the buffer whose payload is to be
  * unloaded.
@@ -1568,6 +1597,82 @@ PMIX_EXPORT bool PMIx_Data_decompress(const uint8_t *inbytes,
  * see them and know they exist. So include them here as well. */
 
 #ifndef PMIx_DEPRECATED_H
+
+/* load a key */
+PMIX_EXPORT void PMIx_Load_key(pmix_key_t key, const char *src);
+
+/* check a key */
+PMIX_EXPORT bool PMIx_Check_key(const char *key, const char *str);
+
+/* check to see if a key is a "reserved" key */
+PMIX_EXPORT bool PMIx_Check_reserved_key(const char *key);
+
+/* load a string into a pmix_nspace_t struct */
+PMIX_EXPORT void PMIx_Load_nspace(pmix_nspace_t nspace, const char *str);
+
+/* check two nspace structs for equality */
+PMIX_EXPORT bool PMIx_Check_nspace(const char *key1, const char *key2);
+
+/* check if a namespace is invalid */
+PMIX_EXPORT bool PMIx_Nspace_invalid(const char *nspace);
+
+/* load a process ID struct */
+PMIX_EXPORT void PMIx_Load_procid(pmix_proc_t *p, 
+                                  const char *ns,
+                                  pmix_rank_t rk);
+
+/* transfer a process ID struct (non-destructive) */
+PMIX_EXPORT void PMIx_Xfer_procid(pmix_proc_t *dst,
+                                  const pmix_proc_t *src);
+
+/* check two procIDs for equality */
+PMIX_EXPORT bool PMIx_Check_procid(const pmix_proc_t *a,
+                                   const pmix_proc_t *b);
+
+/* check two ranks for equality */
+PMIX_EXPORT bool PMIx_Check_rank(pmix_rank_t a,
+                                 pmix_rank_t b);
+
+/* check if procID is invalid */
+PMIX_EXPORT bool PMIx_Procid_invalid(const pmix_proc_t *p);
+
+PMIX_EXPORT int PMIx_Argv_count(char **a);
+PMIX_EXPORT pmix_status_t PMIx_Argv_append_nosize(char ***argv, const char *arg);
+PMIX_EXPORT pmix_status_t PMIx_Argv_prepend_nosize(char ***argv, const char *arg);
+PMIX_EXPORT pmix_status_t PMIx_Argv_append_unique_nosize(char ***argv, const char *arg);
+PMIX_EXPORT void PMIx_Argv_free(char **argv);
+PMIX_EXPORT char **PMIx_Argv_split_inter(const char *src_string,
+                                         int delimiter,
+                                         bool include_empty);
+PMIX_EXPORT char **PMIx_Argv_split_with_empty(const char *src_string, int delimiter);
+PMIX_EXPORT char **PMIx_Argv_split(const char *src_string, int delimiter);
+PMIX_EXPORT char *PMIx_Argv_join(char **argv, int delimiter);
+PMIX_EXPORT char **PMIx_Argv_copy(char **argv);
+PMIX_EXPORT pmix_status_t PMIx_Setenv(const char *name,
+                                      const char *value,
+                                      bool overwrite,
+                                      char ***env);
+
+/* initialize a value struct */
+PMIX_EXPORT void PMIx_Value_construct(pmix_value_t *val);
+
+/* free memory stored inside a value struct */
+PMIX_EXPORT void PMIx_Value_destruct(pmix_value_t *val);
+
+/* create and initialize an array of value structs */
+PMIX_EXPORT pmix_value_t* PMIx_Value_create(size_t n);
+
+/* free memory stored inside an array of coord structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Value_free(pmix_value_t *v, size_t n);
+
+/* Check the given value struct to determine if it includes
+ * a boolean value (includes strings for "true" and "false",
+ * including abbreviations such as "t" or "f"), and if so,
+ * then its value. A value type of PMIX_UNDEF is taken to imply
+ * a boolean "true". */
+PMIX_EXPORT pmix_boolean_t PMIx_Value_true(const pmix_value_t *v);
+
 /* Load data into a pmix_value_t structure. The data can be of any
  * PMIx data type - which means the load can be somewhat complex
  * to implement (e.g., in the case of a pmix_data_array_t). The
@@ -1582,8 +1687,6 @@ PMIX_EXPORT pmix_status_t PMIx_Value_unload(pmix_value_t *val,
                                             void **data,
                                             size_t *sz);
 
-PMIX_EXPORT void PMIx_Value_destruct(pmix_value_t *val);
-
 /* Transfer data from one pmix_value_t to another - this is actually
  * executed as a COPY operation, so the original data is not altered.
  */
@@ -1594,8 +1697,32 @@ PMIX_EXPORT pmix_status_t PMIx_Value_xfer(pmix_value_t *dest,
 PMIX_EXPORT pmix_value_cmp_t PMIx_Value_compare(pmix_value_t *v1,
                                                 pmix_value_t *v2);
 
+
+
 PMIX_EXPORT void PMIx_Data_array_destruct(pmix_data_array_t *d);
 
+
+/* initialize an info struct */
+PMIX_EXPORT void PMIx_Info_construct(pmix_info_t *p);
+
+/* free memory stored inside an info struct */
+PMIX_EXPORT void PMIx_Info_destruct(pmix_info_t *p);
+
+/* create and initialize an array of info structs */
+PMIX_EXPORT pmix_info_t* PMIx_Info_create(size_t n);
+
+/* free memory stored inside an array of coord structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Info_free(pmix_info_t *p, size_t n);
+
+/* Check the given info struct to determine if it includes
+ * a boolean value (includes strings for "true" and "false",
+ * including abbreviations such as "t" or "f"), and if so,
+ * then its value. A value type of PMIX_UNDEF is taken to imply
+ * a boolean "true" as the presence of the key defaults to
+ * indicating "true". */
+PMIX_EXPORT pmix_boolean_t PMIx_Info_true(const pmix_info_t *p);
+
 /* Load key/value data into a pmix_info_t struct. Note that this
  * effectively is a PMIX_LOAD_KEY operation to copy the key,
  * followed by a PMIx_Value_load to COPY the data into the
@@ -1610,6 +1737,288 @@ PMIX_EXPORT pmix_status_t PMIx_Info_load(pmix_info_t *info,
 PMIX_EXPORT pmix_status_t PMIx_Info_xfer(pmix_info_t *dest,
                                          const pmix_info_t *src);
 
+/* mark the info struct as required */
+PMIX_EXPORT void PMIx_Info_required(pmix_info_t *p);
+
+/* mark the info struct as optional */
+PMIX_EXPORT void PMIx_Info_optional(pmix_info_t *p);
+
+/* check if the info struct is required */
+PMIX_EXPORT bool PMIx_Info_is_required(const pmix_info_t *p);
+
+/* check if the info struct is optional */
+PMIX_EXPORT bool PMIx_Info_is_optional(const pmix_info_t *p);
+
+/* mark the info struct as processed */
+PMIX_EXPORT void PMIx_Info_processed(pmix_info_t *p);
+
+/* check if the info struct has been processed */
+PMIX_EXPORT bool PMIx_Info_was_processed(const pmix_info_t *p);
+
+/* mark the info struct as the end of an array */
+PMIX_EXPORT void PMIx_Info_set_end(pmix_info_t *p);
+
+/* check if the info struct is the end of an array */
+PMIX_EXPORT bool PMIx_Info_is_end(const pmix_info_t *p);
+
+/* mark the info as a qualifier */
+PMIX_EXPORT void PMIx_Info_qualifier(pmix_info_t *p);
+
+/* check if the info struct is a qualifier */
+PMIX_EXPORT bool PMIx_Info_is_qualifier(const pmix_info_t *p);
+
+/* mark the info struct as persistent - do NOT release its contents */
+PMIX_EXPORT void PMIx_Info_persistent(pmix_info_t *p);
+
+/* check if the info struct is persistent */
+PMIX_EXPORT bool PMIx_Info_is_persistent(const pmix_info_t *p);
+
+
+/* initialize a coord struct */
+PMIX_EXPORT void PMIx_Coord_construct(pmix_coord_t *m);
+
+/* free memory stored inside a coord struct */
+PMIX_EXPORT void PMIx_Coord_destruct(pmix_coord_t *m);
+
+/* create and initialize an array of coord structs */
+PMIX_EXPORT pmix_coord_t* PMIx_Coord_create(size_t dims,
+                                            size_t number);
+
+/* free memory stored inside an array of coord structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Coord_free(pmix_coord_t *m, size_t number);
+
+
+/* initialize a topology struct */
+PMIX_EXPORT void PMIx_Topology_construct(pmix_topology_t *t);
+
+/* free memory stored inside a topology struct */
+PMIX_EXPORT void PMIx_Topology_destruct(pmix_topology_t *topo);
+
+/* create and initialize an array of topology structs */
+PMIX_EXPORT pmix_topology_t* PMIx_Topology_create(size_t n);
+
+/* free memory stored inside an array of topology structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Topology_free(pmix_topology_t *t, size_t n);
+
+/* initialize a cpuset struct */
+PMIX_EXPORT void PMIx_Cpuset_construct(pmix_cpuset_t *cpuset);
+
+/* free memory stored inside a cpuset struct */
+PMIX_EXPORT void PMIx_Cpuset_destruct(pmix_cpuset_t *cpuset);
+
+/* create and initialize an array of cpuset structs */
+PMIX_EXPORT pmix_cpuset_t* PMIx_Cpuset_create(size_t n);
+
+/* free memory stored inside an array of cpuset structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Cpuset_free(pmix_cpuset_t *c, size_t n);
+
+/* initialize a geometry struct */
+PMIX_EXPORT void PMIx_Geometry_construct(pmix_geometry_t *g);
+
+/* free memory stored inside a cpuset struct */
+PMIX_EXPORT void PMIx_Geometry_destruct(pmix_geometry_t *g);
+
+/* create and initialize an array of cpuset structs */
+PMIX_EXPORT pmix_geometry_t* PMIx_Geometry_create(size_t n);
+
+/* free memory stored inside an array of cpuset structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Geometry_free(pmix_geometry_t *g, size_t n);
+
+/* initialize a device distance struct */
+PMIX_EXPORT void PMIx_Device_distance_construct(pmix_device_distance_t *d);
+
+/* free memory stored inside a device distance struct */
+PMIX_EXPORT void PMIx_Device_distance_destruct(pmix_device_distance_t *d);
+
+/* create and initialize an array of device distance structs */
+PMIX_EXPORT pmix_device_distance_t* PMIx_Device_distance_create(size_t n);
+
+/* free memory stored inside an array of device distance structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Device_distance_free(pmix_device_distance_t *d, size_t n);
+
+
+/* initialize a byte object struct */
+PMIX_EXPORT void PMIx_Byte_object_construct(pmix_byte_object_t *b);
+
+/* free memory stored inside a byte object struct */
+PMIX_EXPORT void PMIx_Byte_object_destruct(pmix_byte_object_t *g);
+
+/* create and initialize an array of byte object structs */
+PMIX_EXPORT pmix_byte_object_t* PMIx_Byte_object_create(size_t n);
+
+/* free memory stored inside an array of byte object structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Byte_object_free(pmix_byte_object_t *g, size_t n);
+
+/* load a byte object */
+PMIX_EXPORT void PMIx_Byte_object_load(pmix_byte_object_t *b,
+                                       char *d, size_t sz);
+
+/* initialize an endpoint struct */
+PMIX_EXPORT void PMIx_Endpoint_construct(pmix_endpoint_t *e);
+
+/* free memory stored inside an endpoint struct */
+PMIX_EXPORT void PMIx_Endpoint_destruct(pmix_endpoint_t *e);
+
+/* create and initialize an array of endpoint structs */
+PMIX_EXPORT pmix_endpoint_t* PMIx_Endpoint_create(size_t n);
+
+/* free memory stored inside an array of endpoint structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Endpoint_free(pmix_endpoint_t *e, size_t n);
+
+
+/* initialize an envar struct */
+PMIX_EXPORT void PMIx_Envar_construct(pmix_envar_t *e);
+
+/* free memory stored inside an envar struct */
+PMIX_EXPORT void PMIx_Envar_destruct(pmix_envar_t *e);
+
+/* create and initialize an array of envar structs */
+PMIX_EXPORT pmix_envar_t* PMIx_Envar_create(size_t n);
+
+/* free memory stored inside an array of envar structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Envar_free(pmix_envar_t *e, size_t n);
+
+/* load an envar struct */
+PMIX_EXPORT void PMIx_Envar_load(pmix_envar_t *e,
+                                 char *var,
+                                 char *value,
+                                 char separator);
+
+/* initialize a data buffer struct */
+PMIX_EXPORT void PMIx_Data_buffer_construct(pmix_data_buffer_t *b);
+
+/* free memory stored inside a data buffer struct */
+PMIX_EXPORT void PMIx_Data_buffer_destruct(pmix_data_buffer_t *b);
+
+/* create a data buffer struct */
+PMIX_EXPORT pmix_data_buffer_t* PMIx_Data_buffer_create(void);
+
+/* free memory stored inside a data buffer struct */
+PMIX_EXPORT void PMIx_Data_buffer_release(pmix_data_buffer_t *b);
+
+/* load a data buffer struct */
+PMIX_EXPORT void PMIx_Data_buffer_load(pmix_data_buffer_t *b,
+                                       char *bytes, size_t sz);
+
+/* unload a data buffer struct */
+PMIX_EXPORT void PMIx_Data_buffer_unload(pmix_data_buffer_t *b,
+                                         char **bytes, size_t *sz);
+
+
+/* initialize a proc struct */
+PMIX_EXPORT void PMIx_Proc_construct(pmix_proc_t *p);
+
+/* clear memory inside a proc struct */
+PMIX_EXPORT void PMIx_Proc_destruct(pmix_proc_t *p);
+
+/* create and initialize an array of proc structs */
+PMIX_EXPORT pmix_proc_t* PMIx_Proc_create(size_t n);
+
+/* free memory stored inside an array of proc structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Proc_free(pmix_proc_t *p, size_t n);
+
+/* load a proc struct */
+PMIX_EXPORT void PMIx_Proc_load(pmix_proc_t *p,
+                                char *nspace, pmix_rank_t rank);
+
+/* construct a multicluster nspace struct from cluster and nspace values */
+PMIX_EXPORT void PMIx_Multicluster_nspace_construct(pmix_nspace_t target,
+                                                    pmix_nspace_t cluster,
+                                                    pmix_nspace_t nspace);
+
+/* parse a multicluster nspace struct to separate out the cluster
+ * and nspace portions */
+PMIX_EXPORT void PMIx_Multicluster_nspace_parse(pmix_nspace_t target,
+                                                pmix_nspace_t cluster,
+                                                pmix_nspace_t nspace);
+
+
+/* initialize a proc info struct */
+PMIX_EXPORT void PMIx_Proc_info_construct(pmix_proc_info_t *p);
+
+/* clear memory inside a proc info struct */
+PMIX_EXPORT void PMIx_Proc_info_destruct(pmix_proc_info_t *p);
+
+/* create and initialize an array of proc info structs */
+PMIX_EXPORT pmix_proc_info_t* PMIx_Proc_info_create(size_t n);
+
+/* free memory stored inside an array of proc info structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Proc_info_free(pmix_proc_info_t *p, size_t n);
+
+
+/* initialize a proc stats struct */
+PMIX_EXPORT void PMIx_Proc_stats_construct(pmix_proc_stats_t *p);
+
+/* clear memory inside a proc stats struct */
+PMIX_EXPORT void PMIx_Proc_stats_destruct(pmix_proc_stats_t *p);
+
+/* create and initialize an array of proc stats structs */
+PMIX_EXPORT pmix_proc_stats_t* PMIx_Proc_stats_create(size_t n);
+
+/* free memory stored inside an array of proc stats structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Proc_stats_free(pmix_proc_stats_t *p, size_t n);
+
+
+/* initialize a disk stats struct */
+PMIX_EXPORT void PMIx_Disk_stats_construct(pmix_disk_stats_t *p);
+
+/* clear memory inside a disk stats struct */
+PMIX_EXPORT void PMIx_Disk_stats_destruct(pmix_disk_stats_t *p);
+
+/* create and initialize an array of disk stats structs */
+PMIX_EXPORT pmix_disk_stats_t* PMIx_Disk_stats_create(size_t n);
+
+/* free memory stored inside an array of disk stats structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Disk_stats_free(pmix_disk_stats_t *p, size_t n);
+
+
+/* initialize a net stats struct */
+PMIX_EXPORT void PMIx_Net_stats_construct(pmix_net_stats_t *p);
+
+/* clear memory inside a net stats struct */
+PMIX_EXPORT void PMIx_Net_stats_destruct(pmix_net_stats_t *p);
+
+/* create and initialize an array of net stats structs */
+PMIX_EXPORT pmix_net_stats_t* PMIx_Net_stats_create(size_t n);
+
+/* free memory stored inside an array of net stats structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Net_stats_free(pmix_net_stats_t *p, size_t n);
+
+
+/* initialize a pdata struct */
+PMIX_EXPORT void PMIx_Pdata_construct(pmix_pdata_t *p);
+
+/* clear memory inside a pdata struct */
+PMIX_EXPORT void PMIx_Pdata_destruct(pmix_pdata_t *p);
+
+/* create and initialize an array of pdata structs */
+PMIX_EXPORT pmix_pdata_t* PMIx_Pdata_create(size_t n);
+
+/* free memory stored inside an array of pdata structs (does
+ * not free the struct memory itself */
+PMIX_EXPORT void PMIx_Pdata_free(pmix_pdata_t *p, size_t n);
+
+
+PMIX_EXPORT void PMIx_App_construct(pmix_app_t *p);
+PMIX_EXPORT void PMIx_App_destruct(pmix_app_t *p);
+PMIX_EXPORT pmix_app_t* PMIx_App_create(size_t n);
+PMIX_EXPORT void PMIx_App_info_create(pmix_app_t *p, size_t n);
+PMIX_EXPORT void PMIx_App_free(pmix_app_t *p, size_t n);
+PMIX_EXPORT void PMIx_App_release(pmix_app_t *p);
+
 /* Constructing arrays of pmix_info_t for passing to an API can
  * be tedious since the pmix_info_t itself is not a "list object".
  * Since this is a very frequent operation, a set of APIs has been
@@ -1629,6 +2038,11 @@ PMIX_EXPORT pmix_status_t PMIx_Info_list_add(void *ptr,
                                              const void *value,
                                              pmix_data_type_t type);
 
+PMIX_EXPORT pmix_status_t PMIx_Info_list_prepend(void *ptr,
+                                                 const char *key,
+                                                 const void *value,
+                                                 pmix_data_type_t type);
+
 PMIX_EXPORT pmix_status_t PMIx_Info_list_insert(void *ptr, pmix_info_t *info);
 
 /* Transfer the data in an existing pmix_info_t struct to a list. This
@@ -1646,6 +2060,12 @@ PMIX_EXPORT pmix_status_t PMIx_Info_list_convert(void *ptr, pmix_data_array_t *p
 /* Release all data on the list and destruct all internal tracking */
 PMIX_EXPORT void PMIx_Info_list_release(void *ptr);
 
+/* retrieve the next info on the list - passing a NULL
+ * to the "prev" parameter will return the first pmix_info_t
+ * on the list. A return of NULL indicates the end of the list
+ */
+PMIX_EXPORT pmix_info_t* PMIx_Info_list_get_info(void *ptr, void *prev, void **next);
+
 #endif
 
 #if defined(c_plusplus) || defined(__cplusplus)
diff --git a/deps/pmix/include/pmix_common.h b/deps/pmix/include/pmix_common.h
index 621ead167..1d78bcb87 100644
--- a/deps/pmix/include/pmix_common.h
+++ b/deps/pmix/include/pmix_common.h
@@ -47,7 +47,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (c) 2020      Cisco Systems, Inc.  All rights reserved
- * Copyright (c) 2021-2022 Nanook Consulting  All rights reserved.
+ * Copyright (c) 2021-2023 Nanook Consulting  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -69,8 +69,6 @@
 #include <unistd.h> /* for uid_t and gid_t */
 #include <sys/types.h> /* for uid_t and gid_t */
 
-extern char **environ;
-
 /* Whether C compiler supports -fvisibility */
 #define PMIX_HAVE_VISIBILITY 1
 
@@ -158,6 +156,9 @@ typedef uint32_t pmix_rank_t;
 /* initialization attributes */
 #define PMIX_EXTERNAL_PROGRESS              "pmix.evext"            // (bool) The host shall progress the PMIx library via
                                                                     //        calls to PMIx_Progress
+#define PMIX_EXTERNAL_AUX_EVENT_BASE        "pmix.evaux"            // (void*) event base to be used for auxiliary
+                                                                    //        functions (e.g., capturing signals) that would
+                                                                    //        otherwise interfere with the host
 #define PMIX_SERVER_TOOL_SUPPORT            "pmix.srvr.tool"        // (bool) The host RM wants to declare itself as willing
                                                                     //        to accept tool connection requests
 #define PMIX_SERVER_REMOTE_CONNECTIONS      "pmix.srvr.remote"      // (bool) Allow connections from remote tools (do not use
@@ -201,6 +202,7 @@ typedef uint32_t pmix_rank_t;
 #define PMIX_CONNECT_TO_SYSTEM              "pmix.cnct.sys"         // (bool) The requestor requires that a connection be made only to
                                                                     //        a local system-level PMIx server
 #define PMIX_CONNECT_SYSTEM_FIRST           "pmix.cnct.sys.first"   // (bool) Preferentially look for a system-level PMIx server first
+#define PMIX_CONNECT_TO_SCHEDULER           "pmix.cnct.sched"       // (bool) Connect to the system scheduler
 #define PMIX_SERVER_URI                     "pmix.srvr.uri"         // (char*) URI of server to be contacted
 #define PMIX_MYSERVER_URI                   "pmix.mysrvr.uri"       // (char*) URI of this proc's listener socket
 #define PMIX_SERVER_HOSTNAME                "pmix.srvr.host"        // (char*) node where target server is located
@@ -459,6 +461,10 @@ typedef uint32_t pmix_rank_t;
 #define PMIX_DISPLAY_ALLOCATION             "pmix.dispalloc"        // (bool) display the resource allocation
 #define PMIX_DISPLAY_TOPOLOGY               "pmix.disptopo"         // (char*) comma-delimited list of hosts whose topology is
                                                                     //         to be displayed
+#define PMIX_DISPLAY_PROCESSORS             "pmix.dispcpus"         // (char*) comma-delimited list of hosts whose available
+                                                                    //         CPUs are to be displayed
+#define PMIX_DISPLAY_PARSEABLE_OUTPUT       "pmix.dispparse"        // (bool) display requested info in a format more amenable
+                                                                    //        to machine parsing
 #define PMIX_PPR                            "pmix.ppr"              // (char*) #procs to spawn on each identified resource
 #define PMIX_MAPBY                          "pmix.mapby"            // (char*) mapping policy
 #define PMIX_RANKBY                         "pmix.rankby"           // (char*) ranking policy
@@ -546,7 +552,7 @@ typedef uint32_t pmix_rank_t;
 #define PMIX_QUERY_PROC_TABLE               "pmix.qry.ptable"       // (pmix_data_array_t*) returns (pmix_data_array_t*) an array of pmix_proc_info_t
                                                                     //         REQUIRES a PMIX_NSPACE qualifier indicating the nspace being queried
 #define PMIX_QUERY_LOCAL_PROC_TABLE         "pmix.qry.lptable"      // (pmix_data_array_t*) returns (pmix_data_array_t*) an array of pmix_proc_info_t
-                                                                    //         of pmix_proc_info_t for procs in job on same node
+                                                                    //         for procs in job on same node
                                                                     //         REQUIRES a PMIX_NSPACE qualifier indicating the nspace being queried
 #define PMIX_QUERY_AUTHORIZATIONS           "pmix.qry.auths"        // (pmix_data_array_t*) return operations tool is authorized to perform. The contents
                                                                     //         of the array elements have not yet been standardized. NO QUALIFIERS
@@ -556,7 +562,8 @@ typedef uint32_t pmix_rank_t;
                                                                     //        SUPPORTED QUALIFIERS: PMIX_NSPACE/PMIX_RANK, or PMIX_PROCID of specific proc(s)
                                                                     //        whose info is being requested
 #define PMIX_QUERY_ALLOC_STATUS             "pmix.query.alloc"      // (char*) return a string reporting status of an allocation request
-                                                                    //         REQUIRES a PMIX_ALLOC_ID qualifier indicating the allocation request being queried
+                                                                    //         REQUIRES a PMIX_ALLOC_REQUEST_ID qualifier indicating the allocation request
+                                                                    //         being queried
 #define PMIX_TIME_REMAINING                 "pmix.time.remaining"   // (uint32_t) returns number of seconds remaining in allocation
                                                                     //         for the specified nspace (defaults to allocation containing the caller)
                                                                     //         SUPPORTED QUALIFIERS: PMIX_NSPACE of the nspace whose info is being requested
@@ -624,7 +631,7 @@ typedef uint32_t pmix_rank_t;
                                                                     //        the query on the key.
 
 
-/* PMIx_Get information retrieval attributes */
+/* PMIx_Get information retrieval qualifiers */
 #define PMIX_SESSION_INFO                   "pmix.ssn.info"         // (bool) Return information about the specified session. If information
                                                                     //        about a session other than the one containing the requesting
                                                                     //        process is desired, then the attribute array must contain a
@@ -833,6 +840,8 @@ typedef uint32_t pmix_rank_t;
 #define PMIX_ALLOC_FABRIC_ENDPTS_NODE       "pmix.alloc.endpts.nd"  // (size_t) number of endpoints to allocate per node
 #define PMIX_ALLOC_FABRIC_SEC_KEY           "pmix.alloc.nsec"       // (pmix_byte_object_t) fabric security key
 #define PMIX_ALLOC_QUEUE                    "pmix.alloc.queue"      // (char*) name of queue being referenced
+#define PMIX_ALLOC_PREEMPTIBLE              "pmix.alloc.preempt"    // (bool) by default, all jobs in the resulting allocation are to be
+                                                                    //        considered preemptible (overridable at per-job level)
 
 
 /* job control attributes */
@@ -1517,14 +1526,14 @@ typedef uint32_t pmix_info_directives_t;
 
 /* define a set of directives for allocation requests */
 typedef uint8_t pmix_alloc_directive_t;
-#define PMIX_ALLOC_NEW          1  // new allocation is being requested. The resulting allocation will be
-                                   // disjoint (i.e., not connected in a job sense) from the requesting allocation
-#define PMIX_ALLOC_EXTEND       2  // extend the existing allocation, either in time or as additional resources
-#define PMIX_ALLOC_RELEASE      3  // release part of the existing allocation. Attributes in the accompanying
-                                   // pmix\_info\_t array may be used to specify permanent release of the
-                                   // identified resources, or "lending" of those resources for some period
-                                   // of time.
-#define PMIX_ALLOC_REAQUIRE     4  // reacquire resources that were previously "lent" back to the scheduler
+#define PMIX_ALLOC_NEW          1   // new allocation is being requested. The resulting allocation will be
+                                    // disjoint (i.e., not connected in a job sense) from the requesting allocation
+#define PMIX_ALLOC_EXTEND       2   // extend the existing allocation, either in time or as additional resources
+#define PMIX_ALLOC_RELEASE      3   // release part or all of the existing allocation. Attributes in the accompanying
+                                    // pmix\_info\_t array may be used to specify permanent release of the
+                                    // identified resources, or "lending" of those resources for some period
+                                    // of time.
+#define PMIX_ALLOC_REAQUIRE     4   // reacquire resources that were previously "lent" back to the scheduler
 
 /* define a value boundary beyond which implementers are free
  * to define their own directive values */
@@ -1624,58 +1633,6 @@ static inline void* pmix_calloc(size_t n, size_t m)
     return calloc(n, m);
 }
 
-/* declare a convenience macro for checking keys */
-#define PMIX_CHECK_KEY(a, b) \
-    (0 == strncmp((a)->key, (b), PMIX_MAX_KEYLEN))
-
-#define PMIX_CHECK_RESERVED_KEY(a) \
-    (0 == strncmp((a), "pmix", 4))
-
-#define PMIX_LOAD_KEY(a, b)                                                 \
-    do {                                                                    \
-        memset((a), 0, PMIX_MAX_KEYLEN+1);                                  \
-        if (NULL != (b)) {                                                  \
-            pmix_strncpy((char*)(a), (const char*)(b), PMIX_MAX_KEYLEN);    \
-        }                                                                   \
-    }while(0)
-
-/* define a convenience macro for loading nspaces */
-#define PMIX_LOAD_NSPACE(a, b)                              \
-    do {                                                    \
-        memset((a), 0, PMIX_MAX_NSLEN+1);                   \
-        if (NULL != (b)) {                                  \
-            pmix_strncpy((char*)(a), (b), PMIX_MAX_NSLEN);  \
-        }                                                   \
-    }while(0)
-
-/* define a convenience macro for checking nspaces */
-#define PMIX_CHECK_NSPACE(a, b) \
-    (PMIX_NSPACE_INVALID((a)) || PMIX_NSPACE_INVALID((b)) || 0 == strncmp((a), (b), PMIX_MAX_NSLEN))
-
-/* define a convenience macro for loading names */
-#define PMIX_LOAD_PROCID(a, b, c)               \
-    do {                                        \
-        PMIX_LOAD_NSPACE((a)->nspace, (b));     \
-        (a)->rank = (c);                        \
-    }while(0)
-
-#define PMIX_XFER_PROCID(a, b)      \
-    memcpy((a), (b), sizeof(pmix_proc_t))
-
-#define PMIX_PROCID_XFER(a, b) PMIX_XFER_PROCID(a, b)
-
-/* define a convenience macro for checking names */
-#define PMIX_CHECK_PROCID(a, b) \
-    (PMIX_CHECK_NSPACE((a)->nspace, (b)->nspace) && ((a)->rank == (b)->rank || (PMIX_RANK_WILDCARD == (a)->rank || PMIX_RANK_WILDCARD == (b)->rank)))
-
-#define PMIX_CHECK_RANK(a, b) \
-    ((a) == (b) || (PMIX_RANK_WILDCARD == (a) || PMIX_RANK_WILDCARD == (b)))
-
-#define PMIX_NSPACE_INVALID(a) \
-    (NULL == (a) || 0 == pmix_nslen((a)))
-
-#define PMIX_PROCID_INVALID(a)  \
-    (PMIX_NSPACE_INVALID((a)->nspace) || PMIX_RANK_INVALID == (a)->rank)
 
 /**
  * Provide a safe version of strncpy that doesn't generate
@@ -1741,459 +1698,6 @@ static inline size_t pmix_nslen(const char *src)
     return i;
 }
 
-static inline
-int pmix_argv_count(char **argv)
-{
-    char **p;
-    int i;
-
-    if (NULL == argv)
-        return 0;
-
-    for (i = 0, p = argv; *p; i++, p++)
-        continue;
-
-    return i;
-}
-
-#define PMIX_ARGV_COUNT(r, a) \
-    (r) = pmix_argv_count(a)
-
-static inline
-pmix_status_t pmix_argv_append_nosize(char ***argv, const char *arg)
-{
-    int argc;
-
-    /* Create new argv. */
-
-    if (NULL == *argv) {
-        *argv = (char **) malloc(2 * sizeof(char *));
-        if (NULL == *argv) {
-            return PMIX_ERR_OUT_OF_RESOURCE;
-        }
-        argc = 0;
-        (*argv)[0] = NULL;
-        (*argv)[1] = NULL;
-    }
-
-    /* Extend existing argv. */
-    else {
-        /* count how many entries currently exist */
-        argc = pmix_argv_count(*argv);
-
-        *argv = (char **) realloc(*argv, (argc + 2) * sizeof(char *));
-        if (NULL == *argv) {
-            return PMIX_ERR_OUT_OF_RESOURCE;
-        }
-    }
-
-    /* Set the newest element to point to a copy of the arg string */
-
-    (*argv)[argc] = strdup(arg);
-    if (NULL == (*argv)[argc]) {
-        return PMIX_ERR_OUT_OF_RESOURCE;
-    }
-
-    argc = argc + 1;
-    (*argv)[argc] = NULL;
-
-    return PMIX_SUCCESS;
-}
-
-#define PMIX_ARGV_APPEND(r, a, b) \
-    (r) = pmix_argv_append_nosize(&(a), (b))
-
-static inline
-pmix_status_t pmix_argv_prepend_nosize(char ***argv, const char *arg)
-{
-    int argc;
-    int i;
-
-    /* Create new argv. */
-
-    if (NULL == *argv) {
-        *argv = (char **) malloc(2 * sizeof(char *));
-        if (NULL == *argv) {
-            return PMIX_ERR_OUT_OF_RESOURCE;
-        }
-        (*argv)[0] = strdup(arg);
-        (*argv)[1] = NULL;
-    } else {
-        /* count how many entries currently exist */
-        argc = pmix_argv_count(*argv);
-
-        *argv = (char **) realloc(*argv, (argc + 2) * sizeof(char *));
-        if (NULL == *argv) {
-            return PMIX_ERR_OUT_OF_RESOURCE;
-        }
-        (*argv)[argc + 1] = NULL;
-
-        /* shift all existing elements down 1 */
-        for (i = argc; 0 < i; i--) {
-            (*argv)[i] = (*argv)[i - 1];
-        }
-        (*argv)[0] = strdup(arg);
-    }
-
-    return PMIX_SUCCESS;
-}
-
-#define PMIX_ARGV_PREPEND(r, a, b) \
-    (r) = pmix_argv_prepend_nosize(&(a), b)
-
-static inline
-pmix_status_t pmix_argv_append_unique_nosize(char ***argv, const char *arg)
-{
-    int i;
-
-    /* if the provided array is NULL, then the arg cannot be present,
-     * so just go ahead and append
-     */
-    if (NULL == *argv) {
-        return pmix_argv_append_nosize(argv, arg);
-    }
-
-    /* see if this arg is already present in the array */
-    for (i = 0; NULL != (*argv)[i]; i++) {
-        if (0 == strcmp(arg, (*argv)[i])) {
-            /* already exists */
-            return PMIX_SUCCESS;
-        }
-    }
-
-    /* we get here if the arg is not in the array - so add it */
-    return pmix_argv_append_nosize(argv, arg);
-}
-
-#define PMIX_ARGV_APPEND_UNIQUE(r, a, b) \
-    (r) = pmix_argv_append_unique_nosize(a, b)
-
-static inline void pmix_argv_free(char **argv)
-{
-    char **p;
-
-    if (NULL == argv)
-        return;
-
-    for (p = argv; NULL != *p; ++p) {
-        pmix_free(*p);
-    }
-
-    pmix_free(argv);
-}
-
-#define PMIX_ARGV_FREE(a)  pmix_argv_free(a)
-
-static inline
-char **pmix_argv_split_inter(const char *src_string,
-                             int delimiter,
-                             bool include_empty)
-{
-    char arg[512];
-    char **argv = NULL;
-    const char *p;
-    char *argtemp;
-    size_t arglen;
-
-    while (src_string && *src_string) {
-        p = src_string;
-        arglen = 0;
-
-        while (('\0' != *p) && (*p != delimiter)) {
-            ++p;
-            ++arglen;
-        }
-
-        /* zero length argument, skip */
-
-        if (src_string == p) {
-            if (include_empty) {
-                arg[0] = '\0';
-                if (PMIX_SUCCESS != pmix_argv_append_nosize(&argv, arg)) {
-                    return NULL;
-                }
-            }
-            src_string = p + 1;
-            continue;
-        }
-
-        /* tail argument, add straight from the original string */
-
-        else if ('\0' == *p) {
-            if (PMIX_SUCCESS != pmix_argv_append_nosize(&argv, src_string)) {
-                return NULL;
-            }
-            src_string = p;
-            continue;
-        }
-
-        /* long argument, malloc buffer, copy and add */
-
-        else if (arglen > 511) {
-            argtemp = (char *) malloc(arglen + 1);
-            if (NULL == argtemp)
-                return NULL;
-
-            pmix_strncpy(argtemp, src_string, arglen);
-            argtemp[arglen] = '\0';
-
-            if (PMIX_SUCCESS != pmix_argv_append_nosize(&argv, argtemp)) {
-                free(argtemp);
-                return NULL;
-            }
-
-            free(argtemp);
-        }
-
-        /* short argument, copy to buffer and add */
-
-        else {
-            pmix_strncpy(arg, src_string, arglen);
-            arg[arglen] = '\0';
-
-            if (PMIX_SUCCESS != pmix_argv_append_nosize(&argv, arg)) {
-                return NULL;
-            }
-        }
-
-        src_string = p + 1;
-    }
-
-    /* All done */
-
-    return argv;
-}
-
-static inline
-char **pmix_argv_split_with_empty(const char *src_string, int delimiter)
-{
-    return pmix_argv_split_inter(src_string, delimiter, true);
-}
-
-static inline
-char **pmix_argv_split(const char *src_string, int delimiter)
-{
-    return pmix_argv_split_inter(src_string, delimiter, false);
-}
-
-#define PMIX_ARGV_SPLIT(a, b, c) \
-    (a) = pmix_argv_split(b, c)
-
-static inline
-char *pmix_argv_join(char **argv, int delimiter)
-{
-    char **p;
-    char *pp;
-    char *str;
-    size_t str_len = 0;
-    size_t i;
-
-    /* Bozo case */
-
-    if (NULL == argv || NULL == argv[0]) {
-        return strdup("");
-    }
-
-    /* Find the total string length in argv including delimiters.  The
-     last delimiter is replaced by the NULL character. */
-
-    for (p = argv; *p; ++p) {
-        str_len += strlen(*p) + 1;
-    }
-
-    /* Allocate the string. */
-
-    if (NULL == (str = (char *) malloc(str_len)))
-        return NULL;
-
-    /* Loop filling in the string. */
-
-    str[--str_len] = '\0';
-    p = argv;
-    pp = *p;
-
-    for (i = 0; i < str_len; ++i) {
-        if ('\0' == *pp) {
-
-            /* End of a string, fill in a delimiter and go to the next
-             string. */
-
-            str[i] = (char) delimiter;
-            ++p;
-            pp = *p;
-        } else {
-            str[i] = *pp++;
-        }
-    }
-
-    /* All done */
-
-    return str;
-}
-
-#define PMIX_ARGV_JOIN(a, b, c) \
-    (a) = pmix_argv_join(b, c)
-
-static inline
-char **pmix_argv_copy(char **argv)
-{
-    char **dupv = NULL;
-
-    if (NULL == argv)
-        return NULL;
-
-    /* create an "empty" list, so that we return something valid if we
-     were passed a valid list with no contained elements */
-    dupv = (char **) malloc(sizeof(char *));
-    dupv[0] = NULL;
-
-    while (NULL != *argv) {
-        if (PMIX_SUCCESS != pmix_argv_append_nosize(&dupv, *argv)) {
-            PMIX_ARGV_FREE(dupv);
-            return NULL;
-        }
-
-        ++argv;
-    }
-
-    /* All done */
-
-    return dupv;
-}
-
-#define PMIX_ARGV_COPY(a, b) \
-    (a) = pmix_argv_copy(b)
-
-/**
- * Portable version of setenv(3), allowing editing of any
- * environ-like array.
- *
- * @param name String name of the environment variable to look for
- * @param value String value to set (may be NULL)
- * @param overwrite Whether to overwrite any existing value with
- * the same name
- * @param env The environment to use
- *
- * @retval PMIX_ERR_OUT_OF_RESOURCE If internal malloc() fails.
- * @retval PMIX_ERR_EXISTS If the name already exists in \em env and
- * \em overwrite is false (and therefore the \em value was not
- * saved in \em env)
- * @retval PMIX_SUCESS If the value replaced another value or is
- * appended to \em env.
- *
- * \em env is expected to be a NULL-terminated array of pointers
- * (argv-style).  Note that unlike some implementations of
- * putenv(3), if \em value is inserted in \em env, it is copied.
- * So the caller can modify/free both \em name and \em value after
- * pmix_setenv() returns.
- *
- * The \em env array will be grown if necessary.
- *
- * It is permissible to invoke this function with the
- * system-defined \em environ variable.  For example:
- *
- * \code
- *   #include "pmix_common.h"
- *   pmix_setenv("foo", "bar", true, &environ);
- * \endcode
- *
- * NOTE: If you use the real environ, pmix_setenv() will turn
- * around and perform setenv() to put the value in the
- * environment.  This may very well lead to a memory leak, so its
- * use is strongly discouraged.
- *
- * It is also permissible to call this function with an empty \em
- * env, as long as it is pre-initialized with NULL:
- *
- * \code
- *   char **my_env = NULL;
- *   pmix_setenv("foo", "bar", true, &my_env);
- * \endcode
- */
-static inline
-pmix_status_t pmix_setenv(const char *name,
-                          const char *value,
-                          bool overwrite,
-                          char ***env)
-{
-    int i;
-    char newvalue[100000], compare[100000];
-    size_t len;
-    bool valid;
-
-    /* Check the bozo case */
-    if (NULL == env) {
-        return PMIX_ERR_BAD_PARAM;
-    }
-
-    if (NULL != value) {
-        /* check the string for unacceptable length - i.e., ensure
-         * it is NULL-terminated */
-        valid = false;
-        for (i = 0; i < 100000; i++) {
-            if ('\0' == value[i]) {
-                valid = true;
-                break;
-            }
-        }
-        if (!valid) {
-            return PMIX_ERR_BAD_PARAM;
-        }
-    }
-
-    /* If this is the "environ" array, use setenv */
-    if (*env == environ) {
-        if (NULL == value) {
-            /* this is actually an unsetenv request */
-            unsetenv(name);
-        } else {
-            setenv(name, value, overwrite);
-        }
-        return PMIX_SUCCESS;
-    }
-
-    /* Make the new value */
-    if (NULL == value) {
-        snprintf(newvalue, 100000, "%s=", name);
-    } else {
-        snprintf(newvalue, 100000, "%s=%s", name, value);
-    }
-
-    if (NULL == *env) {
-        pmix_argv_append_nosize(env, newvalue);
-        return PMIX_SUCCESS;
-    }
-
-    /* Make something easy to compare to */
-
-    snprintf(compare, 100000, "%s=", name);
-    len = strlen(compare);
-
-    /* Look for a duplicate that's already set in the env */
-
-    for (i = 0; (*env)[i] != NULL; ++i) {
-        if (0 == strncmp((*env)[i], compare, len)) {
-            if (overwrite) {
-                free((*env)[i]);
-                (*env)[i] = strdup(newvalue);
-                return PMIX_SUCCESS;
-            } else {
-                return PMIX_ERR_EXISTS;
-            }
-        }
-    }
-
-    /* If we found no match, append this value */
-
-    pmix_argv_append_nosize(env, newvalue);
-
-    /* All done */
-    return PMIX_SUCCESS;
-}
-
-#define PMIX_SETENV(r, a, b, c) \
-    (r) = pmix_setenv((a), (b), true, (c))
-
 
 /****    PMIX COORD    ****/
 /* define coordinate system views */
@@ -2216,59 +1720,6 @@ typedef struct pmix_coord {
     .dims = 0                       \
 }
 
-#define PMIX_COORD_CREATE(m, d, n)                                              \
-    do {                                                                        \
-        pmix_coord_t *_m;                                                       \
-        if (0 == (d)) {                                                         \
-            (m) = NULL;                                                         \
-        } else {                                                                \
-            _m = (pmix_coord_t*)pmix_malloc((d) * sizeof(pmix_coord_t));        \
-            if (NULL != _m) {                                                   \
-                memset((m), 0, (d)*sizeof(pmix_coord_t));                       \
-                _m->view = PMIX_COORD_VIEW_UNDEF;                               \
-                _m->dims = (n);                                                 \
-                if (0 == (n)) {                                                 \
-                    _m->coord = NULL;                                           \
-                } else {                                                        \
-                    _m->coord = (uint32_t*)pmix_malloc((n) * sizeof(uint32_t)); \
-                    if (NULL != _m->coord) {                                    \
-                        memset(_m->coord, 0, (n)*sizeof(uint32_t));             \
-                    }                                                           \
-                }                                                               \
-            }                                                                   \
-            (m) = _m;                                                           \
-        }                                                                       \
-    } while(0)
-
-#define PMIX_COORD_CONSTRUCT(m)             \
-    do {                                    \
-        (m)->view = PMIX_COORD_VIEW_UNDEF;  \
-        (m)->coord = NULL;                  \
-        (m)->dims = 0;                      \
-    } while(0)
-
-#define PMIX_COORD_DESTRUCT(m)              \
-    do {                                    \
-        (m)->view = PMIX_COORD_VIEW_UNDEF;  \
-        if (NULL != (m)->coord) {           \
-            pmix_free((m)->coord);          \
-            (m)->coord = NULL;              \
-            (m)->dims = 0;                  \
-        }                                   \
-    } while(0)
-
-#define PMIX_COORD_FREE(m, n)                       \
-    do {                                            \
-        size_t _nc_;                                \
-        if (NULL != (m)) {                          \
-            for (_nc_ = 0; _nc_ < (n); _nc_++) {    \
-                PMIX_COORD_DESTRUCT(&(m)[_nc_]);    \
-            }                                       \
-            free((m));                              \
-            (m) = NULL;                             \
-        }                                           \
-    } while(0)
-
 
 /****    PMIX LINK STATES    ****/
 typedef uint8_t pmix_link_state_t;
@@ -2289,21 +1740,6 @@ typedef struct{
     .bitmap = NULL              \
 }
 
-#define PMIX_CPUSET_CONSTRUCT(m) \
-    memset((m), 0, sizeof(pmix_cpuset_t))
-
-#define PMIX_CPUSET_CREATE(m, n)    \
-    do {                                                                    \
-        if (0 == (n))   {                                                   \
-            (m) = NULL;                                                     \
-        } else {                                                            \
-            (m) = (pmix_cpuset_t*)pmix_malloc((n) * sizeof(pmix_cpuset_t)); \
-            if (NULL != (m)) {                                              \
-                memset((m), 0, (n) * sizeof(pmix_cpuset_t));                \
-            }                                                               \
-        }                                                                   \
-    } while(0)
-
 
 /****    PMIX BIND ENVELOPE    ****/
 typedef uint8_t pmix_bind_envelope_t;
@@ -2323,20 +1759,6 @@ typedef struct {
     .topology = NULL                \
 }
 
-#define PMIX_TOPOLOGY_CONSTRUCT(m) \
-    memset((m), 0, sizeof(pmix_topology_t))
-
-#define PMIX_TOPOLOGY_CREATE(m, n) \
-    do {                                                                        \
-        if (0 == (n)) {                                                         \
-            (m) = NULL;                                                         \
-        } else {                                                                \
-            (m) = (pmix_topology_t*)pmix_malloc((n) * sizeof(pmix_topology_t)); \
-            if (NULL != (m)) {                                                  \
-                memset((m), 0, (n) * sizeof(pmix_topology_t));                  \
-            }                                                                   \
-        }                                                                       \
-    } while(0)
 
 /**** PMIX RELATIVE LOCALITY    ****/
 typedef uint16_t pmix_locality_t;
@@ -2370,48 +1792,6 @@ typedef struct pmix_geometry {
     .ncoords = 0                    \
 }
 
-#define PMIX_GEOMETRY_CONSTRUCT(m) \
-    memset((m), 0, sizeof(pmix_geometry_t));
-
-#define PMIX_GEOMETRY_DESTRUCT(m)                               \
-    do {                                                        \
-        if (NULL != (m)->uuid) {                                \
-            free((m)->uuid);                                    \
-            (m)->uuid = NULL;                                   \
-        }                                                       \
-        if (NULL != (m)->osname) {                              \
-            free((m)->osname);                                  \
-            (m)->osname = NULL;                                 \
-        }                                                       \
-        if (NULL != (m)->coordinates) {                         \
-            PMIX_COORD_FREE((m)->coordinates, (m)->ncoords);    \
-        }                                                       \
-    } while(0)
-
-#define PMIX_GEOMETRY_CREATE(m, n)                                              \
-    do {                                                                        \
-        if (0 == (n)) {                                                         \
-            (m) = NULL;                                                         \
-        } else {                                                                \
-            (m) = (pmix_geometry_t*)pmix_malloc((n) * sizeof(pmix_geometry_t)); \
-            if (NULL != (m)) {                                                  \
-                memset((m), 0, (n) * sizeof(pmix_geometry_t));                  \
-            }                                                                   \
-        }                                                                       \
-    } while(0)
-
-#define PMIX_GEOMETRY_FREE(m, n)                    \
-    do {                                            \
-        size_t _i;                                  \
-        if (NULL != (m)) {                          \
-            for (_i=0; _i < (n); _i++) {            \
-                PMIX_GEOMETRY_DESTRUCT(&(m)[_i]);   \
-            }                                       \
-            pmix_free((m));                         \
-            (m) = NULL;                             \
-        }                                           \
-    } while(0)
-
 
 /****    PMIX_DEVICE_TYPE    ****/
 typedef uint64_t pmix_device_type_t;
@@ -2441,54 +1821,6 @@ typedef struct pmix_device_distance {
     .maxdist = 0                        \
 }
 
-#define PMIX_DEVICE_DIST_CONSTRUCT(m)                       \
-    do {                                                    \
-        memset((m), 0, sizeof(pmix_device_distance_t));     \
-        (m)->mindist = UINT16_MAX;                          \
-        (m)->maxdist = UINT16_MAX;                          \
-    } while(0);
-
-#define PMIX_DEVICE_DIST_DESTRUCT(m)    \
-    do {                                \
-        if (NULL != ((m)->uuid)) {      \
-            pmix_free((m)->uuid);       \
-        }                               \
-        if (NULL != ((m)->osname)) {    \
-            pmix_free((m)->osname);     \
-        }                               \
-    } while(0)
-
-#define PMIX_DEVICE_DIST_CREATE(m, n)                                                           \
-    do {                                                                                        \
-        size_t _i;                                                                              \
-        pmix_device_distance_t *_m;                                                             \
-        if (0 == (n)) {                                                                         \
-            (m) = NULL;                                                                         \
-        } else {                                                                                \
-            _m = (pmix_device_distance_t*)pmix_malloc((n) * sizeof(pmix_device_distance_t));    \
-            if (NULL != _m) {                                                                   \
-                memset(_m, 0, (n)*sizeof(pmix_device_distance_t));                              \
-                for (_i=0; _i < (n); _i++) {                                                    \
-                    _m[_i].mindist = UINT16_MAX;                                                \
-                    _m[_i].maxdist = UINT16_MAX;                                                \
-                }                                                                               \
-            }                                                                                   \
-            (m) = _m;                                                                           \
-        }                                                                                       \
-    } while(0)
-
-#define PMIX_DEVICE_DIST_FREE(m, n)                     \
-    do {                                                \
-        size_t _i;                                      \
-        if (NULL != (m)) {                              \
-            for (_i=0; _i < (n); _i++) {                \
-                PMIX_DEVICE_DIST_DESTRUCT(&(m)[_i]);    \
-            }                                           \
-            pmix_free((m));                             \
-            (m) = NULL;                                 \
-        }                                               \
-    } while(0)
-
 
 /****    PMIX BYTE OBJECT    ****/
 typedef struct pmix_byte_object {
@@ -2502,53 +1834,6 @@ typedef struct pmix_byte_object {
     .size = 0                           \
 }
 
-#define PMIX_BYTE_OBJECT_CREATE(m, n)                                                   \
-    do {                                                                                \
-        if (0 == (n)) {                                                                 \
-            (m) = NULL;                                                                 \
-        } else {                                                                        \
-            (m) = (pmix_byte_object_t*)pmix_malloc((n) * sizeof(pmix_byte_object_t));   \
-            if (NULL != (m)) {                                                          \
-                memset((m), 0, (n)*sizeof(pmix_byte_object_t));                         \
-            }                                                                           \
-        }                                                                               \
-    } while(0)
-
-#define PMIX_BYTE_OBJECT_CONSTRUCT(m)   \
-    do {                                \
-        (m)->bytes = NULL;              \
-        (m)->size = 0;                  \
-    } while(0)
-
-#define PMIX_BYTE_OBJECT_DESTRUCT(m)    \
-    do {                                \
-        if (NULL != (m)->bytes) {       \
-            pmix_free((m)->bytes);      \
-        }                               \
-        (m)->bytes = NULL;              \
-        (m)->size = 0;                  \
-    } while(0)
-
-#define PMIX_BYTE_OBJECT_FREE(m, n)                     \
-    do {                                                \
-        size_t _bon;                                    \
-        if (NULL != (m)) {                              \
-            for (_bon=0; _bon < n; _bon++) {            \
-                PMIX_BYTE_OBJECT_DESTRUCT(&(m)[_bon]);  \
-            }                                           \
-            pmix_free((m));                             \
-            (m) = NULL;                                 \
-        }                                               \
-    } while(0)
-
-#define PMIX_BYTE_OBJECT_LOAD(b, d, s)      \
-    do {                                    \
-        (b)->bytes = (char*)(d);            \
-        (d) = NULL;                         \
-        (b)->size = (s);                    \
-        (s) = 0;                            \
-    } while(0)
-
 
 /****    PMIX ENDPOINT    ****/
 typedef struct pmix_endpoint {
@@ -2564,47 +1849,6 @@ typedef struct pmix_endpoint {
     .endpt = PMIX_BYTE_OBJECT_STATIC_INIT   \
 }
 
-#define PMIX_ENDPOINT_CONSTRUCT(m)      \
-    memset((m), 0, sizeof(pmix_endpoint_t))
-
-#define PMIX_ENDPOINT_DESTRUCT(m)       \
-    do {                                \
-        if (NULL != (m)->uuid) {        \
-            free((m)->uuid);            \
-        }                               \
-        if (NULL != (m)->osname) {      \
-            free((m)->osname);          \
-        }                               \
-        if (NULL != (m)->endpt.bytes) { \
-            free((m)->endpt.bytes);     \
-        }                               \
-    } while(0)
-
-#define PMIX_ENDPOINT_CREATE(m, n)                                              \
-    do {                                                                        \
-        if (0 == (n)) {                                                         \
-            (m) = NULL;                                                         \
-        } else {                                                                \
-            (m) = (pmix_endpoint_t*)pmix_malloc((n) * sizeof(pmix_endpoint_t)); \
-            if (NULL != (m)) {                                                  \
-                memset((m), 0, (n) * sizeof(pmix_endpoint_t));                  \
-            }                                                                   \
-        }                                                                       \
-    } while(0)
-
-#define PMIX_ENDPOINT_FREE(m, n)                    \
-    do {                                            \
-        size_t _n;                                  \
-        if (NULL != (m)) {                          \
-            for (_n=0; _n < (n); _n++) {            \
-                PMIX_ENDPOINT_DESTRUCT(&((m)[_n])); \
-            }                                       \
-            free((m));                              \
-            (m) = NULL;                             \
-        }                                           \
-    } while(0)
-
-
 
 /****    PMIX ENVAR STRUCT   ****/
 /* Provide a structure for specifying environment variable modifications
@@ -2628,57 +1872,24 @@ typedef struct {
     .separator = '\0'           \
 }
 
-#define PMIX_ENVAR_CREATE(m, n)                                             \
-    do {                                                                    \
-        if (0 == (n)) {                                                     \
-            (m) = NULL;                                                     \
-        } else {                                                            \
-            (m) = (pmix_envar_t*)pmix_malloc((n) * sizeof(pmix_envar_t));   \
-            if (NULL != (m)) {                                              \
-                memset((m), 0, (n) * sizeof(pmix_envar_t));                 \
-            }                                                               \
-        }                                                                   \
-    } while (0)
-#define PMIX_ENVAR_FREE(m, n)                       \
-    do {                                            \
-        size_t _ek;                                 \
-        if (NULL != (m)) {                          \
-            for (_ek=0; _ek < (n); _ek++) {         \
-               PMIX_ENVAR_DESTRUCT(&(m)[_ek]);      \
-            }                                       \
-            pmix_free((m));                         \
-        }                                           \
-    } while (0)
-#define PMIX_ENVAR_CONSTRUCT(m)        \
-    do {                               \
-        (m)->envar = NULL;             \
-        (m)->value = NULL;             \
-        (m)->separator = '\0';         \
-    } while(0)
-#define PMIX_ENVAR_DESTRUCT(m)         \
-    do {                               \
-        if (NULL != (m)->envar) {      \
-            pmix_free((m)->envar);     \
-            (m)->envar = NULL;         \
-        }                              \
-        if (NULL != (m)->value) {      \
-            pmix_free((m)->value);      \
-            (m)->value = NULL;         \
-        }                              \
-    } while(0)
-#define PMIX_ENVAR_LOAD(m, e, v, s)    \
-    do {                               \
-        if (NULL != (e)) {             \
-            (m)->envar = strdup(e);    \
-        }                              \
-        if (NULL != (v)) {             \
-            (m)->value = strdup(v);    \
-        }                              \
-        (m)->separator = (s);          \
-    } while(0)
 
+/****    PMIX DATA BUFFER    ****/
+typedef struct pmix_data_buffer {
+    /** Start of my memory */
+    char *base_ptr;
+    /** Where the next data will be packed to (within the allocated
+        memory starting at base_ptr) */
+    char *pack_ptr;
+    /** Where the next data will be unpacked from (within the
+        allocated memory starting as base_ptr) */
+    char *unpack_ptr;
+    /** Number of bytes allocated (starting at base_ptr) */
+    size_t bytes_allocated;
+    /** Number of bytes used by the buffer (i.e., amount of data --
+        including overhead -- packed in the buffer) */
+    size_t bytes_used;
+} pmix_data_buffer_t;
 
-/****    PMIX DATA BUFFER MACROS   ****/
 #define PMIX_DATA_BUFFER_STATIC_INIT    \
 {                                       \
     .base_ptr = NULL,                   \
@@ -2687,55 +1898,7 @@ typedef struct {
     .bytes_allocated = 0,               \
     .bytes_used = 0                     \
 }
-#define PMIX_DATA_BUFFER_CREATE(m)                                          \
-    do {                                                                    \
-        (m) = (pmix_data_buffer_t*)pmix_malloc(sizeof(pmix_data_buffer_t)); \
-        if (NULL != (m)) {                                                  \
-            memset((m), 0, sizeof(pmix_data_buffer_t));                     \
-        }                                                                   \
-    } while (0)
-#define PMIX_DATA_BUFFER_RELEASE(m)             \
-    do {                                        \
-        if (NULL != (m)->base_ptr) {            \
-            pmix_free((m)->base_ptr);           \
-        }                                       \
-        pmix_free((m));                         \
-        (m) = NULL;                             \
-    } while (0)
-#define PMIX_DATA_BUFFER_CONSTRUCT(m)       \
-    memset((m), 0, sizeof(pmix_data_buffer_t))
-#define PMIX_DATA_BUFFER_DESTRUCT(m)        \
-    do {                                    \
-        if (NULL != (m)->base_ptr) {        \
-            pmix_free((m)->base_ptr);       \
-            (m)->base_ptr = NULL;           \
-        }                                   \
-        (m)->pack_ptr = NULL;               \
-        (m)->unpack_ptr = NULL;             \
-        (m)->bytes_allocated = 0;           \
-        (m)->bytes_used = 0;                \
-    } while (0)
-#define PMIX_DATA_BUFFER_LOAD(b, d, s)  \
-    do {                                \
-        pmix_byte_object_t _bo;         \
-        _bo.bytes = (char*)(d);         \
-        _bo.size = (s);                 \
-        PMIx_Data_load((b), &_bo);      \
-    } while(0)
 
-#define PMIX_DATA_BUFFER_UNLOAD(b, d, s)    \
-    do {                                    \
-        pmix_byte_object_t _bo;             \
-        pmix_status_t _r;                   \
-        _r = PMIx_Data_unload((b), &_bo);   \
-        if (PMIX_SUCCESS == _r) {           \
-            (d) = _bo.bytes;                \
-            (s) = _bo.size;                 \
-        } else {                            \
-            (d) = NULL;                     \
-            (s) = 0;                        \
-        }                                   \
-    } while(0)
 
 /****    PMIX PROC OBJECT    ****/
 typedef struct pmix_proc {
@@ -2749,72 +1912,6 @@ typedef struct pmix_proc {
     .rank = PMIX_RANK_UNDEF     \
 }
 
-#define PMIX_PROC_CREATE(m, n)                                          \
-    do {                                                                \
-        if (0 == (n)) {                                                 \
-            (m) = NULL;                                                 \
-        } else {                                                        \
-            (m) = (pmix_proc_t*)pmix_malloc((n) * sizeof(pmix_proc_t)); \
-            if (NULL != (m)) {                                          \
-                memset((m), 0, (n) * sizeof(pmix_proc_t));              \
-            }                                                           \
-        }                                                               \
-    } while (0)
-
-#define PMIX_PROC_RELEASE(m)    \
-    do {                        \
-        pmix_free((m));         \
-        (m) = NULL;             \
-    } while (0)
-
-#define PMIX_PROC_CONSTRUCT(m)                  \
-    do {                                        \
-        memset((m), 0, sizeof(pmix_proc_t));    \
-    } while (0)
-
-#define PMIX_PROC_DESTRUCT(m)
-
-#define PMIX_PROC_FREE(m, n)                    \
-    do {                                        \
-        if (NULL != (m)) {                      \
-            pmix_free((m));                     \
-            (m) = NULL;                         \
-        }                                       \
-    } while (0)
-
-#define PMIX_PROC_LOAD(m, n, r)                                 \
-    do {                                                        \
-        PMIX_PROC_CONSTRUCT((m));                               \
-        pmix_strncpy((char*)(m)->nspace, (n), PMIX_MAX_NSLEN);  \
-        (m)->rank = (r);                                        \
-    } while(0)
-
-#define PMIX_MULTICLUSTER_NSPACE_CONSTRUCT(t, c, n)                         \
-    do {                                                                    \
-        size_t _len;                                                        \
-        memset((t), 0, PMIX_MAX_NSLEN+1);                                   \
-        _len = pmix_nslen((c));                                             \
-        if ((_len + pmix_nslen((n))) < PMIX_MAX_NSLEN) {                    \
-            pmix_strncpy((char*)(t), (c), PMIX_MAX_NSLEN);                  \
-            (t)[_len] = ':';                                                \
-            pmix_strncpy((char*)&(t)[_len+1], (n), PMIX_MAX_NSLEN - _len);  \
-        }                                                                   \
-    } while(0)
-
-#define PMIX_MULTICLUSTER_NSPACE_PARSE(t, c, n)             \
-    do {                                                    \
-        size_t _n, _j;                                      \
-        for (_n=0; '\0' != (t)[_n] && ':' != (t)[_n] &&     \
-             _n <= PMIX_MAX_NSLEN; _n++) {                  \
-            (c)[_n] = (t)[_n];                              \
-        }                                                   \
-        _n++;                                               \
-        for (_j=0; _n <= PMIX_MAX_NSLEN &&                  \
-             '\0' != (t)[_n]; _n++, _j++) {                 \
-            (n)[_j] = (t)[_n];                              \
-        }                                                   \
-    } while(0)
-
 
 /****    PMIX PROC INFO STRUCT    ****/
 typedef struct pmix_proc_info {
@@ -2836,50 +1933,6 @@ typedef struct pmix_proc_info {
     .state = PMIX_PROC_STATE_UNDEF  \
 }
 
-#define PMIX_PROC_INFO_CREATE(m, n)                                                 \
-    do {                                                                            \
-        if (0 == (n)) {                                                             \
-            (m) = NULL;                                                             \
-        } else {                                                                    \
-            (m) = (pmix_proc_info_t*)pmix_malloc((n) * sizeof(pmix_proc_info_t));   \
-            if (NULL != (m)) {                                                      \
-                memset((m), 0, (n) * sizeof(pmix_proc_info_t));                     \
-            }                                                                       \
-        }                                                                           \
-    } while (0)
-
-#define PMIX_PROC_INFO_RELEASE(m)      \
-    do {                               \
-        PMIX_PROC_INFO_FREE((m), 1);   \
-    } while (0)
-
-#define PMIX_PROC_INFO_CONSTRUCT(m)                 \
-    do {                                            \
-        memset((m), 0, sizeof(pmix_proc_info_t));   \
-    } while (0)
-
-#define PMIX_PROC_INFO_DESTRUCT(m)              \
-    do {                                        \
-        if (NULL != (m)->hostname) {            \
-            pmix_free((m)->hostname);           \
-            (m)->hostname = NULL;               \
-        }                                       \
-        if (NULL != (m)->executable_name) {     \
-            pmix_free((m)->executable_name);    \
-            (m)->executable_name = NULL;        \
-        }                                       \
-    } while(0)
-
-#define PMIX_PROC_INFO_FREE(m, n)                   \
-    do {                                            \
-        size_t _k;                                  \
-        if (NULL != (m)) {                          \
-            for (_k=0; _k < (n); _k++) {            \
-                PMIX_PROC_INFO_DESTRUCT(&(m)[_k]);  \
-            }                                       \
-            pmix_free((m));                         \
-        }                                           \
-    } while (0)
 
 
 /****    PMIX DATA ARRAY STRUCT    ****/
@@ -2897,42 +1950,11 @@ typedef struct pmix_data_array {
     .array = NULL                       \
 }
 
-/**** THE PMIX_DATA_ARRAY SUPPORT MACROS ARE DEFINED ****/
-/**** DOWN BELOW (NEAR THE BOTTOM OF THE FILE) TO    ****/
-/**** AVOID CIRCULAR DEPENDENCIES                    ****/
-
-
 /* we cannot forward-declare the pmix_regattr_t struct
  * as Cython doesn't know what to do with it. Thus, we
  * will utilize the void* entry of the pmix_value_t to
  * hold the pointer to pmix_regattr_t */
 
-/****    PMIX DATA BUFFER    ****/
-typedef struct pmix_data_buffer {
-    /** Start of my memory */
-    char *base_ptr;
-    /** Where the next data will be packed to (within the allocated
-        memory starting at base_ptr) */
-    char *pack_ptr;
-    /** Where the next data will be unpacked from (within the
-        allocated memory starting as base_ptr) */
-    char *unpack_ptr;
-    /** Number of bytes allocated (starting at base_ptr) */
-    size_t bytes_allocated;
-    /** Number of bytes used by the buffer (i.e., amount of data --
-        including overhead -- packed in the buffer) */
-    size_t bytes_used;
-} pmix_data_buffer_t;
-
-#define PMIX_DATA_BUFFER_STATIC_INIT    \
-{                                       \
-    .base_ptr = NULL,                   \
-    .pack_ptr = NULL,                   \
-    .unpack_ptr = NULL,                 \
-    .bytes_allocated = 0,               \
-    .bytes_used = 0                     \
-}
-
 /****   STATISTICS STRUCTURES  ****/
 typedef struct pmix_proc_stats {
     /* process ident info */
@@ -2974,57 +1996,6 @@ typedef struct pmix_proc_stats {
     .sample_time = {0, 0}               \
 }
 
-#define PMIX_PROC_STATS_CREATE(m, n)                                                \
-    do {                                                                            \
-        if (0 == (n)) {                                                             \
-            (m) = NULL;                                                             \
-        } else {                                                                    \
-            (m) = (pmix_proc_stats_t*)pmix_malloc((n) * sizeof(pmix_proc_stats_t)); \
-            if (NULL != (m)) {                                                      \
-                memset((m), 0, (n) * sizeof(pmix_proc_stats_t));                    \
-            }                                                                       \
-        }                                                                           \
-    } while (0)
-
-#define PMIX_PROC_STATS_RELEASE(m)      \
-    do {                                \
-        PMIX_PROC_STATS_FREE((m), 1);   \
-    } while (0)
-
-#define PMIX_PROC_STATS_CONSTRUCT(m)                \
-    do {                                            \
-        memset((m), 0, sizeof(pmix_proc_stats_t));  \
-    } while (0)
-
-#define PMIX_PROC_STATS_DESTRUCT(m)     \
-    do {                                \
-        if (NULL != (m)->node) {        \
-            pmix_free((m)->node);       \
-            (m)->node = NULL;           \
-        }                               \
-        if (NULL != (m)->cmd) {         \
-            pmix_free((m)->cmd);        \
-            (m)->cmd = NULL;            \
-        }                               \
-    } while(0)
-
-static inline void pmix_proc_stats_free(pmix_proc_stats_t *ps, size_t n)
-{
-    size_t k;
-
-    if (NULL != ps) {
-        for (k=0; k < n; k++) {
-            PMIX_PROC_STATS_DESTRUCT(&ps[k]);
-        }
-    }
-}
-
-#define PMIX_PROC_STATS_FREE(m, n)  \
-do {                                \
-    pmix_proc_stats_free(m, n);     \
-    pmix_free(m);                   \
-    (m) = NULL;                     \
-} while(0)
 
 typedef struct {
     char *disk;
@@ -3057,53 +2028,6 @@ typedef struct {
     .weighted_milliseconds_io = 0       \
 }
 
-#define PMIX_DISK_STATS_CREATE(m, n)                                                \
-    do {                                                                            \
-        if (0 == (n)) {                                                             \
-            (m) = NULL;                                                             \
-        } else {                                                                    \
-            (m) = (pmix_disk_stats_t*)pmix_malloc((n) * sizeof(pmix_disk_stats_t)); \
-            if (NULL != (m)) {                                                      \
-                memset((m), 0, (n) * sizeof(pmix_disk_stats_t));                    \
-            }                                                                       \
-        }                                                                           \
-    } while (0)
-
-#define PMIX_DISK_STATS_RELEASE(m)      \
-    do {                                \
-        PMIX_DISK_STATS_FREE((m), 1);   \
-    } while (0)
-
-#define PMIX_DISK_STATS_CONSTRUCT(m)                \
-    do {                                            \
-        memset((m), 0, sizeof(pmix_disk_stats_t));  \
-    } while (0)
-
-#define PMIX_DISK_STATS_DESTRUCT(m)     \
-    do {                                \
-        if (NULL != (m)->disk) {        \
-            pmix_free((m)->disk);       \
-            (m)->disk = NULL;           \
-        }                               \
-    } while(0)
-
-static inline void pmix_disk_stats_free(pmix_disk_stats_t *d, size_t n)
-{
-    size_t k;
-
-    if (NULL != d) {
-        for (k=0; k < n; k++) {
-            PMIX_DISK_STATS_DESTRUCT(&d[k]);
-        }
-    }
-}
-
-#define PMIX_DISK_STATS_FREE(m, n)  \
-do {                                \
-    pmix_disk_stats_free(m, n);     \
-    pmix_free(m);                   \
-    (m) = NULL;                     \
-} while(0)
 
 typedef struct {
     char *net_interface;
@@ -3126,53 +2050,6 @@ typedef struct {
     .num_send_errs = 0              \
 }
 
-#define PMIX_NET_STATS_CREATE(m, n)                                                 \
-    do {                                                                            \
-        if (0 == (n)) {                                                             \
-            (m) = NULL;                                                             \
-        } else {                                                                    \
-            (m) = (pmix_net_stats_t*)pmix_malloc((n) * sizeof(pmix_net_stats_t));   \
-            if (NULL != (m)) {                                                      \
-                memset((m), 0, (n) * sizeof(pmix_net_stats_t));                     \
-            }                                                                       \
-        }                                                                           \
-    } while (0)
-
-#define PMIX_NET_STATS_RELEASE(m)       \
-    do {                                \
-        PMIX_NET_STATS_FREE((m), 1);    \
-    } while (0)
-
-#define PMIX_NET_STATS_CONSTRUCT(m)                 \
-    do {                                            \
-        memset((m), 0, sizeof(pmix_net_stats_t));   \
-    } while (0)
-
-#define PMIX_NET_STATS_DESTRUCT(m)          \
-    do {                                    \
-        if (NULL != (m)->net_interface) {   \
-            pmix_free((m)->net_interface);  \
-            (m)->net_interface = NULL;      \
-        }                                   \
-    } while(0)
-
-static inline void pmix_net_stats_free(pmix_net_stats_t *nst, size_t n)
-{
-    size_t k;
-
-    if (NULL != nst) {
-        for (k=0; k < n; k++) {
-            PMIX_NET_STATS_DESTRUCT(&nst[k]);
-        }
-    }
-}
-
-#define PMIX_NET_STATS_FREE(m, n)   \
-do {                                \
-    pmix_net_stats_free(m, n);      \
-    pmix_free(m);                   \
-    (m) = NULL;                     \
-} while(0)
 
 typedef struct {
     char *node;
@@ -3220,58 +2097,6 @@ typedef struct {
     .nnetstats = 0                      \
 }
 
-#define PMIX_NODE_STATS_CREATE(m, n)                                                \
-    do {                                                                            \
-        if (0 == (n)) {                                                             \
-            (m) = NULL;                                                             \
-        } else {                                                                    \
-            (m) = (pmix_node_stats_t*)pmix_malloc((n) * sizeof(pmix_node_stats_t)); \
-            if (NULL != (m)) {                                                      \
-                memset((m), 0, (n) * sizeof(pmix_node_stats_t));                    \
-            }                                                                       \
-        }                                                                           \
-    } while (0)
-
-#define PMIX_NODE_STATS_CONSTRUCT(m)                \
-    do {                                            \
-        memset((m), 0, sizeof(pmix_node_stats_t));  \
-    } while (0)
-
-#define PMIX_NODE_STATS_DESTRUCT(m)                                 \
-    do {                                                            \
-        if (NULL != (m)->node) {                                    \
-            pmix_free((m)->node);                                   \
-            (m)->node = NULL;                                       \
-        }                                                           \
-        if (NULL != (m)->diskstats) {                               \
-            PMIX_DISK_STATS_FREE((m)->diskstats, (m)->ndiskstats);  \
-        }                                                           \
-        if (NULL != (m)->netstats) {                                \
-            PMIX_NET_STATS_FREE((m)->netstats, (m)->nnetstats);     \
-        }                                                           \
-    } while(0)
-
-static inline void pmix_node_stats_free(pmix_node_stats_t *nd, size_t n)
-{
-    size_t k;
-
-    if (NULL != nd) {
-        for (k=0; k < n; k++) {
-            PMIX_NODE_STATS_DESTRUCT(&nd[k]);
-        }
-    }
-}
-
-#define PMIX_NODE_STATS_FREE(m, n)  \
-do {                                \
-    pmix_node_stats_free(m, n);     \
-    pmix_free(m);                   \
-    (m) = NULL;                     \
-} while(0)
-
-#define PMIX_NODE_STATS_RELEASE(m)  \
-    pmix_node_stats_free(m, 1)
-
 
 /****    PMIX VALUE STRUCT    ****/
 
@@ -3340,26 +2165,6 @@ typedef struct pmix_value {
     .data.ptr = NULL            \
 }
 
-/* allocate and initialize a specified number of value structs */
-#define PMIX_VALUE_CREATE(m, n)                                             \
-    do {                                                                    \
-        if (0 == (n)) {                                                     \
-            (m) = NULL;                                                     \
-        } else {                                                            \
-            (m) = (pmix_value_t*)pmix_malloc((n) * sizeof(pmix_value_t));   \
-            if (NULL != (m)) {                                              \
-                memset((m), 0, (n)*sizeof(pmix_value_t));                   \
-            }                                                               \
-        }                                                                   \
-    } while (0)
-
-/* initialize a single value struct */
-#define PMIX_VALUE_CONSTRUCT(m)                 \
-    do {                                        \
-        memset((m), 0, sizeof(pmix_value_t));   \
-        (m)->type = PMIX_UNDEF;                 \
-    } while (0)
-
 #define PMIX_VALUE_GET_NUMBER(s, m, n, t)               \
     do {                                                \
         (s) = PMIX_SUCCESS;                             \
@@ -3393,6 +2198,8 @@ typedef struct pmix_value {
             (n) = (t)((m)->data.pid);                   \
         } else if (PMIX_PROC_RANK == (m)->type) {       \
             (n) = (t)((m)->data.rank);                  \
+        } else if (PMIX_STATUS == (m)->type) {          \
+            (n) = (t)((m)->data.status);                \
         } else {                                        \
             (s) = PMIX_ERR_BAD_PARAM;                   \
         }                                               \
@@ -3412,64 +2219,6 @@ typedef struct pmix_info {
     .value = PMIX_VALUE_STATIC_INIT \
 }
 
-/* utility macros for working with pmix_info_t structs */
-#define PMIX_INFO_CONSTRUCT(m)                  \
-    do {                                        \
-        memset((m), 0, sizeof(pmix_info_t));    \
-        (m)->value.type = PMIX_UNDEF;           \
-    } while (0)
-
-#define PMIX_INFO_CREATE(m, n)                                          \
-    do {                                                                \
-        pmix_info_t *_i;                                                \
-        if (0 == (n)) {                                                 \
-            (m) = NULL;                                                 \
-        } else {                                                        \
-            (m) = (pmix_info_t*)pmix_malloc((n) * sizeof(pmix_info_t)); \
-            if (NULL != (m)) {                                          \
-                _i = (pmix_info_t*)(m);                                 \
-                memset((m), 0, (n) * sizeof(pmix_info_t));              \
-                _i[(n)-1].flags = PMIX_INFO_ARRAY_END;                  \
-            }                                                           \
-        }                                                               \
-    } while (0)
-
-/* macros for setting and unsetting the "reqd" flag
- * in a pmix_info_t */
-#define PMIX_INFO_REQUIRED(m)       \
-    ((m)->flags |= PMIX_INFO_REQD)
-#define PMIX_INFO_OPTIONAL(m)       \
-    ((m)->flags &= ~PMIX_INFO_REQD)
-
-/* macros for testing the "reqd" flag in a pmix_info_t */
-#define PMIX_INFO_IS_REQUIRED(m)    \
-    ((m)->flags & PMIX_INFO_REQD)
-#define PMIX_INFO_IS_OPTIONAL(m)    \
-    !((m)->flags & PMIX_INFO_REQD)
-
-/* macros for setting and testing the "reqd processed" flag */
-#define PMIX_INFO_PROCESSED(m)  \
-    ((m)->flags |= PMIX_INFO_REQD_PROCESSED)
-#define PMIX_INFO_WAS_PROCESSED(m)  \
-    ((m)->flags & PMIX_INFO_REQD_PROCESSED)
-
-/* macro for testing end of the array */
-#define PMIX_INFO_SET_END(m)    \
-    ((m)->flags |= PMIX_INFO_ARRAY_END)
-#define PMIX_INFO_IS_END(m)         \
-    ((m)->flags & PMIX_INFO_ARRAY_END)
-
-/* macro for testing if qualifier */
-#define PMIX_INFO_SET_QUALIFIER(i)   \
-    ((i)->flags |= PMIX_INFO_QUALIFIER)
-#define PMIX_INFO_IS_QUALIFIER(i)    \
-    ((i)->flags & PMIX_INFO_QUALIFIER)
-
-/* macro for setting and testing the "donot release" flag */
-#define PMIX_INFO_SET_PERSISTENT(ii) \
-    ((ii)->flags |= PMIX_INFO_PERSISTENT)
-#define PMIX_INFO_IS_PERSISTENT(ii)  \
-    ((ii)->flags & PMIX_INFO_PERSISTENT)
 
 typedef enum {
     PMIX_BOOL_TRUE,
@@ -3477,74 +2226,6 @@ typedef enum {
     PMIX_NON_BOOL
 } pmix_boolean_t;
 
-/**
- * Provide a check to see if a value is "true" or
- * "false", whether given as a string or boolean
- * input.
- */
-static inline pmix_boolean_t pmix_check_true(const pmix_value_t *value)
-{
-    char *ptr;
-
-    if (PMIX_UNDEF == value->type) {
-        return PMIX_BOOL_TRUE; // default to true
-    }
-    if (PMIX_BOOL == value->type) {
-        if (value->data.flag) {
-            return PMIX_BOOL_TRUE;
-        } else {
-            return PMIX_BOOL_FALSE;
-        }
-    }
-    if (PMIX_STRING == value->type) {
-        if (NULL == value->data.string) {
-            return PMIX_BOOL_TRUE;
-        }
-        ptr = value->data.string;
-        /* Trim leading whitespace */
-        while (isspace(*ptr)) {
-            ++ptr;
-        }
-        if ('\0' == *ptr) {
-            return PMIX_BOOL_TRUE;
-        }
-        if (isdigit(*ptr)) {
-            if (0 == atoi(ptr)) {
-                return PMIX_BOOL_FALSE;
-            } else {
-                return PMIX_BOOL_TRUE;
-            }
-        } else if (0 == strncasecmp(ptr, "yes", 3) ||
-                   0 == strncasecmp(ptr, "true", 4)) {
-            return PMIX_BOOL_TRUE;
-        } else if (0 == strncasecmp(ptr, "no", 2) ||
-                   0 == strncasecmp(ptr, "false", 5)) {
-            return PMIX_BOOL_FALSE;
-        }
-    }
-
-    return PMIX_NON_BOOL;
-}
-
-/* provide a macro version of it for those preferring
- * that syntax in their codes where they know the
- * value being checked IS a boolean of some form
- */
-#define PMIX_CHECK_TRUE(a) \
-    (PMIX_BOOL_TRUE == pmix_check_true(a) ? true : false)
-
-#define PMIX_CHECK_BOOL(a) \
-    (PMIX_NON_BOOL == pmix_check_true(a) ? false : true)
-
-/* define a special macro for checking if a boolean
- * info is true - when info structs are provided, a
- * type of PMIX_UNDEF is taken to imply a boolean "true"
- * as the presence of the key defaults to indicating
- * "true". Also supports passing of string representations
- * such as "t" or "f" */
-#define PMIX_INFO_TRUE(m)   \
-    (PMIX_BOOL_TRUE == pmix_check_true(&(m)->value) ? true : false)
-
 
 /****    PMIX LOOKUP RETURN STRUCT    ****/
 typedef struct pmix_pdata {
@@ -3560,24 +2241,6 @@ typedef struct pmix_pdata {
     .value = PMIX_VALUE_STATIC_INIT \
 }
 
-/* utility macros for working with pmix_pdata_t structs */
-#define PMIX_PDATA_CREATE(m, n)                                             \
-    do {                                                                    \
-        if (0 == (n)) {                                                     \
-            (m) = NULL;                                                     \
-        } else {                                                            \
-            (m) = (pmix_pdata_t*)pmix_malloc((n) * sizeof(pmix_pdata_t));   \
-            if (NULL != (m)) {                                              \
-                memset((m), 0, (n) * sizeof(pmix_pdata_t));                 \
-            }                                                               \
-        }                                                                   \
-    } while (0)
-
-#define PMIX_PDATA_CONSTRUCT(m)                 \
-    do {                                        \
-        memset((m), 0, sizeof(pmix_pdata_t));   \
-        (m)->value.type = PMIX_UNDEF;           \
-    } while (0)
 
 
 /****    PMIX APP STRUCT    ****/
@@ -3602,37 +2265,6 @@ typedef struct pmix_app {
     .ninfo = 0                  \
 }
 
-/* utility macros for working with pmix_app_t structs */
-#define PMIX_APP_CREATE(m, n)                                           \
-    do {                                                                \
-        if (0 == (n)) {                                                 \
-            (m) = NULL;                                                 \
-        } else {                                                        \
-            (m) = (pmix_app_t*)pmix_malloc((n) * sizeof(pmix_app_t));   \
-            if (NULL != (m)) {                                          \
-                memset((m), 0, (n) * sizeof(pmix_app_t));               \
-            }                                                           \
-        }                                                               \
-    } while (0)
-
-#define PMIX_APP_INFO_CREATE(m, n)                  \
-    do {                                            \
-        (m)->ninfo = (n);                           \
-        PMIX_INFO_CREATE((m)->info, (m)->ninfo);    \
-    } while(0)
-
-#define PMIX_APP_RELEASE(m)                     \
-    do {                                        \
-        PMIX_APP_DESTRUCT((m));                 \
-        pmix_free((m));                         \
-        (m) = NULL;                             \
-    } while (0)
-
-#define PMIX_APP_CONSTRUCT(m)                   \
-    do {                                        \
-        memset((m), 0, sizeof(pmix_app_t));     \
-    } while (0)
-
 
 /****    PMIX QUERY STRUCT    ****/
 typedef struct pmix_query {
@@ -3648,65 +2280,6 @@ typedef struct pmix_query {
     .nqual = 0                  \
 }
 
-/* utility macros for working with pmix_query_t structs */
-#define PMIX_QUERY_CREATE(m, n)                                             \
-    do {                                                                    \
-        if (0 == (n)) {                                                     \
-            (m) = NULL;                                                     \
-        } else {                                                            \
-            (m) = (pmix_query_t*)pmix_malloc((n) * sizeof(pmix_query_t));   \
-            if (NULL != (m)) {                                              \
-                memset((m), 0, (n) * sizeof(pmix_query_t));                 \
-            }                                                               \
-        }                                                                   \
-    } while (0)
-
-#define PMIX_QUERY_QUALIFIERS_CREATE(m, n)                  \
-    do {                                                    \
-        (m)->nqual = (n);                                   \
-        PMIX_INFO_CREATE((m)->qualifiers, (m)->nqual);      \
-    } while(0)
-
-#define PMIX_QUERY_RELEASE(m)       \
-    do {                            \
-        PMIX_QUERY_DESTRUCT((m));   \
-        pmix_free((m));             \
-        (m) = NULL;                 \
-    } while (0)
-
-#define PMIX_QUERY_CONSTRUCT(m)                 \
-    do {                                        \
-        memset((m), 0, sizeof(pmix_query_t));   \
-    } while (0)
-
-#define PMIX_QUERY_DESTRUCT(m)                                  \
-    do {                                                        \
-        size_t _qi;                                             \
-        if (NULL != (m)->keys) {                                \
-            for (_qi=0; NULL != (m)->keys[_qi]; _qi++) {        \
-                pmix_free((m)->keys[_qi]);                      \
-            }                                                   \
-            pmix_free((m)->keys);                               \
-            (m)->keys = NULL;                                   \
-        }                                                       \
-        if (NULL != (m)->qualifiers) {                          \
-            PMIX_INFO_FREE((m)->qualifiers, (m)->nqual);        \
-            (m)->qualifiers = NULL;                             \
-            (m)->nqual = 0;                                     \
-        }                                                       \
-    } while (0)
-
-#define PMIX_QUERY_FREE(m, n)                       \
-    do {                                            \
-        size_t _qs;                                 \
-        if (NULL != (m)) {                          \
-            for (_qs=0; _qs < (n); _qs++) {         \
-                PMIX_QUERY_DESTRUCT(&((m)[_qs]));   \
-            }                                       \
-            pmix_free((m));                         \
-            (m) = NULL;                             \
-        }                                           \
-    } while (0)
 
 /****    ATTRIBUTE REGISTRATION STRUCT   ****/
 typedef struct pmix_regattr_t {
@@ -3724,82 +2297,6 @@ typedef struct pmix_regattr_t {
     .description = NULL             \
 }
 
-#define PMIX_REGATTR_CONSTRUCT(a)                       \
-    do {                                                \
-        if (NULL != (a)) {                              \
-            (a)->name = NULL;                           \
-            memset((a)->string, 0, PMIX_MAX_KEYLEN+1);  \
-            (a)->type = PMIX_UNDEF;                     \
-            (a)->description = NULL;                    \
-        }                                               \
-    } while(0)
-
-#define PMIX_REGATTR_LOAD(a, n, k, t, v)                        \
-    do {                                                        \
-        pmix_status_t _rgl;                                     \
-        if (NULL != (n)) {                                      \
-            (a)->name = strdup((n));                            \
-        }                                                       \
-        if (NULL != (k)) {                                      \
-            PMIX_LOAD_KEY((a)->string, (k));                    \
-        }                                                       \
-        (a)->type = (t);                                        \
-        if (NULL != (v)) {                                      \
-            PMIX_ARGV_APPEND(_rgl, &(a)->description, (v));     \
-        }                                                       \
-    } while(0)
-
-#define PMIX_REGATTR_DESTRUCT(a)                    \
-    do {                                            \
-        if (NULL != (a)) {                          \
-            if (NULL != (a)->name) {                \
-                pmix_free((a)->name);               \
-            }                                       \
-            if (NULL != (a)->description) {         \
-                PMIX_ARGV_FREE((a)->description);   \
-            }                                       \
-        }                                           \
-    } while(0)
-
-#define PMIX_REGATTR_CREATE(m, n)                                               \
-    do {                                                                        \
-        if (0 == (n)) {                                                         \
-            (m) = NULL;                                                         \
-        } else {                                                                \
-            (m) = (pmix_regattr_t*)pmix_malloc((n) * sizeof(pmix_regattr_t));   \
-            if (NULL != (m)) {                                                  \
-                memset((m), 0, (n) * sizeof(pmix_regattr_t));                   \
-            }                                                                   \
-        }                                                                       \
-    } while (0)
-
-#define PMIX_REGATTR_FREE(m, n)                         \
-    do {                                                \
-        size_t _ra;                                     \
-        pmix_regattr_t *_m = (pmix_regattr_t*)(m);      \
-        if (NULL != (m)) {                              \
-            for (_ra=0; _ra < (n); _ra++) {             \
-                PMIX_REGATTR_DESTRUCT(&((_m)[_ra]));    \
-            }                                           \
-            pmix_free((m));                             \
-            (m) = NULL;                                 \
-        }                                               \
-    } while (0)
-
-#define PMIX_REGATTR_XFER(a, b)                                         \
-    do {                                                                \
-        size_t _n;                                                      \
-        PMIX_REGATTR_CONSTRUCT((a));                                    \
-        if (NULL != ((b)->name)) {                                      \
-            (a)->name = strdup((b)->name);                              \
-        }                                                               \
-        PMIX_LOAD_KEY((a)->string, (b)->string);                        \
-        (a)->type = (b)->type;                                          \
-        if (NULL != (b)->description) {                                 \
-            PMIX_ARGV_COPY((a)->description, (b)->description);         \
-        }                                                               \
-    } while(0)
-
 
 /****    FABRIC STRUCT    ****/
 /* Define a pmix_fabric_t struct for
@@ -3835,6 +2332,7 @@ typedef enum {
     PMIX_FABRIC_UPDATE_INFO
 } pmix_fabric_operation_t;
 
+
 /****    CALLBACK FUNCTIONS FOR NON-BLOCKING OPERATIONS    ****/
 
 typedef void (*pmix_release_cbfunc_t)(void *cbdata);
@@ -4070,288 +2568,9 @@ typedef void (*pmix_device_dist_cbfunc_t)(pmix_status_t status,
 
 
 
-#define PMIX_DATA_ARRAY_INIT(m, t)      \
-    do {                                \
-        (m)->array = NULL;              \
-        (m)->type = (t);                \
-        (m)->size = 0;                  \
-    } while(0)
-
-#define PMIX_DATA_ARRAY_CONSTRUCT(m, n, t)                                  \
-    do {                                                                    \
-        (m)->type = (t);                                                    \
-        (m)->size = (n);                                                    \
-        if (0 < (n)) {                                                      \
-            if (PMIX_INFO == (t)) {                                         \
-                PMIX_INFO_CREATE((m)->array, (n));                          \
-                                                                            \
-            } else if (PMIX_PROC == (t)) {                                  \
-                PMIX_PROC_CREATE((m)->array, (n));                          \
-                                                                            \
-            } else if (PMIX_PROC_INFO == (t)) {                             \
-                PMIX_PROC_INFO_CREATE((m)->array, (n));                     \
-                                                                            \
-            } else if (PMIX_ENVAR == (t)) {                                 \
-                PMIX_ENVAR_CREATE((m)->array, (n));                         \
-                                                                            \
-            } else if (PMIX_VALUE == (t)) {                                 \
-                PMIX_VALUE_CREATE((m)->array, (n));                         \
-                                                                            \
-            } else if (PMIX_PDATA == (t)) {                                 \
-                PMIX_PDATA_CREATE((m)->array, (n));                         \
-                                                                            \
-            } else if (PMIX_QUERY == (t)) {                                 \
-                PMIX_QUERY_CREATE((m)->array, (n));                         \
-                                                                            \
-            } else if (PMIX_APP == (t)) {                                   \
-                PMIX_APP_CREATE((m)->array, (n));                           \
-                                                                            \
-            } else if (PMIX_BYTE_OBJECT == (t) ||                           \
-                       PMIX_COMPRESSED_STRING == (t)) {                     \
-                PMIX_BYTE_OBJECT_CREATE((m)->array, (n));                   \
-                                                                            \
-            } else if (PMIX_ALLOC_DIRECTIVE == (t) ||                       \
-                       PMIX_PROC_STATE == (t) ||                            \
-                       PMIX_PERSIST == (t) ||                               \
-                       PMIX_SCOPE == (t) ||                                 \
-                       PMIX_DATA_RANGE == (t) ||                            \
-                       PMIX_BYTE == (t) ||                                  \
-                       PMIX_INT8 == (t) ||                                  \
-                       PMIX_UINT8 == (t) ||                                 \
-                       PMIX_POINTER == (t)) {                               \
-                (m)->array = pmix_calloc((n), sizeof(int8_t));              \
-                                                                            \
-            } else if (PMIX_STRING == (t)) {                                \
-                (m)->array = pmix_calloc((n), sizeof(char*));               \
-                                                                            \
-            } else if (PMIX_SIZE == (t)) {                                  \
-                (m)->array = pmix_calloc((n), sizeof(size_t));              \
-                                                                            \
-            } else if (PMIX_PID == (t)) {                                   \
-                (m)->array = pmix_calloc((n), sizeof(pid_t));               \
-                                                                            \
-            } else if (PMIX_INT == (t) ||                                   \
-                       PMIX_UINT == (t) ||                                  \
-                       PMIX_STATUS == (t)) {                                \
-                (m)->array = pmix_calloc((n), sizeof(int));                 \
-                                                                            \
-            } else if (PMIX_IOF_CHANNEL == (t) ||                           \
-                       PMIX_DATA_TYPE == (t) ||                             \
-                       PMIX_INT16 == (t) ||                                 \
-                       PMIX_UINT16 == (t)) {                                \
-                (m)->array = pmix_calloc((n), sizeof(int16_t));             \
-                                                                            \
-            } else if (PMIX_PROC_RANK == (t) ||                             \
-                       PMIX_INFO_DIRECTIVES == (t) ||                       \
-                       PMIX_INT32 == (t) ||                                 \
-                       PMIX_UINT32 == (t)) {                                \
-                (m)->array = pmix_calloc((n), sizeof(int32_t));             \
-                                                                            \
-            } else if (PMIX_INT64 == (t) ||                                 \
-                       PMIX_UINT64 == (t)) {                                \
-                (m)->array = pmix_calloc((n), sizeof(int64_t));             \
-                                                                            \
-            } else if (PMIX_FLOAT == (t)) {                                 \
-                (m)->array = pmix_calloc((n), sizeof(float));               \
-                                                                            \
-            } else if (PMIX_DOUBLE == (t)) {                                \
-                (m)->array = pmix_calloc((n), sizeof(double));              \
-                                                                            \
-            } else if (PMIX_TIMEVAL == (t)) {                               \
-                (m)->array = pmix_calloc((n), sizeof(struct timeval));      \
-                                                                            \
-            } else if (PMIX_TIME == (t)) {                                  \
-                (m)->array = pmix_calloc((n), sizeof(time_t));              \
-                                                                            \
-            } else if (PMIX_REGATTR == (t)) {                               \
-                PMIX_REGATTR_CREATE((m)->array, (n));                       \
-                                                                            \
-            } else if (PMIX_BOOL == (t)) {                                  \
-                (m)->array = pmix_calloc((n), sizeof(bool));                \
-                                                                            \
-            } else if (PMIX_COORD == (t)) {                                 \
-                (m)->array = pmix_calloc((n), sizeof(pmix_coord_t));        \
-                                                                            \
-            } else if (PMIX_LINK_STATE == (t)) {                            \
-                (m)->array = pmix_calloc((n), sizeof(pmix_link_state_t));   \
-                                                                            \
-            } else if (PMIX_ENDPOINT == (t)) {                              \
-                PMIX_ENDPOINT_CREATE((m)->array, n);                        \
-                                                                            \
-            } else if (PMIX_PROC_NSPACE == (t)) {                           \
-                (m)->array = pmix_calloc((n), sizeof(pmix_nspace_t));       \
-                                                                            \
-            } else if (PMIX_PROC_STATS == (t)) {                            \
-                PMIX_PROC_STATS_CREATE((m)->array, n);                      \
-                                                                            \
-            } else if (PMIX_DISK_STATS == (t)) {                            \
-                PMIX_DISK_STATS_CREATE((m)->array, n);                      \
-                                                                            \
-            } else if (PMIX_NET_STATS == (t)) {                             \
-                PMIX_NET_STATS_CREATE((m)->array, n);                       \
-                                                                            \
-            } else if (PMIX_NODE_STATS == (t)) {                            \
-                PMIX_NODE_STATS_CREATE((m)->array, n);                      \
-                                                                            \
-            } else if (PMIX_DEVICE_DIST == (t)) {                           \
-                PMIX_DEVICE_DIST_CREATE((m)->array, n);                     \
-                                                                            \
-            } else if (PMIX_GEOMETRY == (t)) {                              \
-                PMIX_GEOMETRY_CREATE((m)->array, n);                        \
-                                                                            \
-            } else if (PMIX_REGATTR == (t)) {                               \
-                PMIX_REGATTR_CREATE((m)->array, n);                         \
-                                                                            \
-            } else if (PMIX_PROC_CPUSET == (t)) {                           \
-                PMIX_CPUSET_CREATE((m)->array, n);                          \
-            } else {                                                        \
-                (m)->array = NULL;                                          \
-                (m)->size = 0;                                              \
-            }                                                               \
-        } else {                                                            \
-            (m)->array = NULL;                                              \
-        }                                                                   \
-    } while(0)
-#define PMIX_DATA_ARRAY_CREATE(m, n, t)                                     \
-    do {                                                                    \
-        (m) = (pmix_data_array_t*)pmix_malloc(sizeof(pmix_data_array_t));   \
-        if (NULL != (m)) {                                                  \
-            memset((m), 0, sizeof(pmix_data_array_t));                      \
-            PMIX_DATA_ARRAY_CONSTRUCT((m), (n), (t));                       \
-        }                                                                   \
-    } while(0)
 
 #include <pmix_deprecated.h>
 
-/********    STANDARD MACROS FOR DARRAY AND VALUE SUPPORT     ********/
-
-/* release the memory in the value struct data field */
-#define PMIX_VALUE_DESTRUCT(m) PMIx_Value_destruct(m)
-
-/* release a single pmix_value_t struct, including its data */
-#define PMIX_VALUE_RELEASE(m)       \
-    do {                            \
-        PMIX_VALUE_DESTRUCT((m));   \
-        pmix_free((m));             \
-        (m) = NULL;                 \
-    } while (0)
-
-#define PMIX_VALUE_FREE(m, n)                           \
-    do {                                                \
-        size_t _vv;                                     \
-        if (NULL != (m)) {                              \
-            for (_vv=0; _vv < (n); _vv++) {             \
-                PMIX_VALUE_DESTRUCT(&((m)[_vv]));       \
-            }                                           \
-            pmix_free((m));                             \
-            (m) = NULL;                                 \
-        }                                               \
-    } while (0)
-
-#define PMIX_INFO_DESTRUCT(m)                   \
-    do {                                        \
-        if (!PMIX_INFO_IS_PERSISTENT((m))) {    \
-            PMIX_VALUE_DESTRUCT(&(m)->value);   \
-        }                                       \
-    } while (0)
-
-#define PMIX_INFO_FREE(m, n)                        \
-    do {                                            \
-        size_t _is;                                 \
-        if (NULL != (m)) {                          \
-            for (_is=0; _is < (n); _is++) {         \
-                PMIX_INFO_DESTRUCT(&((m)[_is]));    \
-            }                                       \
-            pmix_free((m));                         \
-            (m) = NULL;                             \
-        }                                           \
-    } while (0)
-
-#define PMIX_APP_DESTRUCT(m)                                    \
-    do {                                                        \
-        if (NULL != (m)->cmd) {                                 \
-            pmix_free((m)->cmd);                                \
-            (m)->cmd = NULL;                                    \
-        }                                                       \
-        if (NULL != (m)->argv) {                                \
-            pmix_argv_free((m)->argv);                          \
-            (m)->argv = NULL;                                   \
-        }                                                       \
-        if (NULL != (m)->env) {                                 \
-            pmix_argv_free((m)->env);                           \
-            (m)->env = NULL;                                    \
-        }                                                       \
-        if (NULL != (m)->cwd) {                                 \
-            pmix_free((m)->cwd);                                \
-            (m)->cwd = NULL;                                    \
-        }                                                       \
-        if (NULL != (m)->info) {                                \
-            PMIX_INFO_FREE((m)->info, (m)->ninfo);              \
-            (m)->info = NULL;                                   \
-            (m)->ninfo = 0;                                     \
-        }                                                       \
-    } while (0)
-
-static inline void pmix_app_free(pmix_app_t *ap, size_t n)
-{
-    size_t k;
-
-    if (NULL != ap) {
-        for (k=0; k < n; k++) {
-            PMIX_APP_DESTRUCT(&ap[k]);
-        }
-    }
-}
-
-#define PMIX_APP_FREE(m, n)     \
-    do {                        \
-        pmix_app_free(m, n);    \
-        pmix_free(m);           \
-        (m) = NULL;             \
-    } while (0)
-
-
-#define PMIX_DATA_ARRAY_DESTRUCT(m) PMIx_Data_array_destruct(m)
-
-#define PMIX_DATA_ARRAY_FREE(m)             \
-    do {                                    \
-        if (NULL != (m)) {                  \
-            PMIX_DATA_ARRAY_DESTRUCT(m);    \
-            pmix_free((m));                 \
-            (m) = NULL;                     \
-        }                                   \
-    } while(0)
-
-#define PMIX_PDATA_RELEASE(m)                   \
-    do {                                        \
-        PMIX_VALUE_DESTRUCT(&(m)->value);       \
-        pmix_free((m));                         \
-        (m) = NULL;                             \
-    } while (0)
-
-#define PMIX_PDATA_DESTRUCT(m)                  \
-    do {                                        \
-        PMIX_VALUE_DESTRUCT(&(m)->value);       \
-    } while (0)
-
-static inline void pmix_pdata_free(pmix_pdata_t *pd, size_t n)
-{
-    size_t k;
-
-    if (NULL != pd) {
-        for (k=0; k < n; k++) {
-            PMIX_PDATA_DESTRUCT(&pd[k]);
-        }
-    }
-}
-
-#define PMIX_PDATA_FREE(m, n)   \
-do {                            \
-    pmix_pdata_free(m, n);      \
-    pmix_free(m);               \
-    (m) = NULL;                 \
-} while(0)
-
 #if defined(c_plusplus) || defined(__cplusplus)
 }
 #endif
diff --git a/deps/pmix/include/pmix_deprecated.h b/deps/pmix/include/pmix_deprecated.h
index ddefa8f82..254dfb270 100644
--- a/deps/pmix/include/pmix_deprecated.h
+++ b/deps/pmix/include/pmix_deprecated.h
@@ -4,7 +4,8 @@
  *                         All rights reserved.
  * Copyright (c) 2015      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
- * Copyright (c) 2021-2022 Nanook Consulting  All rights reserved.
+ * Copyright (c) 2021-2023 Nanook Consulting.  All rights reserved.
+ * Copyright (c) 2023      Triad National Security, LLC. All rights reserved.
  * $COPYRIGHT$
  *
  * Redistribution and use in source and binary forms, with or without
@@ -72,7 +73,7 @@ PMIX_EXPORT pmix_status_t PMIx_tool_connect_to_server(pmix_proc_t *proc,
 /* DATATYPES */
 #define PMIX_BUFFER                     26
 
-/* CONSTANTS */
+/****    PMIX ERROR CONSTANTS    ****/
 #define PMIX_ERR_SILENT                             -2
 #define PMIX_ERR_DEBUGGER_RELEASE                   -3
 #define PMIX_ERR_PROC_ABORTED                       -7
@@ -203,30 +204,244 @@ PMIX_EXPORT pmix_status_t PMIx_tool_connect_to_server(pmix_proc_t *proc,
                                                                     //        its namespace in the event
 
 
+/* DUPLICATES */
+
 /* Bring some function definitions across from pmix.h for now-deprecated
  * macros that utilize them. We have to do this as there are people who
  * only included pmix_common.h if they were using macros but not APIs */
 
+PMIX_EXPORT void PMIx_Load_key(pmix_key_t key, const char *src);
+PMIX_EXPORT bool PMIx_Check_key(const char *key, const char *str);
+PMIX_EXPORT bool PMIx_Check_reserved_key(const char *key);
+PMIX_EXPORT void PMIx_Load_nspace(pmix_nspace_t nspace, const char *str);
+PMIX_EXPORT bool PMIx_Check_nspace(const char *key1, const char *key2);
+PMIX_EXPORT bool PMIx_Nspace_invalid(const char *nspace);
+PMIX_EXPORT void PMIx_Load_procid(pmix_proc_t *p,
+                                  const char *ns,
+                                  pmix_rank_t rk);
+PMIX_EXPORT void PMIx_Xfer_procid(pmix_proc_t *dst,
+                                  const pmix_proc_t *src);
+PMIX_EXPORT bool PMIx_Check_procid(const pmix_proc_t *a,
+                                   const pmix_proc_t *b);
+PMIX_EXPORT bool PMIx_Check_rank(pmix_rank_t a,
+                                 pmix_rank_t b);
+PMIX_EXPORT bool PMIx_Procid_invalid(const pmix_proc_t *p);
+
+PMIX_EXPORT int PMIx_Argv_count(char **a);
+PMIX_EXPORT pmix_status_t PMIx_Argv_append_nosize(char ***argv, const char *arg);
+PMIX_EXPORT pmix_status_t PMIx_Argv_prepend_nosize(char ***argv, const char *arg);
+PMIX_EXPORT pmix_status_t PMIx_Argv_append_unique_nosize(char ***argv, const char *arg);
+PMIX_EXPORT void PMIx_Argv_free(char **argv);
+PMIX_EXPORT char **PMIx_Argv_split_inter(const char *src_string,
+                                         int delimiter,
+                                         bool include_empty);
+PMIX_EXPORT char **PMIx_Argv_split_with_empty(const char *src_string, int delimiter);
+PMIX_EXPORT char **PMIx_Argv_split(const char *src_string, int delimiter);
+PMIX_EXPORT char *PMIx_Argv_join(char **argv, int delimiter);
+PMIX_EXPORT char **PMIx_Argv_copy(char **argv);
+PMIX_EXPORT pmix_status_t PMIx_Setenv(const char *name,
+                                      const char *value,
+                                      bool overwrite,
+                                      char ***env);
+
+
+PMIX_EXPORT void PMIx_Value_construct(pmix_value_t *val);
+PMIX_EXPORT void PMIx_Value_destruct(pmix_value_t *val);
+PMIX_EXPORT pmix_value_t* PMIx_Value_create(size_t n);
+PMIX_EXPORT void PMIx_Value_free(pmix_value_t *v, size_t n);
+PMIX_EXPORT pmix_boolean_t PMIx_Value_true(const pmix_value_t *v);
 PMIX_EXPORT pmix_status_t PMIx_Value_load(pmix_value_t *val,
                                           const void *data,
                                           pmix_data_type_t type);
 PMIX_EXPORT pmix_status_t PMIx_Value_unload(pmix_value_t *val,
                                             void **data,
                                             size_t *sz);
-PMIX_EXPORT void PMIx_Value_destruct(pmix_value_t *val);
 PMIX_EXPORT pmix_status_t PMIx_Value_xfer(pmix_value_t *dest,
                                           const pmix_value_t *src);
 PMIX_EXPORT pmix_value_cmp_t PMIx_Value_compare(pmix_value_t *v1,
                                                 pmix_value_t *v2);
-PMIX_EXPORT void PMIx_Data_array_destruct(pmix_data_array_t *d);
 
+
+PMIX_EXPORT void PMIx_Info_construct(pmix_info_t *p);
+PMIX_EXPORT void PMIx_Info_destruct(pmix_info_t *p);
+PMIX_EXPORT pmix_info_t* PMIx_Info_create(size_t n);
+PMIX_EXPORT void PMIx_Info_free(pmix_info_t *p, size_t n);
+PMIX_EXPORT pmix_boolean_t PMIx_Info_true(const pmix_info_t *p);
 PMIX_EXPORT pmix_status_t PMIx_Info_load(pmix_info_t *info,
                                          const char *key,
                                          const void *data,
                                          pmix_data_type_t type);
-
 PMIX_EXPORT pmix_status_t PMIx_Info_xfer(pmix_info_t *dest,
                                          const pmix_info_t *src);
+PMIX_EXPORT void PMIx_Info_required(pmix_info_t *p);
+PMIX_EXPORT void PMIx_Info_optional(pmix_info_t *p);
+PMIX_EXPORT bool PMIx_Info_is_required(const pmix_info_t *p);
+PMIX_EXPORT bool PMIx_Info_is_optional(const pmix_info_t *p);
+PMIX_EXPORT void PMIx_Info_processed(pmix_info_t *p);
+PMIX_EXPORT bool PMIx_Info_was_processed(const pmix_info_t *p);
+PMIX_EXPORT void PMIx_Info_set_end(pmix_info_t *p);
+PMIX_EXPORT bool PMIx_Info_is_end(const pmix_info_t *p);
+PMIX_EXPORT void PMIx_Info_qualifier(pmix_info_t *p);
+PMIX_EXPORT bool PMIx_Info_is_qualifier(const pmix_info_t *p);
+PMIX_EXPORT void PMIx_Info_persistent(pmix_info_t *p);
+PMIX_EXPORT bool PMIx_Info_is_persistent(const pmix_info_t *p);
+
+
+PMIX_EXPORT void PMIx_Coord_construct(pmix_coord_t *m);
+PMIX_EXPORT void PMIx_Coord_destruct(pmix_coord_t *m);
+PMIX_EXPORT pmix_coord_t* PMIx_Coord_create(size_t dims,
+                                            size_t number);
+PMIX_EXPORT void PMIx_Coord_free(pmix_coord_t *m, size_t number);
+
+
+PMIX_EXPORT void PMIx_Topology_construct(pmix_topology_t *t);
+PMIX_EXPORT void PMIx_Topology_destruct(pmix_topology_t *topo);
+PMIX_EXPORT pmix_topology_t* PMIx_Topology_create(size_t n);
+PMIX_EXPORT void PMIx_Topology_free(pmix_topology_t *t, size_t n);
+
+
+PMIX_EXPORT void PMIx_Cpuset_construct(pmix_cpuset_t *c);
+PMIX_EXPORT void PMIx_Cpuset_destruct(pmix_cpuset_t *c);
+PMIX_EXPORT pmix_cpuset_t* PMIx_Cpuset_create(size_t n);
+PMIX_EXPORT void PMIx_Cpuset_free(pmix_cpuset_t *c, size_t n);
+
+
+PMIX_EXPORT void PMIx_Geometry_construct(pmix_geometry_t *g);
+PMIX_EXPORT void PMIx_Geometry_destruct(pmix_geometry_t *g);
+PMIX_EXPORT pmix_geometry_t* PMIx_Geometry_create(size_t n);
+PMIX_EXPORT void PMIx_Geometry_free(pmix_geometry_t *g, size_t n);
+
+
+PMIX_EXPORT void PMIx_Device_distance_construct(pmix_device_distance_t *d);
+PMIX_EXPORT void PMIx_Device_distance_destruct(pmix_device_distance_t *d);
+PMIX_EXPORT pmix_device_distance_t* PMIx_Device_distance_create(size_t n);
+PMIX_EXPORT void PMIx_Device_distance_free(pmix_device_distance_t *d, size_t n);
+
+
+PMIX_EXPORT void PMIx_Byte_object_construct(pmix_byte_object_t *b);
+PMIX_EXPORT void PMIx_Byte_object_destruct(pmix_byte_object_t *b);
+PMIX_EXPORT pmix_byte_object_t* PMIx_Byte_object_create(size_t n);
+PMIX_EXPORT void PMIx_Byte_object_free(pmix_byte_object_t *b, size_t n);
+PMIX_EXPORT void PMIx_Byte_object_load(pmix_byte_object_t *b,
+                                       char *d, size_t sz);
+
+
+PMIX_EXPORT void PMIx_Endpoint_construct(pmix_endpoint_t *e);
+PMIX_EXPORT void PMIx_Endpoint_destruct(pmix_endpoint_t *e);
+PMIX_EXPORT pmix_endpoint_t* PMIx_Endpoint_create(size_t n);
+PMIX_EXPORT void PMIx_Endpoint_free(pmix_endpoint_t *e, size_t n);
+
+
+PMIX_EXPORT void PMIx_Envar_construct(pmix_envar_t *e);
+PMIX_EXPORT void PMIx_Envar_destruct(pmix_envar_t *e);
+PMIX_EXPORT pmix_envar_t* PMIx_Envar_create(size_t n);
+PMIX_EXPORT void PMIx_Envar_free(pmix_envar_t *e, size_t n);
+PMIX_EXPORT void PMIx_Envar_load(pmix_envar_t *e,
+                                 char *var,
+                                 char *value,
+                                 char separator);
+
+
+PMIX_EXPORT void PMIx_Data_buffer_construct(pmix_data_buffer_t *b);
+PMIX_EXPORT void PMIx_Data_buffer_destruct(pmix_data_buffer_t *b);
+PMIX_EXPORT pmix_data_buffer_t* PMIx_Data_buffer_create(void);
+PMIX_EXPORT void PMIx_Data_buffer_release(pmix_data_buffer_t *b);
+PMIX_EXPORT void PMIx_Data_buffer_load(pmix_data_buffer_t *b,
+                                       char *bytes, size_t sz);
+PMIX_EXPORT void PMIx_Data_buffer_unload(pmix_data_buffer_t *b,
+                                         char **bytes, size_t *sz);
+
+
+PMIX_EXPORT void PMIx_Proc_construct(pmix_proc_t *p);
+PMIX_EXPORT void PMIx_Proc_destruct(pmix_proc_t *p);
+PMIX_EXPORT pmix_proc_t* PMIx_Proc_create(size_t n);
+PMIX_EXPORT void PMIx_Proc_free(pmix_proc_t *p, size_t n);
+PMIX_EXPORT void PMIx_Proc_load(pmix_proc_t *p,
+                                const char *nspace, pmix_rank_t rank);
+PMIX_EXPORT void PMIx_Multicluster_nspace_construct(pmix_nspace_t target,
+                                                    pmix_nspace_t cluster,
+                                                    pmix_nspace_t nspace);
+PMIX_EXPORT void PMIx_Multicluster_nspace_parse(pmix_nspace_t target,
+                                                pmix_nspace_t cluster,
+                                                pmix_nspace_t nspace);
+
+
+PMIX_EXPORT void PMIx_Proc_info_construct(pmix_proc_info_t *p);
+PMIX_EXPORT void PMIx_Proc_info_destruct(pmix_proc_info_t *p);
+PMIX_EXPORT pmix_proc_info_t* PMIx_Proc_info_create(size_t n);
+PMIX_EXPORT void PMIx_Proc_info_free(pmix_proc_info_t *p, size_t n);
+
+
+PMIX_EXPORT void PMIx_Proc_stats_construct(pmix_proc_stats_t *p);
+PMIX_EXPORT void PMIx_Proc_stats_destruct(pmix_proc_stats_t *p);
+PMIX_EXPORT pmix_proc_stats_t* PMIx_Proc_stats_create(size_t n);
+PMIX_EXPORT void PMIx_Proc_stats_free(pmix_proc_stats_t *p, size_t n);
+
+
+PMIX_EXPORT void PMIx_Disk_stats_construct(pmix_disk_stats_t *p);
+PMIX_EXPORT void PMIx_Disk_stats_destruct(pmix_disk_stats_t *p);
+PMIX_EXPORT pmix_disk_stats_t* PMIx_Disk_stats_create(size_t n);
+PMIX_EXPORT void PMIx_Disk_stats_free(pmix_disk_stats_t *p, size_t n);
+
+
+PMIX_EXPORT void PMIx_Net_stats_construct(pmix_net_stats_t *p);
+PMIX_EXPORT void PMIx_Net_stats_destruct(pmix_net_stats_t *p);
+PMIX_EXPORT pmix_net_stats_t* PMIx_Net_stats_create(size_t n);
+PMIX_EXPORT void PMIx_Net_stats_free(pmix_net_stats_t *p, size_t n);
+
+
+PMIX_EXPORT void PMIx_Node_stats_construct(pmix_node_stats_t *p);
+PMIX_EXPORT void PMIx_Node_stats_destruct(pmix_node_stats_t *p);
+PMIX_EXPORT pmix_node_stats_t* PMIx_Node_stats_create(size_t n);
+PMIX_EXPORT void PMIx_Node_stats_free(pmix_node_stats_t *p, size_t n);
+
+
+PMIX_EXPORT void PMIx_Pdata_construct(pmix_pdata_t *p);
+PMIX_EXPORT void PMIx_Pdata_destruct(pmix_pdata_t *p);
+PMIX_EXPORT pmix_pdata_t* PMIx_Pdata_create(size_t n);
+PMIX_EXPORT void PMIx_Pdata_free(pmix_pdata_t *p, size_t n);
+
+
+PMIX_EXPORT void PMIx_App_construct(pmix_app_t *p);
+PMIX_EXPORT void PMIx_App_destruct(pmix_app_t *p);
+PMIX_EXPORT pmix_app_t* PMIx_App_create(size_t n);
+PMIX_EXPORT void PMIx_App_info_create(pmix_app_t *p, size_t n);
+PMIX_EXPORT void PMIx_App_free(pmix_app_t *p, size_t n);
+PMIX_EXPORT void PMIx_App_release(pmix_app_t *p);
+
+
+PMIX_EXPORT void PMIx_Query_construct(pmix_query_t *p);
+PMIX_EXPORT void PMIx_Query_destruct(pmix_query_t *p);
+PMIX_EXPORT pmix_query_t* PMIx_Query_create(size_t n);
+PMIX_EXPORT void PMIx_Query_qualifiers_create(pmix_query_t *p, size_t n);
+PMIX_EXPORT void PMIx_Query_free(pmix_query_t *p, size_t n);
+PMIX_EXPORT void PMIx_Query_release(pmix_query_t *p);
+
+
+PMIX_EXPORT void PMIx_Regattr_construct(pmix_regattr_t *p);
+PMIX_EXPORT void PMIx_Regattr_destruct(pmix_regattr_t *p);
+PMIX_EXPORT pmix_regattr_t* PMIx_Regattr_create(size_t n);
+PMIX_EXPORT void PMIx_Regattr_free(pmix_regattr_t *p, size_t n);
+PMIX_EXPORT void PMIx_Regattr_load(pmix_regattr_t *info,
+                                   const char *name,
+                                   const char *key,
+                                   pmix_data_type_t type,
+                                   const char *description);
+PMIX_EXPORT void PMIx_Regattr_xfer(pmix_regattr_t *dest,
+                                   const pmix_regattr_t *src);
+
+
+PMIX_EXPORT void PMIx_Fabric_construct(pmix_regattr_t *p);
+
+
+PMIX_EXPORT void PMIx_Data_array_init(pmix_data_array_t *p,
+                                      pmix_data_type_t type);
+PMIX_EXPORT void PMIx_Data_array_construct(pmix_data_array_t *p,
+                                           size_t num, pmix_data_type_t type);
+PMIX_EXPORT void PMIx_Data_array_destruct(pmix_data_array_t *d);
+PMIX_EXPORT pmix_data_array_t* PMIx_Data_array_create(size_t n, pmix_data_type_t type);
+PMIX_EXPORT void PMIx_Data_array_free(pmix_data_array_t *p);
+
 
 PMIX_EXPORT void* PMIx_Info_list_start(void);
 
@@ -235,6 +450,11 @@ PMIX_EXPORT pmix_status_t PMIx_Info_list_add(void *ptr,
                                              const void *value,
                                              pmix_data_type_t type);
 
+PMIX_EXPORT pmix_status_t PMIx_Info_list_prepend(void *ptr,
+                                                 const char *key,
+                                                 const void *value,
+                                                 pmix_data_type_t type);
+
 PMIX_EXPORT pmix_status_t PMIx_Info_list_insert(void *ptr, pmix_info_t *info);
 
 PMIX_EXPORT pmix_status_t PMIx_Info_list_xfer(void *ptr,
@@ -244,6 +464,106 @@ PMIX_EXPORT pmix_status_t PMIx_Info_list_convert(void *ptr, pmix_data_array_t *p
 
 PMIX_EXPORT void PMIx_Info_list_release(void *ptr);
 
+PMIX_EXPORT pmix_info_t* PMIx_Info_list_get_info(void *ptr, void *prev, void **next);
+
+/* Macros that have been converted to functions */
+
+#define PMIX_LOAD_KEY(a, b) \
+    PMIx_Load_key(a, b)
+
+#define PMIX_CHECK_KEY(a, b) \
+    PMIx_Check_key((a)->key, b)
+
+#define PMIX_CHECK_RESERVED_KEY(a) \
+    PMIx_Check_reserved_key(a)
+
+#define PMIX_LOAD_NSPACE(a, b) \
+    PMIx_Load_nspace(a, b)
+
+/* define a convenience macro for checking nspaces */
+#define PMIX_CHECK_NSPACE(a, b) \
+    PMIx_Check_nspace(a, b)
+
+#define PMIX_NSPACE_INVALID(a) \
+    PMIx_Nspace_invalid(a)
+
+#define PMIX_LOAD_PROCID(a, b, c) \
+    PMIx_Load_procid(a, b, c)
+
+#define PMIX_XFER_PROCID(a, b) \
+    PMIx_Xfer_procid(a, b)
+
+#define PMIX_PROCID_XFER(a, b) PMIX_XFER_PROCID(a, b)
+
+#define PMIX_CHECK_PROCID(a, b) \
+    PMIx_Check_procid(a, b)
+
+#define PMIX_CHECK_RANK(a, b) \
+    PMIx_Check_rank(a, b)
+
+#define PMIX_PROCID_INVALID(a)  \
+    PMIx_Procid_invalid(a)
+
+#define PMIX_ARGV_COUNT(r, a) \
+    (r) = PMIx_Argv_count(a)
+
+#define PMIX_ARGV_APPEND(r, a, b) \
+    (r) = PMIx_Argv_append_nosize(&(a), (b))
+
+#define PMIX_ARGV_PREPEND(r, a, b) \
+    (r) = PMIx_Argv_prepend_nosize(&(a), b)
+
+#define PMIX_ARGV_APPEND_UNIQUE(r, a, b) \
+    (r) = PMIx_Argv_append_unique_nosize(a, b)
+
+// free(a) is called inside PMIx_Argv_free().
+#define PMIX_ARGV_FREE(a)    \
+    do {                     \
+        PMIx_Argv_free((a)); \
+        (a) = NULL;          \
+    } while (0)
+
+#define PMIX_ARGV_SPLIT(a, b, c) \
+    (a) = PMIx_Argv_split(b, c)
+
+#define PMIX_ARGV_JOIN(a, b, c) \
+    (a) = PMIx_Argv_join(b, c)
+
+#define PMIX_ARGV_COPY(a, b) \
+    (a) = PMIx_Argv_copy(b)
+
+#define PMIX_SETENV(r, a, b, c) \
+    (r) = PMIx_Setenv((a), (b), true, (c))
+
+#define PMIX_VALUE_CONSTRUCT(m) \
+    PMIx_Value_construct(m)
+
+#define PMIX_VALUE_DESTRUCT(m) \
+    PMIx_Value_destruct(m)
+
+#define PMIX_VALUE_CREATE(m, n) \
+    (m) = PMIx_Value_create(n)
+
+// free(m) is called inside PMIx_Value_free().
+#define PMIX_VALUE_RELEASE(m)  \
+    do {                       \
+        PMIx_Value_free(m, 1); \
+        (m) = NULL;            \
+    } while (0)
+
+// free(m) is called inside PMIx_Value_free().
+#define PMIX_VALUE_FREE(m, n)   \
+    do {                        \
+        PMIx_Value_free(m, n);  \
+        (m) = NULL;             \
+    } while (0)
+
+#define PMIX_CHECK_TRUE(a) \
+    (PMIX_BOOL_TRUE == PMIx_Value_true(a) ? true : false)
+
+#define PMIX_CHECK_BOOL(a) \
+    (PMIX_NON_BOOL == PMIx_Value_true(a) ? false : true)
+
 #define PMIX_VALUE_LOAD(v, d, t) \
     PMIx_Value_load((v), (d), (t))
 
@@ -267,6 +587,57 @@ PMIX_EXPORT void PMIx_Info_list_release(void *ptr);
 #define PMIX_VALUE_XFER_DIRECT(r, v, s)     \
     (r) = PMIx_Value_xfer((v), (s))
 
+#define PMIX_INFO_CONSTRUCT(m) \
+    PMIx_Info_construct(m)
+
+#define PMIX_INFO_DESTRUCT(m) \
+    PMIx_Info_destruct(m)
+
+#define PMIX_INFO_CREATE(m, n) \
+    (m) = PMIx_Info_create(n)
+
+// free(m) is called inside PMIx_Info_free().
+#define PMIX_INFO_FREE(m, n)  \
+    do {                      \
+        PMIx_Info_free(m, n); \
+        (m) = NULL;           \
+    } while (0)
+
+#define PMIX_INFO_REQUIRED(m) \
+    PMIx_Info_required(m)
+
+#define PMIX_INFO_OPTIONAL(m) \
+    PMIx_Info_optional(m)
+
+#define PMIX_INFO_IS_REQUIRED(m) \
+    PMIx_Info_is_required(m)
+
+#define PMIX_INFO_IS_OPTIONAL(m) \
+    PMIx_Info_is_optional(m)
+
+#define PMIX_INFO_PROCESSED(m)  \
+    PMIx_Info_processed(m)
+
+#define PMIX_INFO_WAS_PROCESSED(m)  \
+    PMIx_Info_was_processed(m)
+
+#define PMIX_INFO_SET_END(m)    \
+    PMIx_Info_set_end(m)
+#define PMIX_INFO_IS_END(m)         \
+    PMIx_Info_is_end(m)
+
+#define PMIX_INFO_SET_QUALIFIER(i)   \
+    PMIx_Info_qualifier(i)
+
+#define PMIX_INFO_IS_QUALIFIER(i)    \
+    PMIx_Info_is_qualifier(i)
+
+#define PMIX_INFO_SET_PERSISTENT(ii) \
+    PMIx_Info_persistent(ii)
+
+#define PMIX_INFO_IS_PERSISTENT(ii)  \
+    PMIx_Info_is_persistent(ii)
+
 #define PMIX_INFO_LOAD(i, k, d, t)  \
     (void) PMIx_Info_load(i, k, d, t)
 
@@ -295,12 +666,18 @@ PMIX_EXPORT void PMIx_Info_list_release(void *ptr);
         }                                                                       \
     } while (0)
 
+#define PMIX_INFO_TRUE(m)   \
+    (PMIX_BOOL_TRUE == PMIx_Info_true(m) ? true : false)
+
 #define PMIX_INFO_LIST_START(p)    \
     (p) = PMIx_Info_list_start()
 
 #define PMIX_INFO_LIST_ADD(r, p, a, v, t)     \
     (r) = PMIx_Info_list_add((p), (a), (v), (t))
 
+#define PMIX_INFO_LIST_PREPEND(r, p, a, v, t)     \
+    (r) = PMIx_Info_list_prepend((p), (a), (v), (t))
+
 #define PMIX_INFO_LIST_INSERT(r, p, i)     \
     (r) = PMIx_Info_list_insert((p), (i))
 
@@ -313,9 +690,462 @@ PMIX_EXPORT void PMIx_Info_list_release(void *ptr);
 #define PMIX_INFO_LIST_RELEASE(p) \
     PMIx_Info_list_release((p))
 
+#define PMIX_TOPOLOGY_CONSTRUCT(m) \
+    PMIx_Topology_construct(m)
+
+#define PMIX_TOPOLOGY_CREATE(m, n) \
+    (m) = PMIx_Topology_create(n)
+
 #define PMIX_TOPOLOGY_DESTRUCT(x) \
     PMIx_Topology_destruct(x)
 
+// free(m) is called inside PMIx_Topology_free().
+#define PMIX_TOPOLOGY_FREE(m, n)  \
+    do {                          \
+        PMIx_Topology_free(m, n); \
+        (m) = NULL;               \
+    } while (0)
+
+#define PMIX_COORD_CREATE(m, n, d)  \
+    (m) = PMIx_Coord_create(d, n)
+
+#define PMIX_COORD_CONSTRUCT(m) \
+    PMIx_Coord_construct(m)
+
+#define PMIX_COORD_DESTRUCT(m)  \
+    PMIx_Coord_destruct(m)
+
+// free(m) is called inside PMIx_Coord_free().
+#define PMIX_COORD_FREE(m, n)   \
+    do {                        \
+        PMIx_Coord_free(m, n);  \
+        (m) = NULL;             \
+    } while(0)
+
+#define PMIX_CPUSET_CONSTRUCT(m) \
+    PMIx_Cpuset_construct(m)
+
+#define PMIX_CPUSET_DESTRUCT(m) \
+    PMIx_Cpuset_destruct(m)
+
+#define PMIX_CPUSET_CREATE(m, n) \
+    (m) = PMIx_Cpuset_create(n)
+
+// free(m) is called inside PMIx_Cpuset_free().
+#define PMIX_CPUSET_FREE(m, n)    \
+    do {                          \
+        PMIx_Cpuset_free(m, n);   \
+        (m) = NULL;               \
+    } while(0)
+
+#define PMIX_GEOMETRY_CONSTRUCT(m) \
+    PMIx_Geometry_construct(m)
+
+#define PMIX_GEOMETRY_DESTRUCT(m) \
+    PMIx_Geometry_destruct(m)
+
+#define PMIX_GEOMETRY_CREATE(m, n) \
+    (m) = PMIx_Geometry_create(n)
+
+// free(m) is called inside PMIx_Geometry_free().
+#define PMIX_GEOMETRY_FREE(m, n)    \
+    do {                            \
+        PMIx_Geometry_free(m, n);   \
+        (m) = NULL;                 \
+    } while(0)
+
+#define PMIX_DEVICE_DIST_CONSTRUCT(m) \
+    PMIx_Device_distance_construct(m)
+
+#define PMIX_DEVICE_DIST_DESTRUCT(m) \
+    PMIx_Device_distance_destruct(m)
+
+#define PMIX_DEVICE_DIST_CREATE(m, n) \
+    (m) = PMIx_Device_distance_create(n)
+
+// free(m) is called inside PMIx_Device_distance_free().
+#define PMIX_DEVICE_DIST_FREE(m, n)      \
+    do {                                 \
+        PMIx_Device_distance_free(m, n); \
+        (m) = NULL;                      \
+    } while(0)
+
+#define PMIX_BYTE_OBJECT_CONSTRUCT(m) \
+    PMIx_Byte_object_construct(m)
+
+#define PMIX_BYTE_OBJECT_DESTRUCT(m) \
+    PMIx_Byte_object_destruct(m)
+
+#define PMIX_BYTE_OBJECT_CREATE(m, n) \
+    (m) = PMIx_Byte_object_create(n)
+
+// free(m) is called inside PMIx_Byte_object_free().
+#define PMIX_BYTE_OBJECT_FREE(m, n)  \
+    do {                             \
+        PMIx_Byte_object_free(m, n); \
+        (m) = NULL;                  \
+    } while(0)
+
+#define PMIX_BYTE_OBJECT_LOAD(b, d, s)  \
+    do {                                \
+        PMIx_Byte_object_load(b, d, s); \
+        (d) = NULL;                     \
+        (s) = 0;                        \
+    } while(0)
+
+#define PMIX_ENDPOINT_CONSTRUCT(m) \
+    PMIx_Endpoint_construct(m)
+
+#define PMIX_ENDPOINT_DESTRUCT(m) \
+    PMIx_Endpoint_destruct(m)
+
+#define PMIX_ENDPOINT_CREATE(m, n) \
+    (m) = PMIx_Endpoint_create(n)
+
+// free(m) is called inside PMIx_Endpoint_free().
+#define PMIX_ENDPOINT_FREE(m, n)  \
+    do {                          \
+        PMIx_Endpoint_free(m, n); \
+        (m) = NULL;               \
+    } while(0)
+
+#define PMIX_ENVAR_CREATE(m, n) \
+    (m) = PMIx_Envar_create(n)
+
+// free(m) is called inside PMIx_Envar_free().
+#define PMIX_ENVAR_FREE(m, n)   \
+    do {                        \
+        PMIx_Envar_free(m, n);  \
+        (m) = NULL;             \
+    } while(0)
+
+#define PMIX_ENVAR_CONSTRUCT(m) \
+    PMIx_Envar_construct(m)
+
+#define PMIX_ENVAR_DESTRUCT(m) \
+    PMIx_Envar_destruct(m)
+
+#define PMIX_ENVAR_LOAD(m, e, v, s) \
+    PMIx_Envar_load(m, e, v, s)
+
+#define PMIX_DATA_BUFFER_CREATE(m)  \
+    (m) = PMIx_Data_buffer_create()
+
+// free(m) is called inside PMIx_Data_buffer_release().
+#define PMIX_DATA_BUFFER_RELEASE(m)  \
+    do {                             \
+        PMIx_Data_buffer_release(m); \
+        (m) = NULL;                  \
+    } while (0)
+
+#define PMIX_DATA_BUFFER_CONSTRUCT(m)       \
+    PMIx_Data_buffer_construct(m)
+
+#define PMIX_DATA_BUFFER_DESTRUCT(m)        \
+    PMIx_Data_buffer_destruct(m)
+
+#define PMIX_DATA_BUFFER_LOAD(b, d, s)  \
+    PMIx_Data_buffer_load(b, d, s)
+
+#define PMIX_DATA_BUFFER_UNLOAD(b, d, s)    \
+    PMIx_Data_buffer_unload(b, &(d), &(s))
+
+#define PMIX_PROC_CREATE(m, n) \
+    (m) = PMIx_Proc_create(n)
+
+#define PMIX_PROC_CONSTRUCT(m) \
+    PMIx_Proc_construct(m)
+
+#define PMIX_PROC_DESTRUCT(m) \
+    PMIx_Proc_destruct(m)
+
+// free(m) is called inside PMIx_Proc_free().
+#define PMIX_PROC_FREE(m, n)    \
+    do {                        \
+        PMIx_Proc_free(m, n);   \
+        (m) = NULL;             \
+    } while (0)
+
+// free(m) is called inside PMIx_Proc_free().
+#define PMIX_PROC_RELEASE(m)    \
+do {                            \
+    PMIx_Proc_free(m, 1);       \
+    (m) = NULL;                 \
+} while(0)
+
+#define PMIX_PROC_LOAD(m, n, r) \
+    PMIx_Proc_load(m, n, r)
+
+#define PMIX_MULTICLUSTER_NSPACE_CONSTRUCT(t, c, n) \
+    PMIx_Multicluster_nspace_construct(t, c, n)
+
+#define PMIX_MULTICLUSTER_NSPACE_PARSE(t, c, n) \
+    PMIx_Multicluster_nspace_parse(t, c, n)
+
+#define PMIX_PROC_INFO_CREATE(m, n) \
+    (m) = PMIx_Proc_info_create(n)
+
+#define PMIX_PROC_INFO_CONSTRUCT(m) \
+    PMIx_Proc_info_construct(m)
+
+#define PMIX_PROC_INFO_DESTRUCT(m) \
+    PMIx_Proc_info_destruct(m)
+
+// free(m) is called inside PMIx_Proc_info_free().
+#define PMIX_PROC_INFO_FREE(m, n)  \
+    do {                           \
+        PMIx_Proc_info_free(m, n); \
+        (m) = NULL;                \
+    } while (0)
+
+// free(m) is called inside PMIx_Proc_info_free().
+#define PMIX_PROC_INFO_RELEASE(m)   \
+do {                                \
+    PMIx_Proc_info_free(m, 1)       \
+    (m) = NULL;                     \
+} while(0)
+
+#define PMIX_PROC_STATS_CONSTRUCT(m) \
+    PMIx_Proc_stats_construct(m)
+
+#define PMIX_PROC_STATS_DESTRUCT(m) \
+    PMIx_Proc_stats_destruct(m)
+
+#define PMIX_PROC_STATS_CREATE(m, n) \
+    (m) = PMIx_Proc_stats_create(n)
+
+// free(m) is called inside PMIx_Proc_stats_free().
+#define PMIX_PROC_STATS_FREE(m, n)  \
+do {                                \
+    PMIx_Proc_stats_free(m, n);     \
+    (m) = NULL;                     \
+} while(0)
+
+// free(m) is called inside PMIx_Proc_stats_free().
+#define PMIX_PROC_STATS_RELEASE(m)  \
+do {                                \
+    PMIx_Proc_stats_free(m, 1);     \
+    (m) = NULL;                     \
+} while(0)
+
+#define PMIX_DISK_STATS_CONSTRUCT(m) \
+    PMIx_Disk_stats_construct(m)
+
+#define PMIX_DISK_STATS_DESTRUCT(m) \
+    PMIx_Disk_stats_destruct(m)
+
+#define PMIX_DISK_STATS_CREATE(m, n) \
+    (m) = PMIx_Disk_stats_create(n)
+
+// free(m) is called inside PMIx_Disk_stats_free().
+#define PMIX_DISK_STATS_FREE(m, n)  \
+do {                                \
+    PMIx_Disk_stats_free(m, n);     \
+    (m) = NULL;                     \
+} while(0)
+
+// free(m) is called inside PMIx_Disk_stats_free().
+#define PMIX_DISK_STATS_RELEASE(m) \
+do {                               \
+    PMIx_Disk_stats_free(m, 1);    \
+    (m) = NULL;                    \
+} while(0)
+
+#define PMIX_NET_STATS_CONSTRUCT(m) \
+    PMIx_Net_stats_construct(m)
+
+#define PMIX_NET_STATS_DESTRUCT(m) \
+    PMIx_Net_stats_destruct(m)
+
+#define PMIX_NET_STATS_CREATE(m, n) \
+    (m) = PMIx_Net_stats_create(n)
+
+// free(m) is called inside PMIx_Net_stats_free().
+#define PMIX_NET_STATS_FREE(m, n)  \
+do {                               \
+    PMIx_Net_stats_free(m, n);     \
+    (m) = NULL;                    \
+} while(0)
+
+#define PMIX_NET_STATS_RELEASE(m)  \
+do {                               \
+    PMIx_Net_stats_free(m, 1);     \
+    (m) = NULL;                    \
+} while(0)
+
+#define PMIX_NODE_STATS_CONSTRUCT(m) \
+    PMIx_Node_stats_construct(m)
+
+#define PMIX_NODE_STATS_DESTRUCT(m) \
+    PMIx_Node_stats_destruct(m)
+
+#define PMIX_NODE_STATS_CREATE(m, n) \
+    (m) = PMIx_Node_stats_create(n)
+
+// free(m) is called inside PMIx_Node_stats_free().
+#define PMIX_NODE_STATS_FREE(m, n)  \
+do {                                \
+    PMIx_Node_stats_free(m, n);     \
+    (m) = NULL;                     \
+} while(0)
+
+// free(m) is called inside PMIx_Node_stats_free().
+#define PMIX_NODE_STATS_RELEASE(m)  \
+do {                                \
+    PMIx_Node_stats_free(m, 1);     \
+    (m) = NULL;                     \
+} while(0)
+
+#define PMIX_PDATA_CONSTRUCT(m) \
+    PMIx_Pdata_construct(m)
+
+#define PMIX_PDATA_DESTRUCT(m) \
+    PMIx_Pdata_destruct(m)
+
+#define PMIX_PDATA_CREATE(m, n) \
+    (m) = PMIx_Pdata_create(n)
+
+// free(m) is called inside PMIx_Pdata_free().
+#define PMIX_PDATA_FREE(m, n)   \
+do {                            \
+    PMIx_Pdata_free(m, n);      \
+    (m) = NULL;                 \
+} while(0)
+
+// free(m) is called inside PMIx_Pdata_free().
+#define PMIX_PDATA_RELEASE(m)   \
+do {                            \
+    PMIx_Pdata_free(m, 1);      \
+    (m) = NULL;                 \
+} while(0)
+
+#define PMIX_APP_CONSTRUCT(m) \
+    PMIx_App_construct(m)
+
+#define PMIX_APP_DESTRUCT(m) \
+    PMIx_App_destruct(m)
+
+#define PMIX_APP_CREATE(m, n) \
+    (m) = PMIx_App_create(n)
+
+#define PMIX_APP_INFO_CREATE(m, n) \
+    PMIx_App_info_create(m, n)
+
+// free(m) is called inside PMIx_App_free().
+#define PMIX_APP_RELEASE(m)     \
+    do {                        \
+        PMIx_App_free(m, 1);    \
+        (m) = NULL;             \
+    } while (0)
+
+// free(m) is called inside PMIx_App_free().
+#define PMIX_APP_FREE(m, n)     \
+    do {                        \
+        PMIx_App_free(m, n);    \
+        (m) = NULL;             \
+    } while (0)
+
+#define PMIX_QUERY_CONSTRUCT(m) \
+    PMIx_Query_construct(m)
+
+#define PMIX_QUERY_DESTRUCT(m) \
+    PMIx_Query_destruct(m)
+
+#define PMIX_QUERY_CREATE(m, n) \
+    (m) = PMIx_Query_create(n)
+
+#define PMIX_QUERY_QUALIFIERS_CREATE(m, n) \
+    PMIx_Query_qualifiers_create(m, n)
+
+// free(m) is called inside PMIx_Query_release().
+#define PMIX_QUERY_RELEASE(m)       \
+    do {                            \
+        PMIx_Query_release((m));    \
+        (m) = NULL;                 \
+    } while (0)
+
+// free(m) is called inside PMIx_Query_free().
+#define PMIX_QUERY_FREE(m, n)   \
+    do {                        \
+        PMIx_Query_free(m, n);  \
+        (m) = NULL;             \
+    } while (0)
+
+#define PMIX_REGATTR_CONSTRUCT(a) \
+    PMIx_Regattr_construct(a)
+
+#define PMIX_REGATTR_LOAD(a, n, k, t, v) \
+    PMIx_Regattr_load(a, n, k, t, v)
+
+#define PMIX_REGATTR_DESTRUCT(a) \
+    PMIx_Regattr_destruct(a)
+
+#define PMIX_REGATTR_CREATE(m, n) \
+    (m)= PMIx_Regattr_create(n)
+
+// free(m) is called inside PMIx_Regattr_free().
+#define PMIX_REGATTR_FREE(m, n)     \
+    do {                            \
+        PMIx_Regattr_free(m, n);    \
+        (m) = NULL;                 \
+    } while (0)
+
+#define PMIX_REGATTR_XFER(a, b) \
+    PMIx_Regattr_xfer(a, b)
+
+#define PMIX_DATA_ARRAY_INIT(m, t) \
+    PMIx_Data_array_init(m, t)
+
+#define PMIX_DATA_ARRAY_CONSTRUCT(m, n, t) \
+    PMIx_Data_array_construct(m, n, t)
+
+#define PMIX_DATA_ARRAY_DESTRUCT(m) \
+    PMIx_Data_array_destruct(m)
+
+#define PMIX_DATA_ARRAY_CREATE(m, n, t) \
+    (m) = PMIx_Data_array_create(n, t)
+
+// free(m) is called inside PMIx_Data_array_free().
+#define PMIX_DATA_ARRAY_FREE(m)     \
+    do {                            \
+        PMIx_Data_array_free(m);    \
+        (m) = NULL;                 \
+    } while(0)
+
+
+// functions that are no longer visible from inside
+// the PMIx library
+
+#define pmix_argv_append_nosize(a, b) \
+    PMIx_Argv_append_nosize(a, b)
+
+#define pmix_argv_append_unique_nosize(a, b) \
+    PMIx_Argv_append_unique_nosize(a, b)
+
+#define pmix_argv_split(a, b) \
+    PMIx_Argv_split(a, b)
+
+#define pmix_argv_split_with_empty(a, b) \
+    PMIx_Argv_split_with_empty(a, b)
+
+#define pmix_argv_free(a) \
+    PMIx_Argv_free(a)
+
+#define pmix_argv_count(a) \
+    PMIx_Argv_count(a)
+
+#define pmix_argv_join(a, b) \
+    PMIx_Argv_join(a, b)
+
+#define pmix_argv_prepend_nosize(a, b) \
+    PMIx_Argv_prepend_nosize(a, b)
+
+#define pmix_argv_copy(a) \
+    PMIx_Argv_copy(a)
+
+#define pmix_setenv(a, b, c, d) \
+    PMIx_Setenv(a, b, c, d)
+
 #if defined(c_plusplus) || defined(__cplusplus)
 }
 #endif
diff --git a/deps/pmix/include/pmix_version.h b/deps/pmix/include/pmix_version.h
index e28c729cb..14d7a3f2f 100644
--- a/deps/pmix/include/pmix_version.h
+++ b/deps/pmix/include/pmix_version.h
@@ -19,7 +19,7 @@
 /* define PMIx version */
 #define PMIX_VERSION_MAJOR 4L
 #define PMIX_VERSION_MINOR 2L
-#define PMIX_VERSION_RELEASE 2L
+#define PMIX_VERSION_RELEASE 6L
 
-#define PMIX_NUMERIC_VERSION 0x00040202
+#define PMIX_NUMERIC_VERSION 0x00040206
 #endif
diff --git a/deps/pmix/update_pmix.sh b/deps/pmix/update_pmix.sh
index a94957b31..06a32ce73 100755
--- a/deps/pmix/update_pmix.sh
+++ b/deps/pmix/update_pmix.sh
@@ -22,7 +22,8 @@ install_path=`pwd`/_install
 
 git clone $REPO
 pushd ./openpmix
-target_version="v4.2.2"
+# version should be clarified and aligned with Borealis maintainer
+target_version="v4.2.6-1"
 git checkout ${target_version}
 
 git submodule update --init --recursive
diff --git a/doc/rst/Makefile b/doc/rst/Makefile
index 69fe55ecf..b194eba3a 100755
--- a/doc/rst/Makefile
+++ b/doc/rst/Makefile
@@ -11,6 +11,12 @@ BUILDDIR      = build
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
+ditaxml:
+	$ pip3 install --upgrade --force-reinstall git+https://github.com/intel-sandbox/tools.docs.sphinx2dita@main#egg=sphinx2dita
+	$ sphinx-build -D extensions=sphinx2dita,sphinx.ext.todo,breathe -b ditaxml source build/ditaxml -d build/doctrees
+	@echo
+	@echo "Build finished. The ditaxml files are in $(BUILDDIR)/ditaxml."
+
 .PHONY: help Makefile
 
 # Catch-all target: route all unknown targets to Sphinx using the new
diff --git a/doc/rst/source/advanced-configuration/collective-algorithms-selection.rst b/doc/rst/source/advanced-configuration/collective-algorithms-selection.rst
index e9e67aa32..f2ac377ab 100644
--- a/doc/rst/source/advanced-configuration/collective-algorithms-selection.rst
+++ b/doc/rst/source/advanced-configuration/collective-algorithms-selection.rst
@@ -4,4 +4,4 @@ Selection of Collective Algorithms
 
 |product_short| supports manual selection of collective algorithms for different message size ranges.
 
-Refer to :ref:`Collective algorithms selection` section for details.
+Refer to :ref:`Collective Algorithms Selection <collective-algorithms-selection>` section for details.
diff --git a/doc/rst/source/advanced-configuration/low-precision-datatypes.rst b/doc/rst/source/advanced-configuration/low-precision-datatypes.rst
index e47c22f10..d78904476 100644
--- a/doc/rst/source/advanced-configuration/low-precision-datatypes.rst
+++ b/doc/rst/source/advanced-configuration/low-precision-datatypes.rst
@@ -18,4 +18,4 @@ For BF16 <-> FP32 conversion |product_short| provides ``AVX512F`` and ``AVX512_B
 For FP16 <-> FP32 conversion |product_short| provides ``F16C`` and ``AVX512F``-based implementations.
 Both implementations require GCC 4.9 or higher.
 
-Refer to :ref:`Low-precision datatypes` for details about relevant environment variables.
+Refer to :ref:`Low-precision datatypes <low-precision-datatypes>` for details about relevant environment variables.
diff --git a/doc/rst/source/advanced-configuration/operation-fusion.rst b/doc/rst/source/advanced-configuration/operation-fusion.rst
index 39e9aa5b3..7d9a25a24 100644
--- a/doc/rst/source/advanced-configuration/operation-fusion.rst
+++ b/doc/rst/source/advanced-configuration/operation-fusion.rst
@@ -2,17 +2,17 @@
 Fusion of Communication Operations
 ==================================
 
-In some cases, it may be beneficial to postpone execution of communication operations and execute them all together as a single operation in a batch mode. 
-This can reduce operation setup overhead and improve interconnect saturation. 
+In some cases, it may be beneficial to postpone execution of communication operations and execute them all together as a single operation in a batch mode.
+This can reduce operation setup overhead and improve interconnect saturation.
 
 |product_short| provides several knobs to enable and control such optimization:
 
 - The fusion is enabled by :ref:`CCL_FUSION`.
 - The advanced configuration is controlled by:
 
-  * :ref:`CCL_FUSION_BYTES_THRESHOLD`
-  * :ref:`CCL_FUSION_COUNT_THRESHOLD`
-  * :ref:`CCL_FUSION_CYCLE_MS`
+  * :ref:`CCL_FUSION_BYTES_THRESHOLD <CCL_FUSION_BYTES_THRESHOLD>`
+  * :ref:`CCL_FUSION_COUNT_THRESHOLD <CCL_FUSION_COUNT_THRESHOLD>`
+  * :ref:`CCL_FUSION_CYCLE_MS <CCL_FUSION_CYCLE_MS>`
 
 .. note::
     For now, this functionality is supported for ``allreduce`` operations only.
diff --git a/doc/rst/source/advanced-configuration/operation-prioritization.rst b/doc/rst/source/advanced-configuration/operation-prioritization.rst
index 416ec3527..9f2d707f0 100644
--- a/doc/rst/source/advanced-configuration/operation-prioritization.rst
+++ b/doc/rst/source/advanced-configuration/operation-prioritization.rst
@@ -2,7 +2,7 @@
 Prioritization of Communication Operations
 ==========================================
 
-|product_short| supports prioritization of communication operations that controls the order in which individual communication operations are executed. 
+|product_short| supports prioritization of communication operations that controls the order in which individual communication operations are executed.
 This allows to postpone execution of non-urgent operations to complete urgent operations earlier, which may be beneficial for many use cases.
 
 The communication prioritization is controlled by priority value. Note that the priority must be a non-negative number with a higher number standing for a higher priority.
@@ -13,4 +13,4 @@ There are the following prioritization modes:
 -	Direct - you explicitly specify priority using ``priority`` field in operation attribute.
 -	LIFO (Last In, First Out) - priority is implicitly increased on each operation call. In this case, you do not have to specify priority.
 
-The prioritization mode is controlled by :ref:`CCL_PRIORITY`.
+The prioritization mode is controlled by :ref:`CCL_PRIORITY <CCL_PRIORITY>`.
diff --git a/doc/rst/source/benchmark.rst b/doc/rst/source/benchmark.rst
new file mode 100644
index 000000000..654005ee4
--- /dev/null
+++ b/doc/rst/source/benchmark.rst
@@ -0,0 +1,236 @@
+oneCCL Benchmark User Guide
+=====================================
+
+The oneCCL benchmark provides performance measurements for the collective operations in oneCCL, such as:
+
+* ``allreduce``
+* ``reduce``
+* ``allgather``
+* ``alltoall``
+* ``alltoallv``
+* ``reduce-scatter``
+* ``broadcast``
+
+The benchmark is distributed with the oneCCL package. You can find it in the examples directory within the oneCCL installation path. 
+
+
+Build oneCCL Benchmark
+***********************
+
+CPU-Only
+^^^^^^^^^
+
+To build the benchmark, complete the following steps: 
+ 
+1. Configure your environment. Source the installed oneCCL library for the CPU-only support: 
+ 
+   .. code:: 
+      
+      source <ccl installation dir>/ccl/latest/env/vars.sh --ccl-configuration=cpu
+
+2. Navigate to ``<oneCCL install location>/share/doc/ccl/examples``
+3. Build the benchmark with the following command:
+
+   .. code::
+
+      cmake -S . -B build -DCMAKE_INSTALL_PREFIX=$(pwd)/build/_install && cmake --build build -j $(nproc) -t install
+ 
+CPU-GPU
+^^^^^^^^
+ 
+1. Configure your environment. 
+   
+   * Source the Intel(R) oneAPI DPC++/C++ Compiler. See the `documentation <https://www.intel.com/content/www/us/en/docs/dpcpp-cpp-compiler/get-started-guide/current/overview.html>`_ for the instructions.
+   * Source the installed oneCCL library for the CPU-GPU support: 
+   
+     .. code::
+      
+        source <ccl installation dir>/ccl/latest/env/vars.sh --ccl-configuration=cpu_gpu_dpcpp
+
+2. Navigate to ``<oneCCL install location>/share/doc/ccl/examples``.
+3. Build the SYCL benchmark with the following command:
+
+   .. code::
+
+      cmake -S . -B build -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCOMPUTE_BACKEND=dpcpp -DCMAKE_INSTALL_PREFIX=$(pwd)/build/_install && cmake --build build -j $(nproc) -t install
+
+
+Run oneCCL Benchmark
+*********************
+
+To run the benchmark, use the following command:
+
+.. code::
+
+   mpirun -np <N> -ppn <P> benchmark [arguments]
+
+Where:
+
+* ``N`` is the overall number of processes 
+* ``P`` is the number of processes within a node
+
+The benchmark reports:
+
+* ``#bytes`` - the message size in the number of bytes
+* ``#repetitions`` - the number of iterations
+* ``t_min`` - the average time across iterations of the fastest process in each iteration
+* ``t_max`` - the average time across iterations of the slowest process in each iteration
+* ``t_avg`` - the average time across processes and iterations
+* ``stddev`` - standard deviation
+* ``wait_t_avg`` - the average wait time after the collective call returns and until it completes. To enable, use the ``-x`` option. 
+
+Notice that ``t_min``, ``t_max``, and ``t_avg`` measure the total collective execution time. It means the timer starts before calling oneCCL API and ends once the collective completes. 
+While ``wait_t_avg`` only measures the wait time. It means the timer starts after the collective API call returns control to the host/calling thread and ends once the collective completes. 
+Thus, ``wait_t_avg`` does not include the time spent on the oneCCL API call, while ``t_min``, ``t_max``, and ``t_avg`` include that time. Time is reported in `μsec`.
+
+
+Benchmark Arguments
+^^^^^^^^^^^^^^^^^^^^^
+
+To see the benchmark arguments, use the ``--help`` argument.
+
+The benchmark accepts the following arguments:
+
+.. list-table:: 
+   :widths: 25 50 25
+   :header-rows: 1
+
+   * - Option
+     - Description
+     - Default Value 
+   * - ``-b``, ``--backend``
+     - Specify the backend. The possible values are ``host`` and ``sycl``. For a CPU-only build, the backend is automatically set to ``host``, and only the host option is available. 
+       For a CPU-GPU build, ``host`` and ``sycl`` options are available, and ``sycl`` is the default value. The host value allocates buffers in the host (CPU) memory, while the ``sycl`` value allocates buffers in the device (GPU) memory.
+     -  ``sycl``
+   * - ``-i``, ``--iters``
+     - Specify the number of iterations executed by the benchmark. 
+     - ``16``
+   * - ``-w``, ``--warmup_iters``
+     - Specify the number of the warmup iterations. It means the number of iterations the benchmark runs before starting the timing of the iterations specified with the ``-i`` argument. 
+     - ``16``
+   * - ``-j``, ``--iter_policy``
+     - Specify the iteration policy. Possible values are ``off`` and ``auto``.  
+       When the iteration policy is ``off``, the number of iterations is the same across the message sizes. 
+       When the iteration policy is ``auto``, the number of iterations reduces based on the message size of the collective operation. 
+     - ``auto``
+   * - ``-n``, ``--buf_count``
+     - Specify the number of collective operations the benchmark calls in each iteration. Each collective uses different ``send`` and ``receive`` buffers. 
+       The explicit wait calls are placed for each collective after all of them are called. 
+     - ``1``
+   * - ``-f``, ``--min_elem_count``
+     - Specify the minimum number of elements used for the collective.
+     - ``1``
+   * - ``-t``, ``--max_elem_count``
+     - Specify the maximum number of elements used for the collective. 
+     - ``128``
+   * - ``-y``, ``--elem_counts``
+     - Specify a list with the number of elements used for the collective, such as ``[-y 4, 8, 32, 131072]``.
+     - ``[1, 2, 4, 8, 16, 32, 64, 128]``
+   * - ``-c``, ``--check``
+     - Check for correctness. The possible values are ``off`` (disable checking), ``last`` (check the last iteration), and ``all`` (check all the iterations). 
+     - ``last``
+   * - ``-p``, ``--cache``
+     - Specify whether to use persistent collectives (``p=1``) or not (``p=0``). 
+     
+       .. note:: A collective is persistent when the same collective is called with the same parameters multiple times. oneCCL generates a schedule for each collective it runs and can apply optimizations when persistent collectives are used. 
+                 It means the schedule is generated once and reused across the subsequent invocations, saving the time to generate the schedule. 
+     
+     - ``1`` 
+   * - ``-q``, ``--inplace``
+     - Specify for oneCCL to use in-place (``1``) or out-of-place (``0``) buffers. With the in-place buffers, the send and receive buffers used by the collective are the same. 
+       With the out-of-place, the buffers are different. 
+     - ``0`` 
+   * - ``-a``, ``--sycl_dev_type``
+     - Specify the type of the SYCL device. The possible values are ``host``, ``cpu``, and ``gpu``. 
+     - ``gpu``
+   * - ``-g``, ``--sycl_root_dev``
+     - Specify to use the root devices (``0``) and sub-devices (``1``). 
+     - ``0`` 
+   * - ``-m``, ``--sycl_mem_type``
+     - Specify the type of SYCL memory. The possible values are ``usm`` (unified shared memory) and ``buf`` (buffers). 
+     - ``usm``
+   * - ``-u``, ``--sycl_usm_type``
+     - Specify the type of SYCL USM allocation. The possible values are ``device`` or ``shared``. 
+     - ``device`` 
+   * - ``-e``, ``--sycl_queue_type`` 
+     - Specify the type of the SYCL queue. The possible values are ``in_order`` and ``out_order``. 
+     - ``out_order``
+   * - ``-l``, ``--coll``
+     - Specify the collective to run. Accept a comma-separated list, without whitespace characters, of collectives to run. The available collectives are ``allreduce``, ``reduce``, ``alltoallv``, ``alltoall``, ``allgatherv``, ``reduce-scatter``, ``broadcast``. 
+     - ``allreduce`` 
+   * - ``-d``, ``--dtype``
+     - Specify the datatype. Accept a comma-separated list, without whitespace characters, of datatypes to benchmark. The available types are ``int8``, ``int32``, ``int64``, ``uint64``, ``float16``, ``float32``, and ``bfloat16``. 
+     - ``float32``
+   * - ``-r``, ``--reduction``
+     - Specify the type of the reduction. Accept a comma-separated list, without whitespace characters, of the reduction operations to run. The available operations are ``sum``, ``prod``, ``min``, and ``max``. 
+     - ``sum``
+   * - ``-o``, ``--csv_filepath`` 
+     - Store the output in the specified CSV file. Provide the path to the file where the CSV-formatted data is stored.
+     - 
+   * - ``-x``, ``--ext``
+     - Specify to show the additional information. The possible values are ``off``, ``auto``, and ``on``. With ``on``, it also displays the average wait time. 
+     - ``auto`` 
+   * - ``-h``, ``--help``
+     - Show all of the supported options.
+     -
+
+.. note:: 
+   
+   The ``-t`` and ``-f`` options specify the count in number of elements, so the total number of bytes is obtained by multiplying the number of elements by the number of bytes of the data type the collective uses. 
+   For instance, with ``-f 128`` and ``fp32`` datatype, the total amount of bytes is 512 (128 element count * 4 bytes FP32).
+   The benchmark runs and reports time for message sizes that correspond to the ``-t`` and ``-f`` arguments and all message sizes that are powers of two in between these two numbers. 
+
+
+Example
+********
+
+GPU
+^^^^
+
+The following example shows how to run the benchmark with the GPU buffers:
+
+.. code::
+   
+   mpirun -n <N> -ppn <P> benchmark -a gpu -m usm -u device -l allreduce -i 20 -j off -f 1024 -t 67108864 -d float32 -p 0 -e in_order
+
+
+The above command runs:
+
+* The ``allreduce`` collective operation 
+* With a total of ``N`` processes
+* With ``P`` processes per node allocating the memory in the GPU
+* Using SYCL Unified Shared Memory (USM) of the device type
+* 20 iterations
+* With the element count from 1024 to 67108864 (the benchmark runs with all the powers of two in that range) of float32 datatype, assuming the collective is not persistent and using a SYCL in-order queue
+
+
+Similar for ``allreduce`` and ``reduce_scatter``:
+
+.. code::
+   
+   mpirun -n <N> -ppn <P> benchmark -a gpu -m usm -u device -l allreduce,reduce_scatter -i 20 -j off -f 1024 -t 67108864 -d float32 -p 0 -e in_order 
+
+.. note:: In this case, the time reported is the accumulated time corresponding to the execution time of ``allreduce`` and ``reduce_scatter``. 
+
+CPU
+^^^^
+
+.. code::
+
+   mpirun -b host -n <N> -ppn <P> benchmark -l allreduce -i 20 -j off -f 1024 -t 67108864 -d float32 -p 0 
+
+
+The above command runs:
+
+* The ``allreduce`` collective operation 
+* With a total of ``N`` processes
+* With ``P`` processes per node
+* 20 iterations
+* With the element count from 1024 to 67108864 (the benchmark runs with all the powers of two in that range) of float32 datatype, assuming the collective is not persistent
+
+
+Similar for ``allreduce`` and ``reduce_scatter``:
+
+.. code::
+
+   mpirun -b host -n <N> -ppn <P> benchmark -l allreduce,reduce_scatter -i 20 -j off -f 1024 -t 67108864 -d float32 -p 0 
diff --git a/doc/rst/source/env-variables.rst b/doc/rst/source/env-variables.rst
index b1c9ab64a..46ba589b9 100644
--- a/doc/rst/source/env-variables.rst
+++ b/doc/rst/source/env-variables.rst
@@ -2,6 +2,8 @@
 Environment Variables
 =====================
 
+.. _collective-algorithms-selection:
+
 Collective Algorithms Selection
 ###############################
 oneCCL supports collective operations for the host (CPU) memory buffers and device (GPU) memory buffers. Below you can see how to select the collective algorithm depending on the type of buffer being utilized. 
@@ -68,13 +70,13 @@ CCL_ALLGATHERV_MONOLITHIC_PIPELINE_KERNEL
    * - <value>
      - Description
    * - ``1``
-     - Uses compute kernels to transfer data across GPUs for the ``ALLGATHERV`` collective. 
+     - Uses compute kernels to transfer data across GPUs for the ``ALLGATHERV`` collective. The default value.
    * - ``0``
-     - Uses copy engines to transfer data across GPUs for the ``ALLGATHERV`` collective. The default value.
+     - Uses copy engines to transfer data across GPUs for the ``ALLGATHERV`` collective. 
   
 **Description**
 
-Set this environment variable to enable compute kernels for the ``ALLGATHERV`` collective using device (GPU) buffers. 
+Set this environment variable to enable compute kernels that pipeline data transfers across tiles in the same GPU with data transfers across different GPUs for the ``ALLGATHERV`` collective using device (GPU) buffers.
 
 
 
@@ -98,13 +100,13 @@ CCL_REDUCE_SCATTER_MONOLITHIC_PIPELINE_KERNEL
    * - <value>
      - Description
    * - ``1``
-     - Uses compute kernels for the ``ALLREDUCE``, ``REDUCE``, and ``REDUCE_SCATTER`` collectives. 
+     - Uses compute kernels for the ``ALLREDUCE``, ``REDUCE``, and ``REDUCE_SCATTER`` collectives. The default value. 
    * - ``0``
-     - Uses copy engines to transfer data across GPUs for the ``ALLREDUCE``, ``REDUCE``, and ``REDUCE_SCATTER collectives``. The default value. 
+     - Uses copy engines to transfer data across GPUs for the ``ALLREDUCE``, ``REDUCE``, and ``REDUCE_SCATTER`` collectives. 
   
 **Description**
 
-Set this environment variable to enable compute kernels, that pipeline data transfers across tiles in the same GPU and across different GPUs, for the ``ALLREDUCE``, ``REDUCE``, and ``REDUCE_SCATTER`` collectives using the device (GPU) buffers. 
+Set this environment variable to enable compute kernels that pipeline data transfers across tiles in the same GPU with data transfers across different GPUs for the ``ALLREDUCE``, ``REDUCE``, and ``REDUCE_SCATTER`` collectives using the device (GPU) buffers.  
  
 
 CCL_ALLTOALLV_MONOLITHIC_KERNEL 
@@ -135,6 +137,38 @@ CCL_ALLTOALLV_MONOLITHIC_KERNEL
 Set this environment variable to enable compute kernels for the ``ALLTOALL`` and ``ALLTOALLV`` collectives using device (GPU) buffers
 ``CCL_<coll_name>_SCALEOUT``. 
 
+
+CCL_SKIP_SCHEDULER  
+++++++++++++++++++
+
+**Syntax**
+
+::
+
+  CCL_SKIP_SCHEDULER=<value> 
+
+**Arguments**
+
+.. list-table:: 
+   :widths: 25 50
+   :header-rows: 1
+   :align: left
+
+   * - <value>
+     - Description
+   * - ``1``
+     - Enable SYCL kernels
+   * - ``0``
+     - Disable SYCL kernels. The default value. 
+
+**Description**
+
+Set this environment variable to ``1`` to enable the SYCL kernel-based implementation for ``ALLGATHERV``, ``ALLREDUCE``, and ``REDUCE_SCATTER``. 
+This optimization improves performance for all message sizes and supports the ``int32``, ``fp32``, ``fp16``, and ``bf16`` data types, the sum reduction operation, and single-node configurations. 
+oneCCL falls back to other implementations when the support is unavailable with SYCL kernels. Therefore, you can safely set this environment variable.
+
+
+
 SCALEOUT
 ++++++++
 
@@ -423,6 +457,7 @@ Workers
 
 The group of environment variables to control worker threads.
 
+.. _CCL_WORKER_COUNT:
 
 CCL_WORKER_COUNT
 ****************
@@ -448,6 +483,7 @@ CCL_WORKER_COUNT
 
 Set this environment variable to specify the number of |product_short| worker threads.
 
+.. _CCL_WORKER_AFFINITY:
 
 CCL_WORKER_AFFINITY
 *******************
@@ -519,6 +555,8 @@ ATL
 The group of environment variables to control ATL (abstract transport layer).
 
 
+.. _CCL_ATL_TRANSPORT:
+
 CCL_ATL_TRANSPORT
 *****************
 **Syntax**
@@ -792,6 +830,7 @@ CCL_MNIC_COUNT
 Set this environment variable to specify the maximum number of NICs to be selected.
 The actual number of NICs selected may be smaller due to limitations on transport level or system configuration.
 
+.. _low-precision-datatypes:
 
 Low-precision datatypes
 #######################
@@ -946,6 +985,7 @@ CCL_FUSION
 Set this environment variable to control fusion of collective operations.
 The real fusion depends on additional settings described below.
 
+.. _CCL_FUSION_BYTES_THRESHOLD:
 
 CCL_FUSION_BYTES_THRESHOLD
 **************************
@@ -972,6 +1012,7 @@ CCL_FUSION_BYTES_THRESHOLD
 
 Set this environment variable to specify the threshold of the number of bytes for a collective operation to be fused.
 
+.. _CCL_FUSION_COUNT_THRESHOLD:
 
 CCL_FUSION_COUNT_THRESHOLD
 **************************
@@ -999,6 +1040,8 @@ CCL_FUSION_COUNT_THRESHOLD
 Set this environment variable to specify count threshold for a collective operation to be fused.
 
 
+.. _CCL_FUSION_CYCLE_MS:
+
 CCL_FUSION_CYCLE_MS
 *******************
 **Syntax**
@@ -1026,6 +1069,7 @@ CCL_FUSION_CYCLE_MS
 
 Set this environment variable to specify the frequency of checking for collectives operations to be fused.
 
+.. _CCL_PRIORITY:
 
 CCL_PRIORITY
 ############
diff --git a/doc/rst/source/general-configuration/operation-execution.rst b/doc/rst/source/general-configuration/operation-execution.rst
index 177866e9e..fc308a0c2 100644
--- a/doc/rst/source/general-configuration/operation-execution.rst
+++ b/doc/rst/source/general-configuration/operation-execution.rst
@@ -4,9 +4,9 @@
 Execution of Communication Operations
 =====================================
 
-Communication operations are executed by CCL worker threads (workers). The number of workers is controlled by the :ref:`CCL_WORKER_COUNT` environment variable.
+Communication operations are executed by CCL worker threads (workers). The number of workers is controlled by the :ref:`CCL_WORKER_COUNT <CCL_WORKER_COUNT>` environment variable.
 
-Workers affinity is controlled by :ref:`CCL_WORKER_AFFINITY`.
+Workers affinity is controlled by :ref:`CCL_WORKER_AFFINITY <CCL_WORKER_AFFINITY>`.
 
 By setting workers affinity you can specify which CPU cores are used by CCL workers. The general rule of thumb is to use different CPU cores for compute (e.g. by specifying ``KMP_AFFINITY``) and for CCL communication.
 
@@ -28,7 +28,7 @@ In the example below, |product_short| creates four workers per process and pins
 Explicit setup
 ##############
 
-To set affinity explicitly for all local workers, pass ID of the cores to the ``CCL_WORKER_AFFINITY`` environment variable. 
+To set affinity explicitly for all local workers, pass ID of the cores to the ``CCL_WORKER_AFFINITY`` environment variable.
 
 .. rubric:: Example
 
diff --git a/doc/rst/source/general-configuration/transport-selection.rst b/doc/rst/source/general-configuration/transport-selection.rst
index b22798fc3..c8056aeaa 100644
--- a/doc/rst/source/general-configuration/transport-selection.rst
+++ b/doc/rst/source/general-configuration/transport-selection.rst
@@ -6,6 +6,6 @@ Transport Selection
 
 |product_short| supports two transports for inter-process communication: |mpi|_ and `libfabric* <https://github.com/ofiwg/libfabric>`_.
 
-The transport selection is controlled by :ref:`CCL_ATL_TRANSPORT`.
+The transport selection is controlled by :ref:`CCL_ATL_TRANSPORT <CCL_ATL_TRANSPORT>`.
 
 In case of MPI over libfabric implementation (for example, |mpi| 2021) or in case of direct libfabric transport, the selection of specific libfabric provider is controlled by the ``FI_PROVIDER`` environment variable.
diff --git a/doc/rst/source/index.rst b/doc/rst/source/index.rst
index 828df326c..1ff4567fe 100644
--- a/doc/rst/source/index.rst
+++ b/doc/rst/source/index.rst
@@ -40,6 +40,12 @@
    api.rst
    env-variables.rst
 
+.. toctree::
+   :maxdepth: 2
+   :caption: Benchmark User Guide
+
+   benchmark.rst
+
 .. toctree::
    :hidden: 
    :caption: Notices and Disclaimers
diff --git a/doc/rst/source/introduction/installation.rst b/doc/rst/source/introduction/installation.rst
index bd74cf699..e0c11ead1 100644
--- a/doc/rst/source/introduction/installation.rst
+++ b/doc/rst/source/introduction/installation.rst
@@ -91,3 +91,9 @@ You can customize CLI-based installation (for example, specify directory, compil
   ::
 
      make -j VERBOSE=1 install
+
+Find More
+*********
+- `oneCCL Get Started Guide <https://www.intel.com/content/www/us/en/docs/oneccl/get-started-guide/2021-11/overview.html>`_
+- `oneCCL GitHub Source Code Repository <https://github.com/oneapi-src/oneCCL>`_
+- `oneCCL Documentation <https://oneapi-src.github.io/oneCCL/>`_
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c54bff196..8435d41b4 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -126,9 +126,8 @@ add_subdirectory(cpu)
 
 if ("${COMPUTE_BACKEND}" STREQUAL "dpcpp")
     add_subdirectory(sycl)
-    #TODO: add cpu support
-    add_subdirectory(pt2pt)
 endif()
 add_subdirectory(common)
 add_subdirectory(benchmark)
 add_subdirectory(external_launcher)
+add_subdirectory(pt2pt)
diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index 4362a392e..4e39bc2b7 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -44,7 +44,7 @@ using namespace cl::sycl::access;
 #include "bf16.hpp"
 #include "coll.hpp"
 
-/* free letters: k e v z */
+/* free letters: k v z */
 void print_help_usage(const char* app) {
     PRINT("\nUSAGE:\n"
           "\t%s [OPTIONS]\n\n"
@@ -68,6 +68,7 @@ void print_help_usage(const char* app) {
           "\t[-g,--sycl_root_dev <select root devices only]: %d\n"
           "\t[-m,--sycl_mem_type <sycl memory type>]: %s\n"
           "\t[-u,--sycl_usm_type <sycl usm type>]: %s\n"
+          "\t[-e,--sycl_queue_type <sycl queue type>]: %s\n"
 #endif // CCL_ENABLE_SYCL
           "\t[-l,--coll <collectives list/all>]: %s\n"
           "\t[-d,--dtype <datatypes list/all>]: %s\n"
@@ -97,6 +98,7 @@ void print_help_usage(const char* app) {
           DEFAULT_SYCL_ROOT_DEV,
           sycl_mem_names[DEFAULT_SYCL_MEM_TYPE].c_str(),
           sycl_usm_names[DEFAULT_SYCL_USM_TYPE].c_str(),
+          sycl_queue_names[DEFAULT_SYCL_QUEUE_TYPE].c_str(),
 #endif // CCL_ENABLE_SYCL
           DEFAULT_COLL_LIST,
           DEFAULT_DTYPES_LIST,
@@ -244,6 +246,20 @@ int set_sycl_usm_type(const std::string& option_value, sycl_usm_type_t& usm) {
 
     return 0;
 }
+
+int set_sycl_queue_type(const std::string& option_value, sycl_queue_type_t& queue) {
+    std::string option_name = "sycl_queue_type";
+    std::set<std::string> supported_option_values{ sycl_queue_names[SYCL_QUEUE_OUT_ORDER],
+                                                   sycl_queue_names[SYCL_QUEUE_IN_ORDER] };
+
+    if (check_supported_options(option_name, option_value, supported_option_values))
+        return -1;
+
+    queue = (option_value == sycl_queue_names[SYCL_QUEUE_OUT_ORDER]) ? SYCL_QUEUE_OUT_ORDER
+                                                                     : SYCL_QUEUE_IN_ORDER;
+
+    return 0;
+}
 #endif // CCL_ENABLE_SYCL
 
 int set_datatypes(std::string option_value,
@@ -552,7 +568,7 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
 #endif // CCL_ENABLE_NUMA
 
 #ifdef CCL_ENABLE_SYCL
-    const char* sycl_options = "a:g:m:u:";
+    const char* sycl_options = "a:g:m:u:e:";
     memcpy(short_options + strlen(short_options), sycl_options, strlen(sycl_options));
 #endif // CCL_ENABLE_SYCL
 
@@ -576,6 +592,7 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
         { "sycl_root_dev", required_argument, nullptr, 'g' },
         { "sycl_mem_type", required_argument, nullptr, 'm' },
         { "sycl_usm_type", required_argument, nullptr, 'u' },
+        { "sycl_queue_type", required_argument, nullptr, 'e' },
 #endif // CCL_ENABLE_SYCL
         { "coll", required_argument, nullptr, 'l' },
         { "dtype", required_argument, nullptr, 'd' },
@@ -683,6 +700,12 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
                     errors++;
                 }
                 break;
+            case 'e':
+                if (set_sycl_queue_type(optarg, options.sycl_queue_type)) {
+                    PRINT("failed to parse 'sycl_queue_type' option");
+                    errors++;
+                }
+                break;
 #endif // CCL_ENABLE_SYCL
             case 'l':
                 if (strcmp("all", optarg) == 0) {
@@ -734,10 +757,11 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
     }
 
     if (options.inplace) {
-        //TODO: "allgatherv"
-        std::initializer_list<std::string> supported_colls = { "allreduce",
-                                                               "alltoall",
-                                                               "alltoallv" };
+        //TODO: "allgatherv" and "reduce_scatter" would pass with sycl kernels,
+        // but they must be verified against the schedule architecture
+        std::initializer_list<std::string> supported_colls = {
+            "allgatherv", "allreduce", "alltoall", "alltoallv", "reduce_scatter"
+        };
         for (auto name : options.coll_names) {
             if (!is_inplace_supported(name, supported_colls)) {
                 PRINT("inplace is not supported for %s yet", name.c_str());
@@ -810,36 +834,38 @@ void print_user_options(const user_options_t& options, const ccl::communicator&
     std::string sycl_dev_type_str = find_str_val(sycl_dev_names, options.sycl_dev_type);
     std::string sycl_mem_type_str = find_str_val(sycl_mem_names, options.sycl_mem_type);
     std::string sycl_usm_type_str = find_str_val(sycl_usm_names, options.sycl_usm_type);
-#endif
+    std::string sycl_queue_type_str = find_str_val(sycl_queue_names, options.sycl_queue_type);
+#endif // CCL_ENABLE_SYCL
 
     PRINT_BY_ROOT(comm,
                   "\noptions:"
-                  "\n  processes:      %d"
-                  "\n  backend:        %s"
-                  "\n  iters:          %zu"
-                  "\n  warmup_iters:   %zu"
-                  "\n  iter_policy:    %s"
-                  "\n  buf_count:      %zu"
-                  "\n  min_elem_count: %zu"
-                  "\n  max_elem_count: %zu"
-                  "\n  elem_counts:    %s"
-                  "\n  check:          %s"
-                  "\n  cache:          %d"
-                  "\n  inplace:        %d"
+                  "\n  processes:       %d"
+                  "\n  backend:         %s"
+                  "\n  iters:           %zu"
+                  "\n  warmup_iters:    %zu"
+                  "\n  iter_policy:     %s"
+                  "\n  buf_count:       %zu"
+                  "\n  min_elem_count:  %zu"
+                  "\n  max_elem_count:  %zu"
+                  "\n  elem_counts:     %s"
+                  "\n  check:           %s"
+                  "\n  cache:           %d"
+                  "\n  inplace:         %d"
 #ifdef CCL_ENABLE_NUMA
-                  "\n  numa_node:      %s"
+                  "\n  numa_node:       %s"
 #endif // CCL_ENABLE_NUMA
 #ifdef CCL_ENABLE_SYCL
-                  "\n  sycl_dev_type:  %s"
-                  "\n  sycl_root_dev:  %d"
-                  "\n  sycl_mem_type:  %s"
-                  "\n  sycl_usm_type:  %s"
+                  "\n  sycl_dev_type:   %s"
+                  "\n  sycl_root_dev:   %d"
+                  "\n  sycl_mem_type:   %s"
+                  "\n  sycl_usm_type:   %s"
+                  "\n  sycl_queue_type: %s"
 #endif // CCL_ENABLE_SYCL
-                  "\n  collectives:    %s"
-                  "\n  datatypes:      %s"
-                  "\n  reductions:     %s"
-                  "\n  extended info:  %s"
-                  "\n  csv_filepath:   %s",
+                  "\n  collectives:     %s"
+                  "\n  datatypes:       %s"
+                  "\n  reductions:      %s"
+                  "\n  extended info:   %s"
+                  "\n  csv_filepath:    %s",
                   comm.size(),
                   backend_str.c_str(),
                   options.iters,
@@ -862,6 +888,7 @@ void print_user_options(const user_options_t& options, const ccl::communicator&
                   options.sycl_root_dev,
                   sycl_mem_type_str.c_str(),
                   sycl_usm_type_str.c_str(),
+                  sycl_queue_type_str.c_str(),
 #endif // CCL_ENABLE_SYCL
                   collectives_str.c_str(),
                   datatypes_str.c_str(),
diff --git a/examples/benchmark/include/config.hpp b/examples/benchmark/include/config.hpp
index 771dd8c73..8d83922cc 100644
--- a/examples/benchmark/include/config.hpp
+++ b/examples/benchmark/include/config.hpp
@@ -34,23 +34,24 @@
 #else // CCL_ENABLE_SYCL
 #define DEFAULT_BACKEND BACKEND_HOST
 #endif // CCL_ENABLE_SYCL
-#define DEFAULT_ITERS          (16)
-#define DEFAULT_WARMUP_ITERS   (16)
-#define DEFAULT_ITER_POLICY    ITER_POLICY_AUTO
-#define DEFAULT_BUF_COUNT      (1)
-#define DEFAULT_MIN_ELEM_COUNT (1)
-#define DEFAULT_MAX_ELEM_COUNT (128)
-#define DEFAULT_CHECK_VALUES   CHECK_LAST_ITER
-#define DEFAULT_EXT_VALUES     EXT_AUTO
-#define DEFAULT_CACHE_OPS      (1)
-#define DEFAULT_INPLACE        (0)
-#define DEFAULT_RANKS_PER_PROC (1)
-#define DEFAULT_NUMA_NODE      (-1)
-#define DEFAULT_NUMA_NODE_STR  "<default>"
-#define DEFAULT_SYCL_DEV_TYPE  SYCL_DEV_GPU
-#define DEFAULT_SYCL_ROOT_DEV  (0)
-#define DEFAULT_SYCL_MEM_TYPE  SYCL_MEM_USM
-#define DEFAULT_SYCL_USM_TYPE  SYCL_USM_DEVICE
+#define DEFAULT_ITERS           (16)
+#define DEFAULT_WARMUP_ITERS    (16)
+#define DEFAULT_ITER_POLICY     ITER_POLICY_AUTO
+#define DEFAULT_BUF_COUNT       (1)
+#define DEFAULT_MIN_ELEM_COUNT  (1)
+#define DEFAULT_MAX_ELEM_COUNT  (128)
+#define DEFAULT_CHECK_VALUES    CHECK_LAST_ITER
+#define DEFAULT_EXT_VALUES      EXT_AUTO
+#define DEFAULT_CACHE_OPS       (1)
+#define DEFAULT_INPLACE         (0)
+#define DEFAULT_RANKS_PER_PROC  (1)
+#define DEFAULT_NUMA_NODE       (-1)
+#define DEFAULT_NUMA_NODE_STR   "<default>"
+#define DEFAULT_SYCL_DEV_TYPE   SYCL_DEV_GPU
+#define DEFAULT_SYCL_ROOT_DEV   (0)
+#define DEFAULT_SYCL_MEM_TYPE   SYCL_MEM_USM
+#define DEFAULT_SYCL_USM_TYPE   SYCL_USM_DEVICE
+#define DEFAULT_SYCL_QUEUE_TYPE SYCL_QUEUE_OUT_ORDER
 
 #define DEFAULT_COLL_LIST       "allreduce"
 #define DEFAULT_DTYPES_LIST     "float32"
diff --git a/examples/benchmark/include/sycl_coll.hpp b/examples/benchmark/include/sycl_coll.hpp
index 26cec4117..6bb3c09fe 100644
--- a/examples/benchmark/include/sycl_coll.hpp
+++ b/examples/benchmark/include/sycl_coll.hpp
@@ -60,8 +60,17 @@ struct sycl_base_coll : base_coll, private strategy {
                     ASSERT(0, "unexpected bench_alloc_type %d", bench_alloc_type);
 
                 for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+                    size_t multiplier = send_multiplier;
+                    if (strcmp(coll_strategy::class_name(), "allgatherv") == 0 &&
+                        base_coll::get_inplace()) {
+                        // for allgatherv, send_multiplier = 1 and recv_multiplier = comm_size
+                        // since recv_buffer is comm_size*sizeof(send_buffer).
+                        // for inplace, send_buffer is same size as recv_buffer
+                        // TODO: create prepare_internal for allgatherv
+                        multiplier = recv_multiplier;
+                    }
                     send_bufs[idx][rank_idx] = allocator.allocate(
-                        base_coll::get_max_elem_count() * send_multiplier, usm_alloc_type);
+                        base_coll::get_max_elem_count() * multiplier, usm_alloc_type);
 
                     if (base_coll::get_inplace()) {
                         recv_bufs[idx][rank_idx] = send_bufs[idx][rank_idx];
@@ -162,10 +171,22 @@ struct sycl_base_coll : base_coll, private strategy {
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
-                stream.get_native()
-                    .memcpy(send_bufs[b_idx][rank_idx], host_send_buf.data(), send_bytes)
-                    .wait();
-
+                if (strcmp(coll_strategy::class_name(), "allgatherv") == 0 &&
+                    base_coll::get_inplace()) {
+                    // for inplace allgatherv, the input data needs to be at an index comm_rank*send_count
+                    // of the send_buffer rather than at index 0 for the non-inplace case
+                    //  TODO: create prepare_internal for allgatherv
+                    stream.get_native()
+                        .memcpy((char*)(send_bufs[b_idx][rank_idx]) + send_bytes * comm_rank,
+                                host_send_buf.data(),
+                                send_bytes)
+                        .wait();
+                }
+                else {
+                    stream.get_native()
+                        .memcpy(send_bufs[b_idx][rank_idx], host_send_buf.data(), send_bytes)
+                        .wait();
+                }
                 if (!base_coll::get_inplace()) {
                     stream.get_native().memset(recv_bufs[b_idx][rank_idx], -1, recv_bytes).wait();
                 }
diff --git a/examples/benchmark/include/types.hpp b/examples/benchmark/include/types.hpp
index cc73e323e..ccb7b9a0e 100644
--- a/examples/benchmark/include/types.hpp
+++ b/examples/benchmark/include/types.hpp
@@ -40,6 +40,7 @@ typedef enum { EXT_OFF, EXT_AUTO, EXT_ON } ext_values_t;
 typedef enum { SYCL_DEV_HOST, SYCL_DEV_CPU, SYCL_DEV_GPU } sycl_dev_type_t;
 typedef enum { SYCL_MEM_USM, SYCL_MEM_BUF } sycl_mem_type_t;
 typedef enum { SYCL_USM_SHARED, SYCL_USM_DEVICE } sycl_usm_type_t;
+typedef enum { SYCL_QUEUE_OUT_ORDER, SYCL_QUEUE_IN_ORDER } sycl_queue_type_t;
 
 std::map<backend_type_t, std::string> backend_names = { std::make_pair(BACKEND_HOST, "host"),
                                                         std::make_pair(BACKEND_SYCL, "sycl") };
@@ -69,7 +70,12 @@ std::map<sycl_mem_type_t, std::string> sycl_mem_names = { std::make_pair(SYCL_ME
 std::map<sycl_usm_type_t, std::string> sycl_usm_names = { std::make_pair(SYCL_USM_SHARED, "shared"),
                                                           std::make_pair(SYCL_USM_DEVICE,
                                                                          "device") };
-#endif
+
+std::map<sycl_queue_type_t, std::string> sycl_queue_names = {
+    std::make_pair(SYCL_QUEUE_OUT_ORDER, "out_order"),
+    std::make_pair(SYCL_QUEUE_IN_ORDER, "in_order")
+};
+#endif // CCL_ENABLE_SYCL
 
 std::map<ccl::datatype, std::string> dtype_names = {
     std::make_pair(ccl::datatype::int8, "int8"),
@@ -133,6 +139,7 @@ typedef struct user_options_t {
     int sycl_root_dev;
     sycl_mem_type_t sycl_mem_type;
     sycl_usm_type_t sycl_usm_type;
+    sycl_queue_type_t sycl_queue_type;
 #endif // CCL_ENABLE_SYCL
     std::list<std::string> coll_names;
     std::list<std::string> dtypes;
@@ -163,6 +170,7 @@ typedef struct user_options_t {
         sycl_root_dev = DEFAULT_SYCL_ROOT_DEV;
         sycl_mem_type = DEFAULT_SYCL_MEM_TYPE;
         sycl_usm_type = DEFAULT_SYCL_USM_TYPE;
+        sycl_queue_type = DEFAULT_SYCL_QUEUE_TYPE;
 #endif // CCL_ENABLE_SYCL
         coll_names = tokenize<std::string>(DEFAULT_COLL_LIST, ',');
         dtypes = tokenize<std::string>(DEFAULT_DTYPES_LIST, ',');
diff --git a/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp b/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
index ca8bbd9f7..5f439b5ad 100644
--- a/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
+++ b/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
@@ -68,7 +68,7 @@ struct sycl_allgatherv_coll : sycl_base_coll<Dtype, allgatherv_strategy_impl> {
             Dtype value;
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                 value = host_send_buf[e_idx];
-                if (value != sbuf_expected) {
+                if (!base_coll::get_inplace() && (value != sbuf_expected)) {
                     std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
                               << rank_idx << ", elem_idx " << e_idx << ", expected "
                               << sbuf_expected << ", got " << value << std::endl;
diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
index 6e20cbd98..34613c0a0 100644
--- a/examples/benchmark/src/benchmark.cpp
+++ b/examples/benchmark/src/benchmark.cpp
@@ -138,12 +138,14 @@ void run(ccl::communicator& service_comm,
 
                             double coll_start_time = when();
                             for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) {
-                                match_id_stream << "coll_" << coll->name() << "_" << coll_idx
-                                                << "_count_" << count << "_buf_" << buf_idx
-                                                << "_dt_" << dtype_name << "_rt_" << reduction;
-                                bench_attr.set<ccl::operation_attr_id::match_id>(
-                                    ccl::string_class(match_id_stream.str()));
-                                match_id_stream.str("");
+                                if (options.cache_ops) {
+                                    match_id_stream << "coll_" << coll->name() << "_" << coll_idx
+                                                    << "_count_" << count << "_buf_" << buf_idx
+                                                    << "_dt_" << dtype_name << "_rt_" << reduction;
+                                    bench_attr.set<ccl::operation_attr_id::match_id>(
+                                        ccl::string_class(match_id_stream.str()));
+                                    match_id_stream.str("");
+                                }
                                 coll->start(count, buf_idx, bench_attr, reqs);
                             }
                             double coll_end_time = when();
@@ -172,12 +174,14 @@ void run(ccl::communicator& service_comm,
                             prepare_coll(options, service_comm, coll, count);
 
                             for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) {
-                                match_id_stream << "coll_" << coll->name() << "_" << coll_idx
-                                                << "_count_" << count << "_buf_" << buf_idx
-                                                << "_dt_" << dtype_name << "_rt_" << reduction;
-                                bench_attr.set<ccl::operation_attr_id::match_id>(
-                                    ccl::string_class(match_id_stream.str()));
-                                match_id_stream.str("");
+                                if (options.cache_ops) {
+                                    match_id_stream << "coll_" << coll->name() << "_" << coll_idx
+                                                    << "_count_" << count << "_buf_" << buf_idx
+                                                    << "_dt_" << dtype_name << "_rt_" << reduction;
+                                    bench_attr.set<ccl::operation_attr_id::match_id>(
+                                        ccl::string_class(match_id_stream.str()));
+                                    match_id_stream.str("");
+                                }
                                 coll->start(count, buf_idx, bench_attr, reqs);
                             }
 
diff --git a/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp b/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
index b8b8b41ab..776d7e888 100644
--- a/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
+++ b/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
@@ -69,7 +69,7 @@ struct sycl_reduce_scatter_coll : sycl_base_coll<Dtype, reduce_scatter_strategy_
 
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
                 Dtype value = host_send_buf[e_idx];
-                if (value != sbuf_expected) {
+                if (!base_coll::get_inplace() && value != sbuf_expected) {
                     std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
                               << rank_idx << ", elem_idx " << e_idx << ", expected "
                               << sbuf_expected << ", got " << value << std::endl;
diff --git a/examples/benchmark/src/transport_impl.hpp b/examples/benchmark/src/transport_impl.hpp
index 23b587338..38ded2b6a 100644
--- a/examples/benchmark/src/transport_impl.hpp
+++ b/examples/benchmark/src/transport_impl.hpp
@@ -113,8 +113,14 @@ void transport_data::init_comms(user_options_t& options) {
     else if (options.backend == BACKEND_SYCL) {
         show_extened_info(options.show_additional_info);
 
+        // empty props means out-of-order sycl queue
+        sycl::property_list props{};
+        if (options.sycl_queue_type) {
+            props = { sycl::property::queue::in_order{},
+                      sycl::property::queue::enable_profiling{} };
+        }
         auto sycl_queues = create_sycl_queues(
-            sycl_dev_names[options.sycl_dev_type], proc_ranks, options.sycl_root_dev);
+            sycl_dev_names[options.sycl_dev_type], proc_ranks, options.sycl_root_dev, props);
         ASSERT(!sycl_queues.empty(), "queues should contain at least one queue");
         ASSERT(static_cast<size_t>(ranks_per_proc) == sycl_queues.size(),
                "ranks and queues sizes should match");
diff --git a/examples/include/store.hpp b/examples/include/store.hpp
index b6c0275be..aa9d7d220 100644
--- a/examples/include/store.hpp
+++ b/examples/include/store.hpp
@@ -41,6 +41,8 @@ class base_store {
 
 class file_store : public base_store {
 public:
+    file_store(const file_store& other) = delete;
+    file_store& operator=(const file_store& other) = delete;
     file_store(std::string path, int rank, const std::chrono::seconds& timeout)
             : base_store(),
               path(path),
@@ -175,4 +177,4 @@ class file_store : public base_store {
     int fd;
     std::chrono::seconds timeout;
     std::mutex mtx;
-};
+};
diff --git a/examples/include/sycl_base.hpp b/examples/include/sycl_base.hpp
index 821f68455..ae4403961 100644
--- a/examples/include/sycl_base.hpp
+++ b/examples/include/sycl_base.hpp
@@ -221,9 +221,11 @@ inline std::vector<sycl::device> create_sycl_gpu_devices(bool select_root_device
                           part_props.end(),
                           info::partition_property::partition_by_affinity_domain) ==
                 part_props.end()) {
-                ss_warn << prefix << "device [" << device_name
-                        << "] does not support partition by affinity domain"
-                        << ", use root device\n";
+                // ZE_FLAT_DEVICE_HIERARCHY=FLAT is by default now, meaning that
+                // tile is a root device, the warning is extra in this case
+                // ss_warn << prefix << "device [" << device_name
+                //         << "] does not support partition by affinity domain"
+                //         << ", use root device\n";
                 result.push_back(device);
                 continue;
             }
@@ -436,8 +438,12 @@ inline bool create_sycl_queue(const std::string& type,
     }
 }
 
-inline bool create_sycl_queue(int argc, char* argv[], int rank, queue& q) {
-    return create_sycl_queue(((argc >= 2) ? argv[1] : "unknown"), rank, q, {});
+inline bool create_sycl_queue(int argc,
+                              char* argv[],
+                              int rank,
+                              queue& q,
+                              const sycl::property_list& queue_props = {}) {
+    return create_sycl_queue(((argc >= 2) ? argv[1] : "unknown"), rank, q, queue_props);
 }
 
 inline bool handle_exception(queue& q) {
@@ -488,6 +494,7 @@ struct buf_allocator {
 
     buf_allocator(queue& q) : q(q) {}
 
+    buf_allocator& operator=(const buf_allocator&) = delete;
     buf_allocator(const buf_allocator&) = delete;
     buf_allocator(buf_allocator&&) = default;
 
diff --git a/examples/pt2pt/include/pt2pt_base.hpp b/examples/pt2pt/include/pt2pt_base.hpp
index cf1ff11f9..70899e136 100644
--- a/examples/pt2pt/include/pt2pt_base.hpp
+++ b/examples/pt2pt/include/pt2pt_base.hpp
@@ -24,19 +24,26 @@
 #include "types.hpp"
 
 typedef struct user_options_t {
+    backend_type_t backend;
     uint32_t cache;
     uint32_t iters;
 
     std::vector<int> peers;
     uint32_t queue;
-    int min_elem_count;
-    int max_elem_count;
+    size_t min_elem_count;
+    size_t max_elem_count;
+    std::list<size_t> elem_counts;
     validate_values_t validate;
     uint32_t warmup_iters;
     uint32_t wait;
     int window_size;
 
+    bool min_elem_count_set;
+    bool max_elem_count_set;
+    bool elem_counts_set;
+
     user_options_t() {
+        backend = DEFAULT_BACKEND;
         iters = DEFAULT_ITERS;
         warmup_iters = DEFAULT_WARMUP_ITERS;
         cache = DEFAULT_CACHE_OPS;
@@ -44,6 +51,7 @@ typedef struct user_options_t {
         wait = DEFAULT_WAIT;
         min_elem_count = DEFAULT_MIN_ELEM_COUNT;
         max_elem_count = DEFAULT_MAX_ELEM_COUNT;
+        fill_elem_counts(elem_counts, min_elem_count, max_elem_count);
         validate = DEFAULT_VALIDATE;
         // for bw benchmark
         window_size = DEFAULT_WINDOW_SIZE;
@@ -52,18 +60,82 @@ typedef struct user_options_t {
         // filling out with the default values
         peers.push_back(0);
         peers.push_back(1);
+
+        min_elem_count_set = false;
+        max_elem_count_set = false;
+        elem_counts_set = false;
     }
 } user_options_t;
 
+void adjust_elem_counts(user_options_t& options) {
+    if (options.max_elem_count < options.min_elem_count) {
+        options.max_elem_count = options.min_elem_count;
+    }
+
+    if (options.elem_counts_set) {
+        /* adjust min/max_elem_count or elem_counts */
+        if (options.min_elem_count_set) {
+            /* apply user-supplied count as limiter */
+            options.elem_counts.remove_if([&options](const size_t& count) {
+                return (count < options.min_elem_count);
+            });
+        }
+        else {
+            if (options.elem_counts.empty()) {
+                options.min_elem_count = DEFAULT_MIN_ELEM_COUNT;
+            }
+            else {
+                options.min_elem_count =
+                    *(std::min_element(options.elem_counts.begin(), options.elem_counts.end()));
+            }
+        }
+        if (options.max_elem_count_set) {
+            /* apply user-supplied count as limiter */
+            options.elem_counts.remove_if([&options](const size_t& count) {
+                return (count > options.max_elem_count);
+            });
+        }
+        else {
+            if (options.elem_counts.empty()) {
+                options.max_elem_count = options.min_elem_count;
+            }
+            else {
+                options.max_elem_count =
+                    *(std::max_element(options.elem_counts.begin(), options.elem_counts.end()));
+            }
+        }
+    }
+    else {
+        fill_elem_counts(options.elem_counts, options.min_elem_count, options.max_elem_count);
+    }
+}
+
+int set_backend(const std::string& option_value, backend_type_t& backend) {
+    std::string option_name = "backend";
+    std::set<std::string> supported_option_values{ backend_names[BACKEND_CPU] };
+
+#ifdef CCL_ENABLE_SYCL
+    supported_option_values.insert(backend_names[BACKEND_GPU]);
+#endif // CCL_ENABLE_SYCL
+
+    if (check_supported_options(option_name, option_value, supported_option_values))
+        return -1;
+
+    backend = (option_value == backend_names[BACKEND_GPU]) ? BACKEND_GPU : BACKEND_CPU;
+    return 0;
+}
+
 int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
     int ch;
     int errors = 0;
+    std::list<int> elem_counts_int;
 
     char short_options[1024] = { 0 };
-    const char* base_options = "i:w:c:q:s:f:t:v:m:h";
+    const char* base_options = "b:i:w:c:q:s:f:t:y:v:m:h";
     memcpy(short_options, base_options, strlen(base_options));
 
     struct option getopt_options[] = {
+        { "backend", required_argument, nullptr, 'b' },
         { "iters", required_argument, nullptr, 'i' },
         { "warmup_iters", required_argument, nullptr, 'w' },
         { "cache", required_argument, nullptr, 'c' },
@@ -71,6 +143,7 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
         { "wait", required_argument, nullptr, 's' },
         { "min_elem_count", required_argument, nullptr, 'f' },
         { "max_elem_count", required_argument, nullptr, 't' },
+        { "elem_counts", required_argument, nullptr, 'y' },
         { "validate", required_argument, nullptr, 'v' },
         { "window", required_argument, nullptr, 'm' },
         { "help", no_argument, nullptr, 'h' },
@@ -80,6 +153,12 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
 
     while ((ch = getopt_long(argc, argv, short_options, getopt_options, nullptr)) != -1) {
         switch (ch) {
+            case 'b':
+                if (set_backend(optarg, options.backend)) {
+                    PRINT("failed to parse 'backend' option");
+                    errors++;
+                }
+                break;
             case 'i':
                 if (is_valid_integer_option(optarg)) {
                     options.iters = atoll(optarg);
@@ -111,6 +190,7 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
             case 'f':
                 if (is_valid_integer_option(optarg)) {
                     options.min_elem_count = atoll(optarg);
+                    options.min_elem_count_set = true;
                 }
                 else
                     errors++;
@@ -118,13 +198,30 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
             case 't':
                 if (is_valid_integer_option(optarg)) {
                     options.max_elem_count = atoll(optarg);
+                    options.max_elem_count_set = true;
                 }
                 else
                     errors++;
                 break;
+            case 'y':
+                elem_counts_int = tokenize<int>(optarg, ',');
+                elem_counts_int.remove_if([](const int& count) {
+                    return !is_valid_integer_option(count);
+                });
+                options.elem_counts = tokenize<size_t>(optarg, ',');
+                if (elem_counts_int.size() == options.elem_counts.size())
+                    options.elem_counts_set = true;
+                else
+                    errors++;
+                break;
             case 's':
                 if (is_valid_integer_option(optarg)) {
                     options.wait = atoll(optarg);
+                    if (options.wait == 0) {
+                        PRINT(
+                            "Warning: Non-blocking mode is not supported, fallback to blocking mode");
+                        options.wait = 1;
+                    }
                 }
                 else
                     errors++;
@@ -162,10 +259,15 @@ int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
         }
         return -1;
     }
+
+    adjust_elem_counts(options);
+
     return 0;
 }
 
-auto create_attr(const bool is_cache, const int count, const std::string& match_id_suffix) {
+ccl::pt2pt_attr create_attr(const bool is_cache,
+                            const int count,
+                            const std::string& match_id_suffix) {
     auto attr = ccl::create_operation_attr<ccl::pt2pt_attr>();
     if (is_cache) {
         std::string matchId = "_len_" + std::to_string(count) + match_id_suffix;
@@ -198,18 +300,46 @@ void print_timings(ccl::communicator& comm,
     std::cout << ss.str();
 }
 
-template <class Dtype>
-void check_buffers(sycl::queue q,
-                   const user_options_t& options,
-                   const int count,
-                   const size_t iter_idx,
-                   Dtype buf_recv) {
+void check_cpu_buffers(const int count, const size_t iter_idx, std::vector<int>& buf_recv) {
+    bool failed = false;
+    std::vector<int> check_buf(count);
+
+    for (auto id = 0; id < count; id++) {
+        if (buf_recv[id] != static_cast<int>(id + iter_idx)) {
+            check_buf[id] = INVALID_VALUE;
+        }
+    }
+
+    for (int j = 0; j < count; j++) {
+        if (check_buf[j] == INVALID_VALUE) {
+            failed = true;
+            break;
+        }
+    }
+
+    if (failed) {
+        std::cout << "FAILED: iter_idx: " << iter_idx << ", count: " << count << std::endl;
+        ASSERT(0, "unexpected value");
+    }
+}
+
+#ifdef CCL_ENABLE_SYCL
+void check_gpu_buffers(sycl::queue q,
+                       const user_options_t& options,
+                       const int count,
+                       const size_t iter_idx,
+                       int* buf_recv,
+                       std::vector<ccl::event>& ccl_events) {
     bool failed = false;
     sycl::buffer<int> check_buf(count);
 
     auto e = q.submit([&](auto& h) {
         sycl::accessor check_buf_acc(check_buf, h, sycl::write_only);
-        h.parallel_for(count, [=](auto id) {
+        if (!options.queue && !options.wait) {
+            h.depends_on(ccl_events.back().get_native());
+        }
+        // check_buf_acc moved, not used any more
+        h.parallel_for(count, [=, check_buf_acc = std::move(check_buf_acc)](auto id) {
             if (buf_recv[id] != static_cast<int>(id + iter_idx)) {
                 check_buf_acc[id] = INVALID_VALUE;
             }
@@ -235,11 +365,13 @@ void check_buffers(sycl::queue q,
         ASSERT(0, "unexpected value");
     }
 }
+#endif // CCL_ENABLE_SYCL
 
 void print_help_usage(const char* app) {
     PRINT("\nUSAGE:\n"
           "\t%s [OPTIONS]\n\n"
           "OPTIONS:\n"
+          "\t[-b,--backend <backend>]: %s\n"
           "\t[-i,--iters <iteration count>]: %d\n"
           "\t[-w,--warmup_iters <warm up iteration count>]: %d\n"
           "\t[-c,--cache <use persistent operations>]: %d\n"
@@ -247,10 +379,12 @@ void print_help_usage(const char* app) {
           "\t[-s,--wait <enable synchronization on sycl and pt2pt level>]: %d\n"
           "\t[-f,--min_elem_count <minimum element count>]: %d\n"
           "\t[-t,--max_elem_count <maximum element count>]: %d\n"
+          "\t[-y,--elem_counts <list of element counts>]: [%d-%d]\n"
           "\t[-v,--validate <validate result correctness>]: %s\n"
           "\t[-h,--help]\n\n"
-          "example:\n\t--queue 1 --cache 0 --validate 1\n",
+          "example:\n\t--backend gpu --queue 1 --cache 0 --validate 1 --elem_counts 64,1024\n",
           app,
+          backend_names[DEFAULT_BACKEND].c_str(),
           DEFAULT_ITERS,
           DEFAULT_WARMUP_ITERS,
           DEFAULT_CACHE_OPS,
@@ -258,21 +392,53 @@ void print_help_usage(const char* app) {
           DEFAULT_WAIT,
           DEFAULT_MIN_ELEM_COUNT,
           DEFAULT_MAX_ELEM_COUNT,
+          // elem_counts requires 2 values, min and max
+          DEFAULT_MIN_ELEM_COUNT,
+          DEFAULT_MAX_ELEM_COUNT,
           validate_values_names[DEFAULT_VALIDATE].c_str());
 }
 
+template <class Dtype, class Iter>
+std::string get_values_str(Iter first,
+                           Iter last,
+                           const char* opening,
+                           const char* ending,
+                           const char* delim) {
+    std::stringstream ss;
+    ss.str("");
+    ss << opening;
+    std::copy(first, last, std::ostream_iterator<Dtype>(ss, delim));
+    if (*ss.str().rbegin() == ' ') {
+        ss.seekp(-1, std::ios_base::end);
+    }
+    ss << ending;
+
+    return ss.str();
+}
+
 void print_user_options(const std::string benchmark,
                         const user_options_t& options,
                         const ccl::communicator& comm) {
     std::stringstream ss;
 
+    std::string backend_str = find_str_val(backend_names, options.backend);
+
+    std::string elem_counts_str = get_values_str<size_t>(
+        options.elem_counts.begin(), options.elem_counts.end(), "[", "]", " ");
+
     std::string validate_values_str = find_str_val(validate_values_names, options.validate);
 
+    ss.str("");
     ss << "\noptions:"
-       << "\n  iters:          " << options.iters << "\n  warmup_iters:   " << options.warmup_iters
-       << "\n  cache:          " << options.cache << "\n  queue:          " << options.queue
-       << "\n  wait:           " << options.wait << "\n  min_elem_count: " << options.min_elem_count
+       << "\n  backend:        " << backend_str << "\n  iters:          " << options.iters
+       << "\n  warmup_iters:   " << options.warmup_iters << "\n  cache:          " << options.cache;
+    if (options.backend == BACKEND_GPU) {
+        ss << "\n  queue:          " << options.queue << "\n  wait:           " << options.wait;
+    }
+
+    ss << "\n  min_elem_count: " << options.min_elem_count
        << "\n  max_elem_count: " << options.max_elem_count
+       << "\n  elem_counts:    " << elem_counts_str
        << "\n  validate:       " << validate_values_str;
 
     if (benchmark == "Bandwidth") {
diff --git a/examples/pt2pt/include/pt2pt_transport.hpp b/examples/pt2pt/include/pt2pt_transport.hpp
index e5e6c367e..d2921a2bd 100644
--- a/examples/pt2pt/include/pt2pt_transport.hpp
+++ b/examples/pt2pt/include/pt2pt_transport.hpp
@@ -19,11 +19,15 @@
 #include <vector>
 
 #include "oneapi/ccl.hpp"
+#ifdef CCL_ENABLE_SYCL
 #include "sycl_base.hpp"
+#endif // CCL_ENABLE_SYCL
 #include "pt2pt_base.hpp"
 
 class transport_data {
 public:
+    transport_data(const transport_data& other) = delete;
+    transport_data& operator=(const transport_data& other) = delete;
     static transport_data& instance();
     static size_t get_comm_size();
 
@@ -36,10 +40,12 @@ class transport_data {
     std::vector<ccl::communicator>& get_comms();
     void reset_comms();
 
+#ifdef CCL_ENABLE_SYCL
     std::vector<ccl::stream>& get_streams();
 
     void create_sycl_queue(user_options_t& options);
     sycl::queue get_sycl_queue();
+#endif // CCL_ENABLE_SYCL
 
 private:
     transport_data();
@@ -53,8 +59,10 @@ class transport_data {
     ccl::shared_ptr_class<ccl::kvs> kvs;
     std::vector<ccl::communicator> comms;
 
+#ifdef CCL_ENABLE_SYCL
     std::vector<ccl::stream> streams;
     sycl::queue queue;
+#endif // CCL_ENABLE_SYCL
 
     void init_by_mpi();
     void deinit_by_mpi();
@@ -114,28 +122,40 @@ void transport_data::deinit_by_mpi() {
     MPI_Finalize();
 }
 
+#ifdef CCL_ENABLE_SYCL
 std::vector<ccl::stream>& transport_data::get_streams() {
     return streams;
 }
+#endif // CCL_ENABLE_SYCL
 
 void transport_data::init_comms(user_options_t& options) {
-    create_sycl_queue(options);
+#ifdef CCL_ENABLE_SYCL
+    if (options.backend == BACKEND_GPU) {
+        create_sycl_queue(options);
 
-    auto q = get_sycl_queue();
+        auto q = get_sycl_queue();
 
-    // create communicator
-    auto dev = ccl::create_device(q.get_device());
-    auto ctx = ccl::create_context(q.get_context());
-    comms.push_back(ccl::create_communicator(size, rank, dev, ctx, kvs));
+        // create communicator
+        auto dev = ccl::create_device(q.get_device());
+        auto ctx = ccl::create_context(q.get_context());
+        comms.push_back(ccl::create_communicator(size, rank, dev, ctx, kvs));
 
-    // create stream
-    streams.push_back(ccl::create_stream(q));
+        // create stream
+        streams.push_back(ccl::create_stream(q));
+    }
+    else {
+#endif // CCL_ENABLE_SYCL
+        comms.push_back(ccl::create_communicator(size, rank, kvs));
+#ifdef CCL_ENABLE_SYCL
+    }
+#endif // CCL_ENABLE_SYCL
 }
 
 std::vector<ccl::communicator>& transport_data::get_comms() {
     return comms;
 }
 
+#ifdef CCL_ENABLE_SYCL
 void transport_data::create_sycl_queue(user_options_t& options) {
     sycl::property_list props{};
     if (options.queue) {
@@ -150,8 +170,11 @@ void transport_data::create_sycl_queue(user_options_t& options) {
 sycl::queue transport_data::get_sycl_queue() {
     return queue;
 }
+#endif // CCL_ENABLE_SYCL
 
 void transport_data::reset_comms() {
     comms.clear();
+#ifdef CCL_ENABLE_SYCL
     streams.clear();
+#endif // CCL_ENABLE_SYCL
 }
diff --git a/examples/pt2pt/include/types.hpp b/examples/pt2pt/include/types.hpp
index 6e78a6648..f4ca83e9e 100644
--- a/examples/pt2pt/include/types.hpp
+++ b/examples/pt2pt/include/types.hpp
@@ -24,7 +24,16 @@ std::map<validate_values_t, std::string> validate_values_names = {
     std::make_pair(VALIDATE_ALL_ITERS, "all")
 };
 
+typedef enum { BACKEND_CPU, BACKEND_GPU } backend_type_t;
+std::map<backend_type_t, std::string> backend_names = { std::make_pair(BACKEND_CPU, "cpu"),
+                                                        std::make_pair(BACKEND_GPU, "gpu") };
+
 // defines
+#ifdef CCL_ENABLE_SYCL
+#define DEFAULT_BACKEND BACKEND_GPU
+#else // CCL_ENABLE_SYCL
+#define DEFAULT_BACKEND BACKEND_CPU
+#endif // CCL_ENABLE_SYCL
 #define COL_WIDTH     (18)
 #define COL_PRECISION (2)
 
@@ -66,3 +75,29 @@ int set_validate_values(const std::string& option_value, validate_values_t& vali
 
     return 0;
 }
+
+template <typename T>
+std::list<T> tokenize(const std::string& input, char delimiter) {
+    std::istringstream ss(input);
+    std::list<T> ret;
+    std::string str;
+    while (std::getline(ss, str, delimiter)) {
+        std::stringstream converter;
+        converter << str;
+        T value;
+        converter >> value;
+        ret.push_back(value);
+    }
+    return ret;
+}
+
+void fill_elem_counts(std::list<size_t>& counts, const size_t min_count, const size_t max_count) {
+    counts.clear();
+    size_t count = 0;
+    for (count = min_count; count <= max_count; count = (count ? count * 2 : 1)) {
+        counts.push_back(count);
+    }
+    if (counts.empty() || *counts.rbegin() != max_count) {
+        counts.push_back(max_count);
+    }
+}
diff --git a/examples/pt2pt/src/ccl_bw.cpp b/examples/pt2pt/src/ccl_bw.cpp
index b74a064d3..d4293ef52 100644
--- a/examples/pt2pt/src/ccl_bw.cpp
+++ b/examples/pt2pt/src/ccl_bw.cpp
@@ -14,34 +14,31 @@
  limitations under the License.
 */
 #include "base.hpp"
+#ifdef CCL_ENABLE_SYCL
 #include "sycl_base.hpp"
+#endif // CCL_ENABLE_SYCL
 #include "pt2pt_transport.hpp"
 
 #include "oneapi/ccl.hpp"
 
-int main(int argc, char* argv[]) {
-    user_options_t options;
-
-    if (parse_user_options(argc, argv, options)) {
-        print_help_usage(argv[0]);
-        exit(INVALID_RETURN);
-    }
-
+void run_gpu_backend(user_options_t& options) {
+#ifdef CCL_ENABLE_SYCL
     auto& transport = transport_data::instance();
     transport.init_comms(options);
 
-    auto q = transport.get_sycl_queue();
     auto rank = transport.get_rank();
     auto& comms = transport.get_comms();
-    auto streams = transport.get_streams();
 
     print_user_options("Bandwidth", options, comms[0]);
 
     double start_t = 0.0, end_t = 0.0, diff_t = 0.0;
     size_t dtype_size = sizeof(ccl::datatype::int32);
 
-    for (int count = options.min_elem_count; count <= options.max_elem_count;
-         count = (count ? count * 2 : 1)) {
+    auto q = transport.get_sycl_queue();
+    auto streams = transport.get_streams();
+    std::vector<ccl::event> ccl_events{};
+
+    for (auto& count : options.elem_counts) {
         auto buf_send = sycl::malloc_device<int>(count, q);
         auto buf_recv = sycl::malloc_device<int>(count, q);
 
@@ -82,6 +79,7 @@ int main(int argc, char* argv[]) {
                     if (options.wait) {
                         send_event.wait();
                     }
+                    ccl_events.emplace_back(std::move(send_event));
                 }
 
                 auto recv_event = ccl::recv(buf_recv,
@@ -94,6 +92,7 @@ int main(int argc, char* argv[]) {
                 if (options.wait) {
                     recv_event.wait();
                 }
+                ccl_events.emplace_back(std::move(recv_event));
 
                 end_t = MPI_Wtime();
                 diff_t = end_t - start_t;
@@ -128,6 +127,7 @@ int main(int argc, char* argv[]) {
                     if (options.wait) {
                         recv_event.wait();
                     }
+                    ccl_events.emplace_back(std::move(recv_event));
                 }
 
                 // we can send 1 count here, this pair is for aligning
@@ -142,10 +142,12 @@ int main(int argc, char* argv[]) {
                 if (options.wait) {
                     send_event.wait();
                 }
+                ccl_events.emplace_back(std::move(send_event));
+
                 if (options.validate == VALIDATE_ALL_ITERS ||
                     (options.validate == VALIDATE_LAST_ITER &&
                      iter_idx == (options.warmup_iters + options.iters) - 1)) {
-                    check_buffers(q, options, count, iter_idx, buf_recv);
+                    check_gpu_buffers(q, options, count, iter_idx, buf_recv, ccl_events);
                 }
             }
         }
@@ -163,6 +165,116 @@ int main(int argc, char* argv[]) {
     PRINT_BY_ROOT(comms[0], "\n# All done\n");
 
     transport.reset_comms();
+#endif // CCL_ENABLE_SYCL
+}
+
+void run_cpu_backend(user_options_t& options) {
+    auto& transport = transport_data::instance();
+    transport.init_comms(options);
+
+    auto rank = transport.get_rank();
+    auto& comms = transport.get_comms();
+
+    print_user_options("Bandwidth", options, comms[0]);
+
+    double start_t = 0.0, end_t = 0.0, diff_t = 0.0;
+    size_t dtype_size = sizeof(ccl::datatype::int32);
+
+    std::vector<int> buf_send;
+    std::vector<int> buf_recv;
+
+    for (auto& count : options.elem_counts) {
+        buf_send.resize(count);
+        buf_recv.resize(count);
+
+        if (rank == options.peers[0]) {
+            for (size_t iter_idx = 0; iter_idx < (options.warmup_iters + options.iters);
+                 iter_idx++) {
+                // init the buffer
+                for (size_t id = 0; id < count; id++) {
+                    buf_send[id] = id + iter_idx;
+                    buf_recv[id] = INVALID_VALUE;
+                }
+
+                if (iter_idx == options.warmup_iters) {
+                    ccl::barrier(comms[0]);
+                    start_t = MPI_Wtime();
+                }
+
+                for (int j = 0; j < options.window_size; j++) {
+                    ccl::send(
+                        buf_send.data(), count, ccl::datatype::int32, options.peers[1], comms[0])
+                        .wait();
+                }
+
+                ccl::recv(buf_recv.data(), 1, ccl::datatype::int32, options.peers[1], comms[0])
+                    .wait();
+
+                end_t = MPI_Wtime();
+                diff_t = end_t - start_t;
+            }
+        }
+        else if (rank == options.peers[1]) {
+            for (size_t iter_idx = 0; iter_idx < (options.warmup_iters + options.iters);
+                 iter_idx++) {
+                // init the buffer
+                for (size_t id = 0; id < count; id++) {
+                    buf_send[id] = id + iter_idx;
+                    buf_recv[id] = INVALID_VALUE;
+                }
+
+                if (iter_idx == options.warmup_iters) {
+                    ccl::barrier(comms[0]);
+                }
+
+                for (int j = 0; j < options.window_size; j++) {
+                    ccl::recv(
+                        buf_recv.data(), count, ccl::datatype::int32, options.peers[0], comms[0])
+                        .wait();
+                }
+
+                // we can send 1 count here, this pair is for aligning
+                // no need a big count
+                ccl::send(buf_send.data(), 1, ccl::datatype::int32, options.peers[0], comms[0])
+                    .wait();
+
+                if (options.validate == VALIDATE_ALL_ITERS ||
+                    (options.validate == VALIDATE_LAST_ITER &&
+                     iter_idx == (options.warmup_iters + options.iters) - 1)) {
+                    check_cpu_buffers(count, iter_idx, buf_recv);
+                }
+            }
+        }
+
+        if (rank == options.peers[0]) {
+            double bandwidth_t =
+                (count * dtype_size / 1e6 * options.iters * options.window_size) / diff_t;
+            print_timings(comms[0], options, bandwidth_t, count * dtype_size, "Mbytes/sec");
+        }
+
+        buf_send.clear();
+        buf_recv.clear();
+    }
+
+    PRINT_BY_ROOT(comms[0], "\n# All done\n");
+
+    transport.reset_comms();
+}
+
+int main(int argc, char* argv[]) {
+    user_options_t options;
+
+    if (parse_user_options(argc, argv, options)) {
+        print_help_usage(argv[0]);
+        exit(INVALID_RETURN);
+    }
+
+    if (options.backend == BACKEND_GPU) {
+        run_gpu_backend(options);
+    }
+    else {
+        run_cpu_backend(options);
+    }
 
     return 0;
 }
diff --git a/examples/pt2pt/src/ccl_latency.cpp b/examples/pt2pt/src/ccl_latency.cpp
index 47fc756b6..3bf9c1405 100644
--- a/examples/pt2pt/src/ccl_latency.cpp
+++ b/examples/pt2pt/src/ccl_latency.cpp
@@ -14,33 +14,30 @@
  limitations under the License.
 */
 #include "base.hpp"
+#ifdef CCL_ENABLE_SYCL
 #include "sycl_base.hpp"
+#endif // CCL_ENABLE_SYCL
 #include "pt2pt_transport.hpp"
 
 #include "oneapi/ccl.hpp"
 
-int main(int argc, char* argv[]) {
-    user_options_t options;
-
-    if (parse_user_options(argc, argv, options)) {
-        print_help_usage(argv[0]);
-        exit(INVALID_RETURN);
-    }
-
+void run_gpu_backend(user_options_t& options) {
+#ifdef CCL_ENABLE_SYCL
     auto& transport = transport_data::instance();
     transport.init_comms(options);
 
-    auto q = transport.get_sycl_queue();
     auto rank = transport.get_rank();
     auto& comms = transport.get_comms();
-    auto streams = transport.get_streams();
 
     print_user_options("Latency", options, comms[0]);
 
     size_t dtype_size = sizeof(ccl::datatype::int32);
 
-    for (int count = options.min_elem_count; count <= options.max_elem_count;
-         count = (count ? count * 2 : 1)) {
+    auto q = transport.get_sycl_queue();
+    auto streams = transport.get_streams();
+    std::vector<ccl::event> ccl_events{};
+
+    for (auto& count : options.elem_counts) {
         double start_t = 0.0, end_t = 0.0, diff_t = 0.0, total_latency_t = 0.0;
 
         // create buffers
@@ -88,6 +85,7 @@ int main(int argc, char* argv[]) {
                 if (options.wait) {
                     send_event.wait();
                 }
+                ccl_events.emplace_back(std::move(send_event));
 
                 auto recv_event = ccl::recv(buf_recv,
                                             count,
@@ -99,6 +97,7 @@ int main(int argc, char* argv[]) {
                 if (options.wait) {
                     recv_event.wait();
                 }
+                ccl_events.emplace_back(std::move(recv_event));
 
                 if (iter_idx >= options.warmup_iters) {
                     end_t = MPI_Wtime();
@@ -117,6 +116,7 @@ int main(int argc, char* argv[]) {
                 if (options.wait) {
                     recv_event.wait();
                 }
+                ccl_events.emplace_back(std::move(recv_event));
 
                 auto send_event = ccl::send(buf_send,
                                             count,
@@ -128,13 +128,14 @@ int main(int argc, char* argv[]) {
                 if (options.wait) {
                     send_event.wait();
                 }
+                ccl_events.emplace_back(std::move(send_event));
             }
 
             if (options.validate == VALIDATE_ALL_ITERS ||
                 (options.validate == VALIDATE_LAST_ITER &&
                  iter_idx == (options.warmup_iters + options.iters) - 1)) {
                 ccl::barrier(comms[0]);
-                check_buffers(q, options, count, iter_idx, buf_recv);
+                check_gpu_buffers(q, options, count, iter_idx, buf_recv, ccl_events);
             }
         }
 
@@ -151,6 +152,106 @@ int main(int argc, char* argv[]) {
     PRINT_BY_ROOT(comms[0], "\n# All done\n");
 
     transport.reset_comms();
+#endif // CCL_ENABLE_SYCL
+}
+
+void run_cpu_backend(user_options_t& options) {
+    auto& transport = transport_data::instance();
+    transport.init_comms(options);
+
+    auto rank = transport.get_rank();
+    auto& comms = transport.get_comms();
+
+    print_user_options("Latency", options, comms[0]);
+
+    size_t dtype_size = sizeof(ccl::datatype::int32);
+
+    std::vector<int> buf_send;
+    std::vector<int> buf_recv;
+
+    for (auto& count : options.elem_counts) {
+        double start_t = 0.0, end_t = 0.0, diff_t = 0.0, total_latency_t = 0.0;
+
+        // create buffers
+        buf_send.resize(count);
+        buf_recv.resize(count);
+
+        for (size_t iter_idx = 0; iter_idx < (options.warmup_iters + options.iters); iter_idx++) {
+            // init the buffer
+            for (size_t id = 0; id < count; id++) {
+                buf_send[id] = id + iter_idx;
+                buf_recv[id] = INVALID_VALUE;
+            }
+
+            if (iter_idx == options.warmup_iters - 1) {
+                // to ensure that all processes or threads have reached
+                // a certain synchronization point before proceeding time
+                // calculation
+                ccl::barrier(comms[0]);
+            }
+
+            if (rank == options.peers[0]) {
+                if (iter_idx >= options.warmup_iters) {
+                    start_t = MPI_Wtime();
+                }
+
+                ccl::send(buf_send.data(), count, ccl::datatype::int32, options.peers[1], comms[0])
+                    .wait();
+
+                ccl::recv(buf_recv.data(), count, ccl::datatype::int32, options.peers[1], comms[0])
+                    .wait();
+
+                if (iter_idx >= options.warmup_iters) {
+                    end_t = MPI_Wtime();
+                    diff_t = end_t - start_t;
+                    total_latency_t += diff_t;
+                }
+            }
+            else if (rank == options.peers[1]) {
+                ccl::recv(buf_recv.data(), count, ccl::datatype::int32, options.peers[0], comms[0])
+                    .wait();
+
+                ccl::send(buf_send.data(), count, ccl::datatype::int32, options.peers[0], comms[0])
+                    .wait();
+            }
+
+            if (options.validate == VALIDATE_ALL_ITERS ||
+                (options.validate == VALIDATE_LAST_ITER &&
+                 iter_idx == (options.warmup_iters + options.iters) - 1)) {
+                ccl::barrier(comms[0]);
+                check_cpu_buffers(count, iter_idx, buf_recv);
+            }
+        }
+
+        if (rank == options.peers[0]) {
+            // test measures the round trip latency, divide by two to get the one-way latency
+            double average_t = (total_latency_t * 1e6) / (2.0 * options.iters);
+            print_timings(comms[0], options, average_t, count * dtype_size, "#usec(latency)");
+        }
+
+        buf_send.clear();
+        buf_recv.clear();
+    }
+
+    PRINT_BY_ROOT(comms[0], "\n# All done\n");
+
+    transport.reset_comms();
+}
+
+int main(int argc, char* argv[]) {
+    user_options_t options;
+
+    if (parse_user_options(argc, argv, options)) {
+        print_help_usage(argv[0]);
+        exit(INVALID_RETURN);
+    }
+
+    if (options.backend == BACKEND_GPU) {
+        run_gpu_backend(options);
+    }
+    else {
+        run_cpu_backend(options);
+    }
 
     return 0;
 }
diff --git a/examples/sycl/sycl_allgatherv_custom_usm_test.cpp b/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
index d607cf48d..f55c87461 100644
--- a/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
+++ b/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
@@ -24,7 +24,7 @@ struct custom_data_type {
 } __attribute__((packed));
 
 int main(int argc, char *argv[]) {
-    const size_t count = 10 * 1024 * 1024;
+    size_t count = 10 * 1024 * 1024;
 
     int size = 0;
     int rank = 0;
@@ -38,7 +38,18 @@ int main(int argc, char *argv[]) {
     atexit(mpi_finalize);
 
     queue q;
-    if (!create_sycl_queue(argc, argv, rank, q)) {
+    sycl::property_list props;
+    if (argc > 3) {
+        if (strcmp("in_order", argv[3]) == 0) {
+            props = { sycl::property::queue::in_order{} };
+        }
+    }
+
+    if (argc > 4) {
+        count = (size_t)std::atoi(argv[4]);
+    }
+
+    if (!create_sycl_queue(argc, argv, rank, q, props)) {
         return -1;
     }
 
@@ -75,7 +86,7 @@ int main(int argc, char *argv[]) {
     auto stream = ccl::create_stream(q);
 
     /* create buffers */
-    constexpr size_t send_count = count * sizeof(custom_data_type) / sizeof(native_dtype);
+    size_t send_count = count * sizeof(custom_data_type) / sizeof(native_dtype);
 
     auto send_buf = allocator.allocate(send_count, usm_alloc_type);
     auto recv_buf = allocator.allocate(send_count * size, usm_alloc_type);
diff --git a/examples/sycl/sycl_allgatherv_inplace_usm_test.cpp b/examples/sycl/sycl_allgatherv_inplace_usm_test.cpp
index ee9f9ab1b..ccb1361d8 100644
--- a/examples/sycl/sycl_allgatherv_inplace_usm_test.cpp
+++ b/examples/sycl/sycl_allgatherv_inplace_usm_test.cpp
@@ -19,7 +19,7 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-    const size_t count = 10 * 1024 * 1024;
+    size_t count = 10 * 1024 * 1024;
 
     int size = 0;
     int rank = 0;
@@ -33,7 +33,18 @@ int main(int argc, char *argv[]) {
     atexit(mpi_finalize);
 
     queue q;
-    if (!create_sycl_queue(argc, argv, rank, q)) {
+    sycl::property_list props;
+    if (argc > 3) {
+        if (strcmp("in_order", argv[3]) == 0) {
+            props = { sycl::property::queue::in_order{} };
+        }
+    }
+
+    if (argc > 4) {
+        count = (size_t)std::atoi(argv[4]);
+    }
+
+    if (!create_sycl_queue(argc, argv, rank, q, props)) {
         return -1;
     }
 
@@ -93,7 +104,9 @@ int main(int argc, char *argv[]) {
 
     /* invoke allgatherv */
     auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>();
-    ccl::allgatherv(recv_buf, count, recv_buf, recv_counts, comm, stream, attr, deps).wait();
+    ccl::allgatherv(
+        recv_buf, count, recv_buf, recv_counts, ccl::datatype::int32, comm, stream, attr, deps)
+        .wait();
 
     /* open recv_buf and check its correctness on the device side */
     q.submit([&](auto &h) {
diff --git a/examples/sycl/sycl_allgatherv_usm_test.cpp b/examples/sycl/sycl_allgatherv_usm_test.cpp
index 59c96dff3..af81d75fb 100644
--- a/examples/sycl/sycl_allgatherv_usm_test.cpp
+++ b/examples/sycl/sycl_allgatherv_usm_test.cpp
@@ -19,7 +19,7 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-    const size_t count = 10 * 1024 * 1024;
+    size_t count = 10 * 1024 * 1024;
 
     int size = 0;
     int rank = 0;
@@ -33,7 +33,18 @@ int main(int argc, char *argv[]) {
     atexit(mpi_finalize);
 
     queue q;
-    if (!create_sycl_queue(argc, argv, rank, q)) {
+    sycl::property_list props;
+    if (argc > 3) {
+        if (strcmp("in_order", argv[3]) == 0) {
+            props = { sycl::property::queue::in_order{} };
+        }
+    }
+
+    if (argc > 4) {
+        count = (size_t)std::atoi(argv[4]);
+    }
+
+    if (!create_sycl_queue(argc, argv, rank, q, props)) {
         return -1;
     }
 
@@ -94,7 +105,9 @@ int main(int argc, char *argv[]) {
 
     /* invoke allgatherv */
     auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>();
-    ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, stream, attr, deps).wait();
+    ccl::allgatherv(
+        send_buf, count, recv_buf, recv_counts, ccl::datatype::int32, comm, stream, attr, deps)
+        .wait();
 
     /* open recv_buf and check its correctness on the device side */
     q.submit([&](auto &h) {
diff --git a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
index f6284fb76..258ace128 100644
--- a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
+++ b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
@@ -19,7 +19,7 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-    const size_t count = 10 * 1024 * 1024;
+    size_t count = 10 * 1024 * 1024;
 
     int size = 0;
     int rank = 0;
@@ -33,7 +33,18 @@ int main(int argc, char *argv[]) {
     atexit(mpi_finalize);
 
     queue q;
-    if (!create_sycl_queue(argc, argv, rank, q)) {
+    sycl::property_list props;
+    if (argc > 3) {
+        if (strcmp("in_order", argv[3]) == 0) {
+            props = { sycl::property::queue::in_order{} };
+        }
+    }
+
+    if (argc > 4) {
+        count = (size_t)std::atoi(argv[4]);
+    }
+
+    if (!create_sycl_queue(argc, argv, rank, q, props)) {
         return -1;
     }
 
@@ -85,7 +96,9 @@ int main(int argc, char *argv[]) {
 
     /* invoke allreduce */
     auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
-    ccl::allreduce(buf, buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait();
+    ccl::allreduce(
+        buf, buf, count, ccl::datatype::int32, ccl::reduction::sum, comm, stream, attr, deps)
+        .wait();
 
     /* open buf and check its correctness on the device side */
     buffer<int> check_buf(count);
diff --git a/examples/sycl/sycl_allreduce_usm_test.cpp b/examples/sycl/sycl_allreduce_usm_test.cpp
index cf1c68c25..4ae02a776 100644
--- a/examples/sycl/sycl_allreduce_usm_test.cpp
+++ b/examples/sycl/sycl_allreduce_usm_test.cpp
@@ -19,7 +19,7 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char* argv[]) {
-    const size_t count = 10 * 1024 * 1024;
+    size_t count = 10 * 1024 * 1024;
 
     int size = 0;
     int rank = 0;
@@ -33,7 +33,18 @@ int main(int argc, char* argv[]) {
     atexit(mpi_finalize);
 
     queue q;
-    if (!create_sycl_queue(argc, argv, rank, q)) {
+    sycl::property_list props;
+    if (argc > 3) {
+        if (strcmp("in_order", argv[3]) == 0) {
+            props = { sycl::property::queue::in_order{} };
+        }
+    }
+
+    if (argc > 4) {
+        count = (size_t)std::atoi(argv[4]);
+    }
+
+    if (!create_sycl_queue(argc, argv, rank, q, props)) {
         return -1;
     }
 
@@ -92,7 +103,16 @@ int main(int argc, char* argv[]) {
 
     /* invoke allreduce */
     auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
-    ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps).wait();
+    ccl::allreduce(send_buf,
+                   recv_buf,
+                   count,
+                   ccl::datatype::int32,
+                   ccl::reduction::sum,
+                   comm,
+                   stream,
+                   attr,
+                   deps)
+        .wait();
 
     /* open recv_buf and check its correctness on the device side */
     buffer<int> check_buf(count);
diff --git a/examples/sycl/sycl_reduce_inplace_usm_test.cpp b/examples/sycl/sycl_reduce_inplace_usm_test.cpp
index c28770120..e974406ab 100644
--- a/examples/sycl/sycl_reduce_inplace_usm_test.cpp
+++ b/examples/sycl/sycl_reduce_inplace_usm_test.cpp
@@ -120,16 +120,18 @@ int main(int argc, char* argv[]) {
 
     /* print out the result of the test on the host side */
     {
-        host_accessor check_buf_acc(check_buf, read_only);
-        size_t i;
-        for (i = 0; i < count; i++) {
-            if (check_buf_acc[i] == -1) {
-                cout << "FAILED\n";
-                break;
+        if (rank == root_rank) {
+            host_accessor check_buf_acc(check_buf, read_only);
+            size_t i;
+            for (i = 0; i < count; i++) {
+                if (check_buf_acc[i] == -1) {
+                    cout << "FAILED\n";
+                    break;
+                }
+            }
+            if (i == count) {
+                cout << "PASSED\n";
             }
-        }
-        if (i == count) {
-            cout << "PASSED\n";
         }
     }
 
diff --git a/examples/sycl/sycl_reduce_scatter_usm_test.cpp b/examples/sycl/sycl_reduce_scatter_usm_test.cpp
index 7cf1f3505..35e6cbe74 100644
--- a/examples/sycl/sycl_reduce_scatter_usm_test.cpp
+++ b/examples/sycl/sycl_reduce_scatter_usm_test.cpp
@@ -19,7 +19,7 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-    const size_t count = 10 * 1024 * 1024;
+    size_t count = 10 * 1024 * 1024;
 
     int size = 0;
     int rank = 0;
@@ -33,7 +33,18 @@ int main(int argc, char *argv[]) {
     atexit(mpi_finalize);
 
     queue q;
-    if (!create_sycl_queue(argc, argv, rank, q)) {
+    sycl::property_list props;
+    if (argc > 3) {
+        if (strcmp("in_order", argv[3]) == 0) {
+            props = { sycl::property::queue::in_order{} };
+        }
+    }
+
+    if (argc > 4) {
+        count = (size_t)std::atoi(argv[4]);
+    }
+
+    if (!create_sycl_queue(argc, argv, rank, q, props)) {
         return -1;
     }
 
@@ -94,7 +105,15 @@ int main(int argc, char *argv[]) {
 
     /* invoke reduce_scatter */
     auto attr = ccl::create_operation_attr<ccl::reduce_scatter_attr>();
-    ccl::reduce_scatter(send_buf, recv_buf, count, ccl::reduction::sum, comm, stream, attr, deps)
+    ccl::reduce_scatter(send_buf,
+                        recv_buf,
+                        count,
+                        ccl::datatype::int32,
+                        ccl::reduction::sum,
+                        comm,
+                        stream,
+                        attr,
+                        deps)
         .wait();
 
     /* open recv_buf and check its correctness on the device side */
diff --git a/examples/sycl/sycl_reduce_usm_test.cpp b/examples/sycl/sycl_reduce_usm_test.cpp
index dc9dbd169..42067ccbc 100644
--- a/examples/sycl/sycl_reduce_usm_test.cpp
+++ b/examples/sycl/sycl_reduce_usm_test.cpp
@@ -123,16 +123,18 @@ int main(int argc, char* argv[]) {
 
     /* print out the result of the test on the host side */
     {
-        host_accessor check_buf_acc(check_buf, read_only);
-        size_t i;
-        for (i = 0; i < count; i++) {
-            if (check_buf_acc[i] == -1) {
-                cout << "FAILED\n";
-                break;
+        if (rank == root_rank) {
+            host_accessor check_buf_acc(check_buf, read_only);
+            size_t i;
+            for (i = 0; i < count; i++) {
+                if (check_buf_acc[i] == -1) {
+                    cout << "FAILED\n";
+                    break;
+                }
+            }
+            if (i == count) {
+                cout << "PASSED\n";
             }
-        }
-        if (i == count) {
-            cout << "PASSED\n";
         }
     }
 
diff --git a/include/oneapi/ccl/api_functions.hpp b/include/oneapi/ccl/api_functions.hpp
index 488879984..8ef762375 100644
--- a/include/oneapi/ccl/api_functions.hpp
+++ b/include/oneapi/ccl/api_functions.hpp
@@ -335,7 +335,7 @@ namespace preview {
  * @param devices user-supplied device objects for local ranks
  * @param context context containing the devices
  * @param kvs key-value store for ranks wire-up
-  * @param attr optional communicator attributes
+ * @param attr optional communicator attributes
  * @return vector of communicators / communicator
  */
 template <class DeviceType, class ContextType>
@@ -407,15 +407,17 @@ coll_attribute_type CCL_API create_operation_attr(attr_val_type&&... avs) {
  * \brief Allgatherv is a collective communication operation that collects data
  *        from all the ranks within a communicator into a single buffer.
  *        Different ranks may contribute segments of different sizes.
- *        The resulting data in the output buffer must be the same for each rank.
+ *        The resulting data in the output buffer is the same for each rank.
+ *
  * @param send_buf the buffer with @c send_count elements of @c dtype that stores local data to be gathered
  * @param send_count the number of elements of type @c dtype in @c send_buf
- * @param recv_buf [out] the buffer to store gathered result, should be large enough to hold values from all ranks
- * @param recv_bufs [out] array of buffers to store gathered result, one buffer per each rank
+ * @param recv_buf [out] the buffer to store gathered result of @c dtype, must be large enough
+ *                      to hold values from all ranks, i.e. size should be equal
+ *                      to @c dtype size in bytes * sum of all values in @c recv_counts
  * @param recv_counts array with the number of elements of type @c dtype to be received from each rank
  * @param dtype the datatype of elements in @c send_buf and @c recv_buf
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -444,6 +446,11 @@ event CCL_API allgatherv(const void* send_buf,
 
 /*!
  * \overload
+ *
+ * This overloaded function takes separate receive buffer per rank.
+ *
+ * @param recv_bufs [out] array of buffers to store gathered result, one buffer per rank;
+ * each buffer must be large enough to keep the corresponding @c recv_counts elements of @c dtype size
  */
 event CCL_API allgatherv(const void* send_buf,
                          size_t send_count,
@@ -457,6 +464,11 @@ event CCL_API allgatherv(const void* send_buf,
 
 /*!
  * \overload
+ *
+ * This overloaded function takes separate receive buffer per rank.
+ *
+ * @param recv_bufs [out] array of buffers to store gathered result, one buffer per rank;
+ * each buffer must be large enough to keep the corresponding @c recv_counts elements of @c dtype size
  */
 event CCL_API allgatherv(const void* send_buf,
                          size_t send_count,
@@ -469,8 +481,14 @@ event CCL_API allgatherv(const void* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_buf the buffer with @c send_count elements of @c BufferType that stores local data to be gathered
+ * @param recv_buf [out] the buffer to store gathered result of @c BufferType, must be large enough
+ *                      to hold values from all ranks, i.e. size should be equal
+ *                      to @c BufferType size in bytes * sum of all values in @c recv_counts
  */
-/* Type safety version */
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API allgatherv(const BufferType* send_buf,
@@ -484,8 +502,14 @@ event CCL_API allgatherv(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_buf the buffer with @c send_count elements of @c BufferType that stores local data to be gathered
+ * @param recv_buf [out] the buffer to store gathered result of @c BufferType, must be large enough
+ *                      to hold values from all ranks, i.e. size should be equal
+ *                      to @c BufferType size in bytes * sum of all values in @c recv_counts
  */
-/* Type safety version */
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API allgatherv(const BufferType* send_buf,
@@ -498,8 +522,13 @@ event CCL_API allgatherv(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_buf the buffer with @c send_count elements of @c BufferType that stores local data to be gathered
+ * @param recv_bufs [out] array of buffers to store gathered result, one buffer per rank;
+ * each buffer must be large enough to keep the corresponding @c recv_counts elements of @c BufferType size
  */
-/* Type safety version */
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API allgatherv(const BufferType* send_buf,
@@ -513,8 +542,13 @@ event CCL_API allgatherv(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_buf the buffer with @c send_count elements of @c BufferType that stores local data to be gathered
+ * @param recv_bufs [out] array of buffers to store gathered result, one buffer per rank;
+ * each buffer must be large enough to keep the corresponding @c recv_counts elements of @c BufferType size
  */
-/* Type safety version */
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API allgatherv(const BufferType* send_buf,
@@ -527,8 +561,14 @@ event CCL_API allgatherv(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_buf the buffer of @c BufferObjectType with @c send_count elements that stores local data to be gathered
+ * @param recv_buf [out] the buffer of @c BufferObjectType to store gathered result, must be large enough
+ *                      to hold values from all ranks, i.e. size should be equal
+ *                      to @c BufferObjectType size in bytes * sum of all values in @c recv_counts
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API allgatherv(const BufferObjectType& send_buf,
@@ -542,8 +582,14 @@ event CCL_API allgatherv(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_buf the buffer of @c BufferObjectType with @c send_count elements that stores local data to be gathered
+ * @param recv_buf [out] the buffer of @c BufferObjectType to store gathered result, must be large enough
+ *                      to hold values from all ranks, i.e. size should be equal
+ *                      to @c BufferObjectType size in bytes * sum of all values in @c recv_counts
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API allgatherv(const BufferObjectType& send_buf,
@@ -556,8 +602,13 @@ event CCL_API allgatherv(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_buf the buffer of @c BufferObjectType with @c send_count elements that stores local data to be gathered
+ * @param recv_bufs [out] array of buffers to store gathered result, one buffer per rank;
+ * each buffer must be large enough to keep the corresponding @c recv_counts elements of @c BufferObjectType size
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API allgatherv(const BufferObjectType& send_buf,
@@ -571,8 +622,13 @@ event CCL_API allgatherv(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_buf the buffer of @c BufferObjectType with @c send_count elements that stores local data to be gathered
+ * @param recv_bufs [out] array of buffers to store gathered result, one buffer per rank;
+ * each buffer must be large enough to keep the corresponding @c recv_counts elements of @c BufferObjectType size
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API allgatherv(const BufferObjectType& send_buf,
@@ -595,10 +651,10 @@ event CCL_API allgatherv(const BufferObjectType& send_buf,
  * @param send_buf the buffer with @c count elements of @c dtype that stores local data to be reduced
  * @param recv_buf [out] the buffer to store reduced result, must have the same dimension as @c send_buf
  * @param count the number of elements of type @c dtype in @c send_buf and @c recv_buf
- * @param dtype the datatype of elements in @c send_buf and @c recv_buf
+ * @param dtype the datatype of elements in @c send_buf and @c recv_buf
  * @param rtype the type of the reduction operation to be applied
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -627,8 +683,10 @@ event CCL_API allreduce(const void* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API allreduce(const BufferType* send_buf,
@@ -642,8 +700,10 @@ event CCL_API allreduce(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API allreduce(const BufferType* send_buf,
@@ -656,8 +716,10 @@ event CCL_API allreduce(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API allreduce(const BufferObjectType& send_buf,
@@ -671,8 +733,10 @@ event CCL_API allreduce(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API allreduce(const BufferObjectType& send_buf,
@@ -694,15 +758,14 @@ event CCL_API allreduce(const BufferObjectType& send_buf,
  *        sends distinct equal-sized blocks of data to each rank.
  *        The j-th block of @c send_buf sent from the i-th rank is received by the j-th rank
  *        and is placed in the i-th block of @c recvbuf.
+ *
  * @param send_buf the buffer with @c count elements of @c dtype that stores local data to be sent
- * @param recv_buf [out] the buffer to store received result, should be large enough
+ * @param recv_buf [out] the buffer to store received result, must be large enough
  *        to hold values from all ranks, i.e. at least @c comm_size * @c count
- * @param send_bufs array of buffers with local data to be sent, one buffer per each rank
- * @param recv_bufs [out] array of buffers to store received result, one buffer per each rank
  * @param count the number of elements of type @c dtype to be send to or to received from each rank
  * @param dtype the datatype of elements in @c send_buf and @c recv_buf
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -729,6 +792,9 @@ event CCL_API alltoall(const void* send_buf,
 
 /*!
  * \overload
+ *
+ * @param send_bufs array of buffers with local data to be sent, one buffer per rank
+ * @param recv_bufs [out] array of buffers to store received result, one buffer per rank
  */
 event CCL_API alltoall(const vector_class<void*>& send_buf,
                        const vector_class<void*>& recv_buf,
@@ -741,6 +807,9 @@ event CCL_API alltoall(const vector_class<void*>& send_buf,
 
 /*!
  * \overload
+ *
+ * @param send_bufs array of buffers with local data to be sent, one buffer per rank
+ * @param recv_bufs [out] array of buffers to store received result, one buffer per rank
  */
 event CCL_API alltoall(const vector_class<void*>& send_buf,
                        const vector_class<void*>& recv_buf,
@@ -752,8 +821,9 @@ event CCL_API alltoall(const vector_class<void*>& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API alltoall(const BufferType* send_buf,
@@ -766,8 +836,9 @@ event CCL_API alltoall(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API alltoall(const BufferType* send_buf,
@@ -779,8 +850,12 @@ event CCL_API alltoall(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_bufs array of buffers with local data to be sent, one buffer per rank
+ * @param recv_bufs [out] array of buffers to store received result, one buffer per rank
  */
-/* Type safety version */
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API alltoall(const vector_class<BufferType*>& send_buf,
@@ -793,8 +868,13 @@ event CCL_API alltoall(const vector_class<BufferType*>& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ *
+ * @param send_bufs array of buffers with local data to be sent, one buffer per rank
+ * @param recv_bufs [out] array of buffers to store received result, one buffer per rank
  */
-/* Type safety version */
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API alltoall(const vector_class<BufferType*>& send_buf,
@@ -806,8 +886,9 @@ event CCL_API alltoall(const vector_class<BufferType*>& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API alltoall(const BufferObjectType& send_buf,
@@ -820,8 +901,12 @@ event CCL_API alltoall(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_bufs array of buffers with local data to be sent, one buffer per rank
+ * @param recv_bufs [out] array of buffers to store received result, one buffer per rank
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API alltoall(const BufferObjectType& send_buf,
@@ -833,8 +918,12 @@ event CCL_API alltoall(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_bufs array of buffers with local data to be sent, one buffer per rank
+ * @param recv_bufs [out] array of buffers to store received result, one buffer per rank
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf,
@@ -847,8 +936,12 @@ event CCL_API alltoall(const vector_class<reference_wrapper_class<BufferObjectTy
 
 /*!
  * \overload
+ *
+ * Type-safe version.
+ *
+ * @param send_bufs array of buffers with local data to be sent, one buffer per rank
+ * @param recv_bufs [out] array of buffers to store received result, one buffer per rank
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf,
@@ -870,14 +963,14 @@ event CCL_API alltoall(const vector_class<reference_wrapper_class<BufferObjectTy
  *        The j-th block of @c send_buf sent from the i-th rank is received by the j-th rank
  *        and is placed in the i-th block of @c recvbuf.
  * @param send_buf the buffer with elements of @c dtype that stores local blocks to be sent to each rank
- * @param send_bufs array of buffers to store send blocks, one buffer per each rank
- * @param recv_buf [out] the buffer to store received result, should be large enough to hold blocks from all ranks
- * @param recv_bufs [out] array of buffers to store receive blocks, one buffer per each rank
+ * @param send_bufs array of buffers to store send blocks, one buffer per rank
+ * @param recv_buf [out] the buffer to store received result, must be large enough to hold blocks from all ranks
+ * @param recv_bufs [out] array of buffers to store receive blocks, one buffer per rank
  * @param send_counts array with the number of elements of type @c dtype in send blocks for each rank
  * @param recv_counts array with the number of elements of type @c dtype in receive blocks from each rank
  * @param dtype the datatype of elements in @c send_buf and @c recv_buf
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -906,8 +999,10 @@ event CCL_API alltoallv(const void* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 event CCL_API alltoallv(const vector_class<void*>& send_bufs,
                         const vector_class<size_t>& send_counts,
                         const vector_class<void*>& recv_bufs,
@@ -920,8 +1015,10 @@ event CCL_API alltoallv(const vector_class<void*>& send_bufs,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 event CCL_API alltoallv(const vector_class<void*>& send_bufs,
                         const vector_class<size_t>& send_counts,
                         const vector_class<void*>& recv_bufs,
@@ -933,8 +1030,10 @@ event CCL_API alltoallv(const vector_class<void*>& send_bufs,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API alltoallv(const BufferType* send_buf,
@@ -948,8 +1047,10 @@ event CCL_API alltoallv(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API alltoallv(const BufferType* send_buf,
@@ -962,8 +1063,10 @@ event CCL_API alltoallv(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API alltoallv(const vector_class<BufferType*>& send_bufs,
@@ -977,8 +1080,10 @@ event CCL_API alltoallv(const vector_class<BufferType*>& send_bufs,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API alltoallv(const vector_class<BufferType*>& send_bufs,
@@ -991,8 +1096,10 @@ event CCL_API alltoallv(const vector_class<BufferType*>& send_bufs,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API alltoallv(const BufferObjectType& send_buf,
@@ -1006,8 +1113,10 @@ event CCL_API alltoallv(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API alltoallv(const BufferObjectType& send_buf,
@@ -1020,8 +1129,10 @@ event CCL_API alltoallv(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
@@ -1035,8 +1146,10 @@ event CCL_API alltoallv(const vector_class<reference_wrapper_class<BufferObjectT
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
@@ -1058,7 +1171,7 @@ event CCL_API alltoallv(const vector_class<reference_wrapper_class<BufferObjectT
  * \brief Barrier synchronization is performed across all ranks of the communicator
  *        and it is completed only after all the ranks in the communicator have called it.
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -1092,7 +1205,7 @@ event CCL_API barrier(const communicator& comm,
  * @param dtype the datatype of elements in @c buf
  * @param root the rank that broadcasts @c buf
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -1119,8 +1232,10 @@ event CCL_API broadcast(void* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API broadcast(BufferType* buf,
@@ -1133,8 +1248,10 @@ event CCL_API broadcast(BufferType* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API broadcast(BufferType* buf,
@@ -1146,8 +1263,10 @@ event CCL_API broadcast(BufferType* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API broadcast(BufferObjectType& buf,
@@ -1160,8 +1279,10 @@ event CCL_API broadcast(BufferObjectType& buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API broadcast(BufferObjectType& buf,
@@ -1187,7 +1308,7 @@ event CCL_API broadcast(BufferObjectType& buf,
  * @param dtype the datatype of elements in @c buf
  * @param peer the rank that sends @c buf
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -1214,8 +1335,10 @@ event CCL_API recv(void* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API recv(BufferType* buf,
@@ -1228,8 +1351,10 @@ event CCL_API recv(BufferType* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API recv(BufferType* buf,
@@ -1241,8 +1366,10 @@ event CCL_API recv(BufferType* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API recv(BufferObjectType& buf,
@@ -1255,8 +1382,9 @@ event CCL_API recv(BufferObjectType& buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API recv(BufferObjectType& buf,
@@ -1282,7 +1410,7 @@ event CCL_API recv(BufferObjectType& buf,
  * @param dtype the datatype of elements in @c buf
  * @param peer the rank that receives @c buf
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -1309,8 +1437,10 @@ event CCL_API send(void* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API send(BufferType* buf,
@@ -1323,8 +1453,10 @@ event CCL_API send(BufferType* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API send(BufferType* buf,
@@ -1336,8 +1468,10 @@ event CCL_API send(BufferType* buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API send(BufferObjectType& buf,
@@ -1350,8 +1484,10 @@ event CCL_API send(BufferObjectType& buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API send(BufferObjectType& buf,
@@ -1379,7 +1515,7 @@ event CCL_API send(BufferObjectType& buf,
  * @param rtype the type of the reduction operation to be applied
  * @param root the rank that gets the result of reduction
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -1410,8 +1546,10 @@ event CCL_API reduce(const void* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API reduce(const BufferType* send_buf,
@@ -1426,8 +1564,10 @@ event CCL_API reduce(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API reduce(const BufferType* send_buf,
@@ -1441,8 +1581,10 @@ event CCL_API reduce(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API reduce(const BufferObjectType& send_buf,
@@ -1457,8 +1599,10 @@ event CCL_API reduce(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API reduce(const BufferObjectType& send_buf,
@@ -1486,7 +1630,7 @@ event CCL_API reduce(const BufferObjectType& send_buf,
  * @param dtype the datatype of elements in @c send_buf and @c recv_buf
  * @param rtype the type of the reduction operation to be applied
  * @param comm the communicator for which the operation will be performed
- * @param stream a stream associated with the operation
+ * @param stream abstraction over a device queue constructed via ccl::create_stream
  * @param attr optional attributes to customize operation
  * @param deps an optional vector of the events that the operation should depend on
  * @return @ref ccl::event an object to track the progress of the operation
@@ -1515,8 +1659,10 @@ event CCL_API reduce_scatter(const void* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API reduce_scatter(const BufferType* send_buf,
@@ -1530,8 +1676,10 @@ event CCL_API reduce_scatter(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferType,
           class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
 event CCL_API reduce_scatter(const BufferType* send_buf,
@@ -1544,8 +1692,10 @@ event CCL_API reduce_scatter(const BufferType* send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API reduce_scatter(const BufferObjectType& send_buf,
@@ -1559,8 +1709,10 @@ event CCL_API reduce_scatter(const BufferObjectType& send_buf,
 
 /*!
  * \overload
+ *
+ * Type-safe version.
  */
-/* Type safety version */
+
 template <class BufferObjectType,
           class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
 event CCL_API reduce_scatter(const BufferObjectType& send_buf,
diff --git a/include/oneapi/ccl/event.hpp b/include/oneapi/ccl/event.hpp
index 61e49311a..322f1abb1 100644
--- a/include/oneapi/ccl/event.hpp
+++ b/include/oneapi/ccl/event.hpp
@@ -93,10 +93,11 @@ class event : public ccl_api_base_movable<event, direct_access_policy, event_imp
     native_t& get_native();
     const native_t& get_native() const;
 
+    static event create_from_native(native_t& native_event);
+
 private:
     friend class ccl::detail::environment;
 
-    static event create_from_native(native_t& native_event);
     static event create_from_native(native_handle_t native_event_handle, context_t context);
 };
 
diff --git a/man/doxconfig b/man/doxconfig
index ae1aa51dd..6705a5622 100644
--- a/man/doxconfig
+++ b/man/doxconfig
@@ -1,5 +1,5 @@
 PROJECT_NAME           = "Intel® oneAPI Collective Communications Library"
-PROJECT_NUMBER         = "2021.11"
+PROJECT_NUMBER         = "2021.12"
 
 INPUT = ../src/common/env/vars.hpp ../src/common/env/vars_experimental.hpp
 
diff --git a/man/man3/OneCCL.3 b/man/man3/OneCCL.3
index e0ba0230b..8516bc18a 100644
--- a/man/man3/OneCCL.3
+++ b/man/man3/OneCCL.3
@@ -1,4 +1,4 @@
-.TH "OneCCLvars" 3 "Wed Aug 30 2023" "Version 2021.11" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*-
+.TH "OneCCLvars" 3 "Tue Jan 30 2024" "Version 2021.12" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*-
 .ad l
 .nh
 .SH NAME
@@ -722,7 +722,7 @@ By-default: '1'
 .SH "Author"
 .PP 
 Generated automatically by Doxygen for Intel® oneAPI Collective Communications Library from the source code\&.
-.TH "ExpOneCCLvars" 3 "Wed Aug 30 2023" "Version 2021.11" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*-
+.TH "ExpOneCCLvars" 3 "Tue Jan 30 2024" "Version 2021.12" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*-
 .ad l
 .nh
 .SH NAME
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 56a47e707..50fc65009 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -54,6 +54,33 @@ if (CCL_ENABLE_SYCL AND CCL_ENABLE_ZE)
         sched/ze/ze_handle_manager.cpp
         sched/ze/ze_ipc_event_pool_manager.cpp
         sched/ze/ze_list_manager.cpp
+
+        coll/algorithms/allgatherv/sycl/allgatherv_sycl.cpp
+        coll/algorithms/allgatherv/sycl/allgatherv_small_sycl.cpp
+        coll/algorithms/allgatherv/sycl/allgatherv_medium_sycl.cpp
+        coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.cpp
+
+        coll/algorithms/allreduce/sycl/allreduce_small_sycl.cpp
+        coll/algorithms/allreduce/sycl/allreduce_medium_sycl.cpp
+        coll/algorithms/allreduce/sycl/allreduce_medium_sycl_fp16.cpp
+        coll/algorithms/allreduce/sycl/allreduce_medium_sycl_bf16.cpp
+        coll/algorithms/allreduce/sycl/allreduce_medium_sycl_fp32.cpp
+        coll/algorithms/allreduce/sycl/allreduce_medium_sycl_int32.cpp
+        coll/algorithms/allreduce/sycl/allreduce_large_sycl.cpp
+        coll/algorithms/allreduce/sycl/allreduce_large_sycl_fp16.cpp
+        coll/algorithms/allreduce/sycl/allreduce_large_sycl_bf16.cpp
+        coll/algorithms/allreduce/sycl/allreduce_large_sycl_fp32.cpp
+        coll/algorithms/allreduce/sycl/allreduce_large_sycl_int32.cpp
+        coll/algorithms/allreduce/sycl/allreduce_sycl.cpp
+
+        coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.cpp
+        coll/algorithms/reduce_scatter/sycl/reduce_scatter_small_sycl.cpp
+        coll/algorithms/reduce_scatter/sycl/reduce_scatter_medium_sycl.cpp
+        coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.cpp
+        coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_int32.cpp
+        coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_fp32.cpp
+        coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_fp16.cpp
+        coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_bf16.cpp
         )
 endif(CCL_ENABLE_SYCL AND CCL_ENABLE_ZE)
 
@@ -94,18 +121,18 @@ set(CCL_SRC
     coll/attr/ccl_reduce_scatter_op_attr.cpp
     coll/coll_param.cpp
     coll/coll_util.cpp
-    coll/algorithms/allgatherv.cpp
+    coll/algorithms/allgatherv/allgatherv.cpp
     coll/algorithms/allreduce/allreduce.cpp
     coll/algorithms/allreduce/allreduce_rma.cpp
     coll/algorithms/algorithm_utils.cpp
     coll/algorithms/alltoall.cpp
     coll/algorithms/alltoallv.cpp
-    coll/algorithms/barrier.cpp
+    coll/algorithms/barrier/barrier.cpp
     coll/algorithms/bcast.cpp
     coll/algorithms/double_tree_ops.cpp
     coll/algorithms/recv.cpp
     coll/algorithms/reduce.cpp
-    coll/algorithms/reduce_scatter.cpp
+    coll/algorithms/reduce_scatter/reduce_scatter.cpp
     coll/algorithms/send.cpp
     coll/coll.cpp
     coll/coll_check.cpp
@@ -259,8 +286,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR ${CMAKE_CXX_COMPILER_ID} STREQUAL
     set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -diag-disable=654")
 endif()
 
-
-if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") OR (${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "IntelLLVM"))
     if (USE_CODECOV_FLAGS)
         set(SRC_C_FLAGS "${SRC_C_FLAGS} -fprofile-instr-generate -fcoverage-mapping -Wno-error=unused-command-line-argument -fno-sycl-use-footer")
         set(SRC_CXX_FLAGS "${SRC_CXX_FLAGS} -fprofile-instr-generate -fcoverage-mapping -Wno-error=unused-command-line-argument -fno-sycl-use-footer")
@@ -364,11 +390,12 @@ endif()
 
 if (ENABLE_MPI)
     file(GLOB mpi_bins "${DEPS_DIR}/mpi/bin/*")
-    install(PROGRAMS ${mpi_bins} DESTINATION ${CCL_INSTALL_BIN})
+    install(PROGRAMS ${mpi_bins}
+            DESTINATION ${CMAKE_INSTALL_PREFIX}/opt/mpi/bin)
     install(DIRECTORY ${DEPS_DIR}/mpi/include/
-            DESTINATION ${CCL_INSTALL_INCLUDE})
+            DESTINATION ${CMAKE_INSTALL_PREFIX}/opt/mpi/include)
     install(DIRECTORY ${DEPS_DIR}/mpi/lib/
-           DESTINATION ${CCL_INSTALL_LIB})
+           DESTINATION ${CMAKE_INSTALL_PREFIX}/opt/mpi/lib)
     install(DIRECTORY ${DEPS_DIR}/mpi/opt/mpi/etc/
             DESTINATION ${CMAKE_INSTALL_PREFIX}/opt/mpi/etc/)
     install(DIRECTORY ${DEPS_DIR}/mpi/licensing/
diff --git a/src/atl/atl_def.h b/src/atl/atl_def.h
index c67d84e56..0c80da6a8 100644
--- a/src/atl/atl_def.h
+++ b/src/atl/atl_def.h
@@ -71,7 +71,7 @@
 #define ATL_CHECK_PTR(ptr, str) \
     do { \
         if (!ptr) { \
-            LOG_ERROR("%s, errno: %s", str, strerror(errno)); \
+            LOG_ERROR(str, ", errno: ", strerror(errno)); \
             return ATL_STATUS_FAILURE; \
         } \
     } while (0)
diff --git a/src/atl/mpi/atl_mpi.hpp b/src/atl/mpi/atl_mpi.hpp
index ed32f94f5..a75bd009c 100644
--- a/src/atl/mpi/atl_mpi.hpp
+++ b/src/atl/mpi/atl_mpi.hpp
@@ -79,6 +79,8 @@ typedef struct atl_mpi_comm_info : atl_mpi_env_info_t {
 class atl_mpi : public atl_base_transport {
 public:
     atl_mpi() = default;
+    atl_mpi(const atl_mpi& other) = delete;
+    atl_mpi& operator=(const atl_mpi& other) = delete;
     ~atl_mpi();
 
     atl_status_t init(int* argc,
diff --git a/src/atl/ofi/atl_ofi.hpp b/src/atl/ofi/atl_ofi.hpp
index 113bdfb7c..a7f17e0e7 100644
--- a/src/atl/ofi/atl_ofi.hpp
+++ b/src/atl/ofi/atl_ofi.hpp
@@ -28,6 +28,8 @@
 class atl_ofi : public atl_base_transport {
 public:
     atl_ofi() = default;
+    atl_ofi(const atl_ofi& other) = delete;
+    atl_ofi& operator=(const atl_ofi& other) = delete;
     ~atl_ofi();
 
     atl_status_t init(int* argc,
@@ -204,6 +206,10 @@ class atl_ofi : public atl_base_transport {
     class mr_cache {
     public:
         mr_cache() = default;
+        mr_cache(const mr_cache&) = delete;
+        mr_cache& operator=(const mr_cache&) = delete;
+        mr_cache(mr_cache&&) noexcept = default;
+        mr_cache& operator=(mr_cache&&) noexcept = default;
         ~mr_cache();
 
         void clear();
diff --git a/src/atl/ofi/atl_ofi_comm.cpp b/src/atl/ofi/atl_ofi_comm.cpp
index 603c93d5f..d8a9093dc 100644
--- a/src/atl/ofi/atl_ofi_comm.cpp
+++ b/src/atl/ofi/atl_ofi_comm.cpp
@@ -34,7 +34,7 @@ atl_ofi_comm::atl_ofi_comm() {
             pmi = std::shared_ptr<ipmi>(new pmi_resizable(k));
         }
         else {
-            LOG_ERROR("unknown %s: %s", PM_TYPE, pm_type_str);
+            LOG_ERROR("unknown ", PM_TYPE, ": ", pm_type_str);
         }
     }
     else {
@@ -55,7 +55,7 @@ atl_ofi_comm::atl_ofi_comm(std::shared_ptr<ikvs_wrapper> k) {
             pmi = std::shared_ptr<ipmi>(new pmi_resizable(k));
         }
         else {
-            LOG_ERROR("unknown %s: %s", PM_TYPE, pm_type_str);
+            LOG_ERROR("unknown ", PM_TYPE, ": ", pm_type_str);
         }
     }
     else {
diff --git a/src/atl/ofi/atl_ofi_helper.cpp b/src/atl/ofi/atl_ofi_helper.cpp
index d5b757ff4..39e9016f3 100644
--- a/src/atl/ofi/atl_ofi_helper.cpp
+++ b/src/atl/ofi/atl_ofi_helper.cpp
@@ -1352,9 +1352,9 @@ atl_status_t atl_ofi_open_nw_provs(atl_ofi_ctx_t& ctx,
         prov = &ctx.provs[prov_idx];
         prov->idx = prov_idx;
         prov->is_shm = 0;
-        ATL_CALL(atl_ofi_prov_init(
-                     ctx, coord, final_provs[idx], prov, attr, std::move(pmi), ep_names[prov->idx]),
-                 goto err);
+        ATL_CALL(
+            atl_ofi_prov_init(ctx, coord, final_provs[idx], prov, attr, pmi, ep_names[prov->idx]),
+            goto err);
     }
 
 exit:
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp
index aa54bf80e..92afca032 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp
@@ -98,7 +98,7 @@ kvs_status_t server::try_to_connect_new() {
         if ((new_socket = accept(poll_fds[FDI_LISTENER].fd,
                                  addr->get_sock_addr_ptr(),
                                  (socklen_t*)&peer_addr_size)) < 0) {
-            LOG_ERROR("server_listen_sock accept, %s", strerror(errno));
+            LOG_ERROR("server_listen_sock accept: ", strerror(errno));
             return KVS_STATUS_FAILURE;
         }
         for (size_t i = FDI_LAST; i < poll_fds.size(); i++) {
@@ -117,7 +117,7 @@ kvs_status_t server::try_to_connect_new() {
             if (close(new_socket)) {
                 // we are already returning failure, there is not much we can do
                 // except for logging the exact error that occurred
-                LOG_ERROR("error closing a socket, %s", strerror(errno));
+                LOG_ERROR("error closing a socket: ", strerror(errno));
             }
             return KVS_STATUS_FAILURE;
         }
@@ -329,7 +329,7 @@ kvs_status_t server::make_client_request(int& socket) {
         default: {
             if (request.name[0] == '\0')
                 return KVS_STATUS_SUCCESS;
-            LOG_ERROR("unknown request mode - %d.\n", request.mode);
+            LOG_ERROR("unknown request mode: ", request.mode);
             return KVS_STATUS_FAILURE;
         }
     }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
index 365e9a86a..5d6c5cc5d 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
@@ -98,7 +98,7 @@ kvs_status_t pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     for (i = 0, j = 0; i < num_listeners; i++, j++) {
         char* point_to_port = strstr(const_cast<char*>(sock_addr_str[j].c_str()), "_");
         if (point_to_port == NULL) {
-            LOG_ERROR("Wrong address_port record: %s", sock_addr_str[j]);
+            LOG_ERROR("Wrong address_port record: ", sock_addr_str[j]);
             status = KVS_STATUS_FAILURE;
             goto exit;
         }
@@ -117,7 +117,7 @@ kvs_status_t pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
         server_addresses[i].sin_family = AF_INET;
 
         if (inet_pton(AF_INET, sock_addr_str[j].c_str(), &(server_addresses[i].sin_addr)) <= 0) {
-            LOG_ERROR("Invalid address/ Address not supported: %s", sock_addr_str[j].c_str());
+            LOG_ERROR("Invalid address/ Address not supported: ", sock_addr_str[j].c_str());
             status = KVS_STATUS_FAILURE;
             goto exit;
         }
@@ -231,7 +231,7 @@ kvs_status_t pmi_listener::run_listener(std::shared_ptr<helper> h) {
                 return KVS_STATUS_SUCCESS;
             }
             if (errno != EINTR) {
-                LOG_ERROR("listner: accept error: %s\n", strerror(errno));
+                LOG_ERROR("listener: accept error: ", strerror(errno));
                 return KVS_STATUS_FAILURE;
             }
         }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
index 69b8b2b5c..0eabb2ce2 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
@@ -193,7 +193,7 @@ kvs_status_t pmi_resizable::PMIR_Update(void) {
                     break;
                 }
                 default: {
-                    LOG_ERROR("Unknown resize action: %d\n", answer);
+                    LOG_ERROR("Unknown resize action: ", answer);
                     KVS_CHECK_STATUS(PMIR_Finalize(), "failed to finalize");
                     return KVS_STATUS_FAILURE;
                 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
index b12cbdb7a..f876a4c07 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
@@ -190,11 +190,12 @@ atl_status_t pmi_resizable_simple::pmrt_kvs_put(char* kvs_key,
                                                 const void* kvs_val,
                                                 size_t kvs_val_len) {
     int ret;
-    char key_storage[max_keylen];
+    std::vector<char> key_storage(max_keylen);
     if (kvs_val_len > max_vallen)
         return ATL_STATUS_FAILURE;
 
-    ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
+    ret = snprintf(
+        key_storage.data(), max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
     if (ret < 0) {
         LOG_ERROR("sprintf failed");
         return ATL_STATUS_FAILURE;
@@ -206,7 +207,7 @@ atl_status_t pmi_resizable_simple::pmrt_kvs_put(char* kvs_key,
         return ATL_STATUS_FAILURE;
     }
 
-    ATL_CHECK_STATUS(kvs_set_value(KVS_NAME, key_storage, val_storage), "failed to set val");
+    ATL_CHECK_STATUS(kvs_set_value(KVS_NAME, key_storage.data(), val_storage), "failed to set val");
 
     return ATL_STATUS_SUCCESS;
 }
@@ -216,15 +217,16 @@ atl_status_t pmi_resizable_simple::pmrt_kvs_get(char* kvs_key,
                                                 void* kvs_val,
                                                 size_t kvs_val_len) {
     int ret;
-    char key_storage[max_keylen];
+    std::vector<char> key_storage(max_keylen);
 
-    ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
+    ret = snprintf(
+        key_storage.data(), max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
     if (ret < 0) {
         LOG_ERROR("sprintf failed");
         return ATL_STATUS_FAILURE;
     }
 
-    ATL_CHECK_STATUS(kvs_get_value(KVS_NAME, key_storage, val_storage), "failed to get val");
+    ATL_CHECK_STATUS(kvs_get_value(KVS_NAME, key_storage.data(), val_storage), "failed to get val");
 
     ret = decode(val_storage, kvs_val, kvs_val_len);
     if (ret) {
@@ -274,10 +276,13 @@ atl_status_t pmi_resizable_simple::kvs_get_value(const char* kvs_name,
     } while (value_vec.empty() && kvs_get_time < kvs_get_timeout);
 
     if (kvs_get_time >= kvs_get_timeout) {
-        LOG_ERROR("KVS get error: timeout limit (%zu > %zu), prefix: %s, key %s\n",
+        LOG_ERROR("KVS get error: timeout limit: ",
                   kvs_get_time,
+                  " > ",
                   kvs_get_timeout,
+                  ", prefix: ",
                   result_kvs_name.c_str(),
+                  ", key: ",
                   key);
         return ATL_STATUS_FAILURE;
     }
@@ -336,7 +341,7 @@ atl_status_t pmi_resizable_simple::register_my_proc_name() {
     char hostname[hostname_len];
     int ret = gethostname(hostname, hostname_len);
     if (ret) {
-        LOG_ERROR("gethostname error: %s\n", strerror(errno));
+        LOG_ERROR("gethostname error: ", strerror(errno));
         return ATL_STATUS_FAILURE;
     }
     my_process_name = std::string(hostname) + std::to_string(my_pid);
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp
index 28c308d10..50413594b 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp
@@ -214,13 +214,14 @@ atl_status_t pmi_resizable_simple_internal::pmrt_kvs_put(char* kvs_key,
                                                          const void* kvs_val,
                                                          size_t kvs_val_len) {
     int ret;
-    char key_storage[max_keylen];
+    std::vector<char> key_storage(max_keylen);
     if (kvs_val_len > max_vallen) {
         LOG_ERROR("asked len > max len");
         return ATL_STATUS_FAILURE;
     }
 
-    ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
+    ret = snprintf(
+        key_storage.data(), max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
     if (ret < 0) {
         LOG_ERROR("snprintf failed");
         return ATL_STATUS_FAILURE;
@@ -232,7 +233,7 @@ atl_status_t pmi_resizable_simple_internal::pmrt_kvs_put(char* kvs_key,
         return ATL_STATUS_FAILURE;
     }
 
-    ATL_CHECK_STATUS(kvs_set_value(KVS_NAME, key_storage, val_storage), "failed to set val");
+    ATL_CHECK_STATUS(kvs_set_value(KVS_NAME, key_storage.data(), val_storage), "failed to set val");
 
     return ATL_STATUS_SUCCESS;
 }
@@ -242,16 +243,22 @@ atl_status_t pmi_resizable_simple_internal::pmrt_kvs_get(char* kvs_key,
                                                          void* kvs_val,
                                                          size_t kvs_val_len) {
     int ret;
-    char key_storage[max_keylen];
+    std::vector<char> key_storage(max_keylen);
     std::string val_storage_str;
+    if (kvs_val_len > max_vallen) {
+        LOG_ERROR("asked len > max len");
+        return ATL_STATUS_FAILURE;
+    }
 
-    ret = snprintf(key_storage, max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
+    ret = snprintf(
+        key_storage.data(), max_keylen - 1, RESIZABLE_PMI_RT_KEY_FORMAT, kvs_key, proc_idx);
     if (ret < 0) {
         LOG_ERROR("snprintf failed");
         return ATL_STATUS_FAILURE;
     }
 
-    ATL_CHECK_STATUS(kvs_get_value(KVS_NAME, key_storage, val_storage_str), "failed to get val");
+    ATL_CHECK_STATUS(kvs_get_value(KVS_NAME, key_storage.data(), val_storage_str),
+                     "failed to get val");
 
     ret = decode(val_storage_str.c_str(), kvs_val, kvs_val_len);
     if (ret) {
@@ -306,10 +313,13 @@ atl_status_t pmi_resizable_simple_internal::kvs_get_value(const std::string& kvs
     } while (value.empty() && kvs_get_time < kvs_get_timeout);
 
     if (kvs_get_time >= kvs_get_timeout) {
-        LOG_ERROR("KVS get error: timeout limit (%zu > %zu), prefix: %s, key %s\n",
+        LOG_ERROR("KVS get error: timeout limit: ",
                   kvs_get_time,
+                  " > ",
                   kvs_get_timeout,
+                  ", prefix: ",
                   result_kvs_name.c_str(),
+                  ", key: ",
                   key);
         return ATL_STATUS_FAILURE;
     }
diff --git a/src/atl/util/pm/pmi_rt/pmi_simple.h b/src/atl/util/pm/pmi_rt/pmi_simple.h
index 4f6df3895..80142d89d 100644
--- a/src/atl/util/pm/pmi_rt/pmi_simple.h
+++ b/src/atl/util/pm/pmi_rt/pmi_simple.h
@@ -19,6 +19,8 @@
 class pmi_simple final : public ipmi {
 public:
     pmi_simple();
+    pmi_simple(const pmi_simple &other) = delete;
+    pmi_simple &operator=(const pmi_simple &other) = delete;
     ~pmi_simple() override;
 
     int is_pm_resize_enabled() override;
diff --git a/src/ccl_api_functions.cpp b/src/ccl_api_functions.cpp
index 50966ba35..7f7a8594a 100644
--- a/src/ccl_api_functions.cpp
+++ b/src/ccl_api_functions.cpp
@@ -25,6 +25,13 @@
 
 #include "ccl_api_functions_generators.hpp"
 #include "common/global/global.hpp"
+#include "common/api_wrapper/mpi_api_wrapper.hpp"
+
+#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+#include "coll/algorithms/allgatherv/sycl/allgatherv_sycl.hpp"
+#include "coll/algorithms/allreduce/sycl/allreduce_sycl.hpp"
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.hpp"
+#endif // CCL_ENABLE_ZE || CCL_ENABLE_SYCL
 
 namespace ccl {
 
@@ -147,9 +154,58 @@ event allgatherv(const void* send_buf,
                  const stream& op_stream,
                  const allgatherv_attr& attr,
                  const vector_class<event>& deps) {
+#if !(defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL))
     impl_dispatch disp;
     return disp(comm)->allgatherv(
         send_buf, send_count, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
+#else // defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+    impl_dispatch disp;
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    bool is_single_node = false;
+    bool is_oversubscription = true;
+    if (ccl::global_data::env().backend == backend_mode::native) {
+        const ccl::topo_manager& topo_manager = global_comm->get_topo_manager();
+        is_single_node = topo_manager.is_single_node;
+        is_oversubscription = topo_manager.has_oversubscription();
+    }
+
+    sycl::queue q = op_stream.get_native();
+
+    if (ccl::global_data::env().skip_scheduler && q.is_in_order() && is_single_node &&
+        comm.size() == ccl::global_data::get().get_local_proc_count() && !is_oversubscription &&
+        (dtype == ccl::datatype::float16 || dtype == ccl::datatype::bfloat16 ||
+         dtype == ccl::datatype::float32 || dtype == ccl::datatype::int32)) {
+        LOG_DEBUG("|CCL_SYCL| allgatherv selects sycl-kernels send_count: ",
+                  send_count,
+                  ", datatype: ",
+                  dtype);
+
+        bool done = false;
+        ccl::event e = allgather_sycl(q,
+                                      send_buf,
+                                      send_count,
+                                      recv_buf,
+                                      recv_counts,
+                                      dtype,
+                                      comm,
+                                      op_stream,
+                                      attr,
+                                      deps,
+                                      done);
+        if (done) {
+            if (ccl::global_data::env().enable_op_sync) {
+                e.wait();
+            }
+            return e;
+        }
+    }
+
+    LOG_DEBUG(
+        "|CCL_SCHED| allgatherv using scheduler send_count: ", send_count, ", datatype: ", dtype);
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
+#endif // defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
 }
 
 event allgatherv(const void* send_buf,
@@ -310,9 +366,48 @@ event allreduce(const void* send_buf,
                 const stream& op_stream,
                 const allreduce_attr& attr,
                 const vector_class<event>& deps) {
+#if !(defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL))
     impl_dispatch disp;
     return disp(comm)->allreduce(
         send_buf, recv_buf, count, dtype, reduction, disp(op_stream), attr, deps);
+#else // defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+    impl_dispatch disp;
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    bool is_single_node = false;
+    bool is_oversubscription = true;
+    if (ccl::global_data::env().backend == backend_mode::native) {
+        const ccl::topo_manager& topo_manager = global_comm->get_topo_manager();
+        is_single_node = topo_manager.is_single_node;
+        is_oversubscription = topo_manager.has_oversubscription();
+    }
+
+    sycl::queue q = op_stream.get_native();
+
+    if (ccl::global_data::env().skip_scheduler && is_single_node && !is_oversubscription &&
+        q.is_in_order() && reduction == ccl::reduction::sum &&
+        comm.size() == ccl::global_data::get().get_local_proc_count() &&
+        (dtype == ccl::datatype::float16 || dtype == ccl::datatype::bfloat16 ||
+         dtype == ccl::datatype::float32 || dtype == ccl::datatype::int32)) {
+        LOG_DEBUG(
+            "|CCL_SYCL| allreduce selects sycl-kernels count: ", count, ", datatype: ", dtype);
+
+        bool done = false;
+        ccl::event e = allreduce_sycl(
+            q, send_buf, recv_buf, count, dtype, reduction, comm, op_stream, attr, deps, done);
+        if (done) {
+            if (ccl::global_data::env().enable_op_sync) {
+                e.wait();
+            }
+            return e;
+        }
+    }
+
+    LOG_DEBUG("|CCL_SCHED| allreduce selects scheduler count: ", count, ", datatype: ", dtype);
+    return disp(comm)->allreduce(
+        send_buf, recv_buf, count, dtype, reduction, disp(op_stream), attr, deps);
+#endif // defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+    assert(false);
 }
 
 event allreduce(const void* send_buf,
@@ -857,9 +952,52 @@ event reduce_scatter(const void* send_buf,
                      const stream& op_stream,
                      const reduce_scatter_attr& attr,
                      const vector_class<event>& deps) {
+#if !(defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL))
     impl_dispatch disp;
     return disp(comm)->reduce_scatter(
         send_buf, recv_buf, recv_count, dtype, reduction, disp(op_stream), attr, deps);
+#else // defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+    impl_dispatch disp;
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    bool is_single_node = false;
+    bool is_oversubscription = true;
+    if (ccl::global_data::env().backend == backend_mode::native) {
+        const ccl::topo_manager& topo_manager = global_comm->get_topo_manager();
+        is_single_node = topo_manager.is_single_node;
+        is_oversubscription = topo_manager.has_oversubscription();
+    }
+
+    sycl::queue q = op_stream.get_native();
+
+    if (ccl::global_data::env().skip_scheduler && is_single_node && q.is_in_order() &&
+        !is_oversubscription && comm.size() == ccl::global_data::get().get_local_proc_count() &&
+        (dtype == ccl::datatype::float16 || dtype == ccl::datatype::bfloat16 ||
+         dtype == ccl::datatype::float32 || dtype == ccl::datatype::int32) &&
+        reduction == ccl::reduction::sum) {
+        LOG_DEBUG("|CCL_SYCL| reduce_scatter selects sycl-kernels recv_count: ",
+                  recv_count,
+                  ", datatype: ",
+                  dtype)
+        ccl::event e;
+        bool done = false;
+        e = reduce_scatter_sycl(
+            q, send_buf, recv_buf, recv_count, dtype, reduction, comm, op_stream, done);
+        if (done) {
+            if (ccl::global_data::env().enable_op_sync) {
+                e.wait();
+            }
+            return e;
+        }
+    }
+
+    LOG_DEBUG("|CCL_SCHED| reduce_scatter using scheduler recv_count: ",
+              recv_count,
+              ", datatype: ",
+              dtype);
+    return disp(comm)->reduce_scatter(
+        send_buf, recv_buf, recv_count, dtype, reduction, disp(op_stream), attr, deps);
+#endif // defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
 }
 
 event reduce_scatter(const void* send_buf,
diff --git a/src/coll/algorithms/algorithm_utils.cpp b/src/coll/algorithms/algorithm_utils.cpp
index fdab96d67..9bc16e76d 100644
--- a/src/coll/algorithms/algorithm_utils.cpp
+++ b/src/coll/algorithms/algorithm_utils.cpp
@@ -89,14 +89,50 @@ void ccl_get_segment_sizes(size_t dtype_size,
     }
 }
 
+bool ccl_is_ptr_aligned(uintptr_t ptr, size_t alignment) {
+    CCL_THROW_IF_NOT(alignment != 0, "memory alignment cannot be 0 by definition");
+    return (ptr % alignment) == 0;
+}
+
 #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
+static bool is_reorderable_algo(const char* algo_name) {
+    const char* reordable_algo_prefixes[] = {
+        "ALLREDUCE_PIPE",
+        "ALLGATHERV_PIPE",
+        "REDUCE_PIPE",
+        "REDUCE_SCATTER_PIPE",
+    };
+    for (auto& reordable_algo_prefix : reordable_algo_prefixes) {
+        if (0 == strncmp(algo_name, reordable_algo_prefix, strlen(reordable_algo_prefix))) {
+            return true;
+        }
+    }
+    return false;
+}
 
-uint32_t submit_ze_commands_in_subsched_entries(ccl_sched* sched) {
+// Reorders commands to minimize wait time
+//
+// Sample case before submission, 3-stage command split into 2 chunks:
+//      chunk_0.0 chunk_0.1 chunk_0.2 chunk_1.0 chunk_1.1 chunk_1.2
+//
+// Problem: each command within chunk depends on its predecessor, serial execution
+//
+// Submission order within this function:
+//      chunk_0.0 chunk_1.0 chunk_0.1 chunk_1.1 chunk_0.2 chunk_1.2
+// allows parallel execution of e.g. chunk_0.0 and chunk_1.1
+//
+// Reordering only happens for entries that have a name that matches `is_reorderable_algo`
+//
+// Opportunity for micro-optimization and simplification: compare enum instead of strings
+uint32_t ccl_submit_ze_commands_in_subsched_entries(ccl_sched* sched) {
     std::vector<subsched_entry*> subsched_chunks;
     for (auto& entry : sched->entries) {
-        if (!strncmp(entry->name(), "ALLREDUCE_PIPE", strlen("ALLREDUCE_PIPE"))) {
+        if (is_reorderable_algo(entry->name())) {
             subsched_chunks.push_back(static_cast<subsched_entry*>(entry.get()));
         }
+        else {
+            LOG_DEBUG("entry: ", entry->name(), "is NOT reorderable algo")
+        }
     }
 
     auto chunk_count = subsched_chunks.size();
@@ -106,8 +142,10 @@ uint32_t submit_ze_commands_in_subsched_entries(ccl_sched* sched) {
     bool done = false;
     uint32_t command_count = 0;
     int cmd_idx = 0;
+    // iterate over each stage
     while (!done) {
         done = true;
+        // iterate over each chunk
         for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
             LOG_DEBUG("cmd_idx=",
                       cmd_idx,
@@ -136,4 +174,175 @@ uint32_t submit_ze_commands_in_subsched_entries(ccl_sched* sched) {
     return command_count;
 }
 
+// checks if pipelining is enabled and returns pipelining parameters
+// output params:
+//  - main_chunk_count: number of elements in the "standard" chunk, i.e. not counting the remainder
+//  - mem_align: alignment required for each chunk; cannot be zero
+bool ccl_is_pipe_enabled(const size_t count,
+                         const size_t dtype_size,
+                         const size_t chunk_count,
+                         size_t& main_chunk_count,
+                         size_t& mem_align) {
+    bool ret = true;
+    // Note about cache lines and pipelining: The same cache line must contain
+    // a single chunk only.
+    //
+    // If the same cache line contains two chunks (or more), and we parallelize
+    // the instructions required for both chunks, a conflict (race condition)
+    // may appear between the copy-out for the scaleout portion and the
+    // reduce_scatter phase.
+    //
+    // The easiest way to avoid that race condition is to require that each
+    // cache line contains a single entry. If that is not the case, we must not
+    // parallelize the instructions for different chunks.
+
+    bool is_pipe = chunk_count > 1 && ccl::global_data::env().enable_ze_single_list;
+
+    // TODO: why does oneCCL have CACHELINE_SIZE *and* CCL_KERNEL_MEM_ALIGN?
+    mem_align = ccl::global_data::env().kernel_mem_align;
+    CCL_THROW_IF_NOT(mem_align != 0, "memory alignment cannot be zero by definition");
+    size_t buf_size_bytes = count * dtype_size;
+
+    // First, determine if we need to fall back to a non-pipelined algorithm.
+    // Such a fallback may happen in cases such as (1) the user requests it,
+    // (2) message fits into a cache line, or (3) the cache line size is not
+    // divisible by the data type size.
+
+    size_t number_of_cache_lines_per_chunk =
+        is_pipe ? std::max(mem_align, buf_size_bytes / chunk_count) / mem_align : 1;
+    size_t main_chunk_size_bytes = mem_align * number_of_cache_lines_per_chunk;
+    main_chunk_count = main_chunk_size_bytes / dtype_size;
+
+    bool is_dtype_divisible = ((main_chunk_size_bytes % dtype_size) == 0);
+    bool is_msg_bigger_than_cache_line = buf_size_bytes > main_chunk_size_bytes;
+
+    bool is_singleworker =
+        !ccl::global_data::env().ze_multi_workers || (ccl::global_data::env().worker_count <= 1);
+
+    if (!is_pipe) {
+        LOG_DEBUG("Pipelining code disabled");
+        ret = false;
+    }
+    else {
+        if (!is_dtype_divisible) {
+            LOG_INFO("Running without pipelining because datatype size (",
+                     dtype_size,
+                     ") is not divisible by cache line size (",
+                     mem_align,
+                     ")");
+            ret = false;
+        }
+        if (!is_msg_bigger_than_cache_line) {
+            LOG_INFO("Running without pipelining because message size (",
+                     buf_size_bytes,
+                     ") is smaller than a cache line (",
+                     mem_align,
+                     ") or than main_chunk_size_bytes (",
+                     main_chunk_size_bytes,
+                     ")");
+            ret = false;
+        }
+        if (!is_singleworker) {
+            LOG_INFO("Running without pipelining because ze_multi_workers was requested with ",
+                     ccl::global_data::env().worker_count,
+                     " workers, which is more than one worker ");
+            ret = false;
+        }
+    }
+
+    return ret;
+}
+
+ccl::status ccl_build_topo_uniform_buff_size_op(
+    ccl_sched* sched,
+    ccl_buffer send_buf,
+    ccl_buffer recv_buf,
+    size_t count,
+    size_t dtype_size,
+    size_t pipe_nof_chunks,
+    const std::string& op_name,
+    ccl::profile::metrics_counter& metrics,
+    std::function<
+        ccl::status(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, size_t count)>
+        fill_op_lambda) {
+    size_t mem_align = 0;
+    size_t main_chunk_count = 0;
+    if (!ccl_is_pipe_enabled(count, dtype_size, pipe_nof_chunks, main_chunk_count, mem_align)) {
+        // Fall back to topo algorithm without pipelining
+        fill_op_lambda(sched, send_buf, recv_buf, count);
+        entry_factory::create<ze_execute_cmdlists_on_init_entry>(sched);
+
+        return ccl::status::success;
+    }
+
+    LOG_DEBUG("build pipe ", op_name);
+
+    // Need to re-calculate chunk_count after main_chunk_size_bytes calculation
+    // with cache alignment in mind.
+    size_t chunk_count = count / main_chunk_count;
+    size_t last_chunk_count = main_chunk_count + (count % main_chunk_count);
+
+    sched->try_enable_ze_single_list();
+    auto sync_obj = std::make_shared<sync_object>(chunk_count);
+    bool is_parallelizable_chunks = true;
+
+    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+        size_t chunk_offset = chunk_idx * main_chunk_count * dtype_size;
+        ccl_buffer sbuf = send_buf + chunk_offset;
+        ccl_buffer rbuf = recv_buf + chunk_offset;
+        size_t this_chunk_count =
+            (chunk_idx == (chunk_count - 1)) ? last_chunk_count : main_chunk_count;
+
+        if (this_chunk_count || (count == 0 && chunk_idx == 0)) {
+            entry_factory::create<subsched_entry>(
+                sched,
+                chunk_idx,
+                [sched, sbuf, rbuf, this_chunk_count, sync_obj, fill_op_lambda](ccl_sched* s) {
+                    s->inherit_ze_managers_from(sched);
+                    s->set_init_ze_hook_sync_obj(sync_obj);
+                    s->set_ze_commands_bypass_flag(false);
+
+                    fill_op_lambda(s, sbuf, rbuf, this_chunk_count);
+                },
+                (op_name + "_PIPE" + std::to_string(chunk_idx)).c_str());
+        }
+        // WARNING: previous chunk has part of this chunk's first cache
+        // line. Cannot use pipelining. However, since this is a
+        // "local" decision (i.e., other ranks may decide differently),
+        // we still need to apply chunking. However, we will run one
+        // chunk at a time, without parallelizing them.
+        // Another way to have implemented this would be to link the
+        // last task of the prev chunk with the first of this chunk
+        // with an event.
+        is_parallelizable_chunks &=
+            ccl_is_ptr_aligned(reinterpret_cast<uintptr_t>(rbuf.get_ptr()), mem_align);
+    }
+
+    static bool is_chunk_mem_align_warning_printed{};
+    if (!is_parallelizable_chunks && !is_chunk_mem_align_warning_printed) {
+        is_chunk_mem_align_warning_printed = true;
+        LOG_WARN(
+            "[",
+            op_name,
+            " pipelining]: For best performance, (i) chunk size should be a multiple of a cache line (",
+            mem_align,
+            " bytes), and (ii) buffers in all ranks should be aligned to ",
+            mem_align);
+    }
+
+    if (!is_parallelizable_chunks) {
+        metrics.nonparallel_calls_per_count[count]++;
+    }
+    else {
+        metrics.parallel_calls_per_count[count]++;
+    }
+
+    entry_factory::create<ze_execute_cmdlists_on_start_entry>(
+        sched,
+        sync_obj,
+        is_parallelizable_chunks ? ccl_submit_ze_commands_in_subsched_entries : nullptr);
+
+    return ccl::status::success;
+}
+
 #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
diff --git a/src/coll/algorithms/algorithm_utils.hpp b/src/coll/algorithms/algorithm_utils.hpp
index 86c522a0b..c6abb7848 100644
--- a/src/coll/algorithms/algorithm_utils.hpp
+++ b/src/coll/algorithms/algorithm_utils.hpp
@@ -18,7 +18,9 @@
 #include <vector>
 
 #include "common/utils/enums.hpp"
+#include "common/utils/buffer.hpp"
 #include "oneapi/ccl/types.hpp"
+#include "internal_types.hpp"
 
 #define CCL_COLL_LIST \
     ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, ccl_coll_alltoallv, \
@@ -168,5 +170,23 @@ void ccl_get_segment_sizes(size_t dtype_size,
 class ccl_sched;
 
 #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
-uint32_t submit_ze_commands_in_subsched_entries(ccl_sched* sched);
+bool ccl_is_pipe_enabled(const size_t count,
+                         const size_t dtype_size,
+                         const size_t chunk_count,
+                         size_t& main_chunk_count,
+                         size_t& mem_align);
+bool ccl_is_ptr_aligned(uintptr_t start_ptr, size_t mem_align);
+ccl::status ccl_build_topo_uniform_buff_size_op(
+    ccl_sched* sched,
+    ccl_buffer send_buf,
+    ccl_buffer recv_buf,
+    size_t count,
+    size_t dtype_size,
+    size_t pipe_nof_chunks,
+    const std::string& op_name,
+    ccl::profile::metrics_counter& metrics,
+    std::function<
+        ccl::status(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, size_t count)>
+        fill_op_lambda);
+uint32_t ccl_submit_ze_commands_in_subsched_entries(ccl_sched* sched);
 #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
diff --git a/src/coll/algorithms/algorithms.hpp b/src/coll/algorithms/algorithms.hpp
index 6e90a12d0..e31dfda8c 100644
--- a/src/coll/algorithms/algorithms.hpp
+++ b/src/coll/algorithms/algorithms.hpp
@@ -44,6 +44,7 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* main_sched,
                                            size_t send_count,
                                            ccl_buffer recv_buf,
                                            const size_t* recv_counts,
+                                           const std::vector<ccl_buffer>& recv_device_bufs,
                                            const ccl_datatype& dtype,
                                            ccl_comm* comm);
 ccl::status ccl_coll_build_flat_allgatherv(ccl_sched* main_sched,
@@ -85,6 +86,7 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched,
                                           ccl_buffer send_buf,
                                           ccl_buffer recv_buf,
                                           size_t count,
+                                          const std::vector<ccl_buffer>& recv_device_bufs,
                                           const ccl_datatype& dtype,
                                           ccl::reduction reduction,
                                           ccl_comm* comm);
@@ -229,6 +231,14 @@ ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched,
                                            int root,
                                            ccl_comm* comm);
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+ccl::status ccl_coll_build_topo_reduce_fill(ccl_sched* sched,
+                                            ccl_buffer send_buf,
+                                            ccl_buffer recv_buf,
+                                            size_t count,
+                                            const ccl_datatype& dtype,
+                                            ccl::reduction reduction,
+                                            int root,
+                                            ccl_comm* comm);
 ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched,
                                        ccl_buffer send_buf,
                                        ccl_buffer recv_buf,
@@ -262,6 +272,13 @@ ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
                                                      ccl::reduction reduction,
                                                      ccl_comm* comm);
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+ccl::status ccl_coll_build_topo_reduce_scatter_fill(ccl_sched* sched,
+                                                    ccl_buffer send_buf,
+                                                    ccl_buffer recv_buf,
+                                                    size_t send_count,
+                                                    const ccl_datatype& dtype,
+                                                    ccl::reduction reduction,
+                                                    ccl_comm* comm);
 ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
                                                ccl_buffer send_buf,
                                                ccl_buffer recv_buf,
diff --git a/src/coll/algorithms/allgatherv.cpp b/src/coll/algorithms/allgatherv/allgatherv.cpp
similarity index 83%
rename from src/coll/algorithms/allgatherv.cpp
rename to src/coll/algorithms/allgatherv/allgatherv.cpp
index 428a2e589..e26069a3c 100644
--- a/src/coll/algorithms/allgatherv.cpp
+++ b/src/coll/algorithms/allgatherv/allgatherv.cpp
@@ -95,6 +95,7 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* main_sched,
                                            size_t send_count,
                                            ccl_buffer recv_buf,
                                            const size_t* recv_counts,
+                                           const std::vector<ccl_buffer>& recv_device_bufs,
                                            const ccl_datatype& dtype,
                                            ccl_comm* comm) {
     LOG_DEBUG("build ring allgatherv, send_count ", send_count);
@@ -110,6 +111,10 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* main_sched,
     int comm_size = comm->size();
     const size_t sched_count = scheds.size();
     const size_t dtype_size = dtype.size();
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    // Note: HMEM case does not require copy to device stage
+    bool enable_hmem = (ccl::global_data::env().use_hmem && atl_base_comm::attr.out.enable_hmem);
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
     size_t* offsets = static_cast<size_t*>(CCL_MALLOC(comm_size * sizeof(size_t), "offsets"));
     offsets[0] = 0;
@@ -185,6 +190,30 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* main_sched,
             // in the next loop iteration, we are sending the received data-block forward
             // following the ring algorithm. Therefore, barrier is needed.
             scheds[s_idx]->add_barrier();
+
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+            // in scale-out case it is possible to start copying the data to device
+            // right after the receive operation, overlapping with the next send operation.
+            if (recv_block_thread_counts[s_idx] && !enable_hmem && !recv_device_bufs.empty()) {
+                // express dependency between the recv_entry and ze_copy_entry
+                auto signaled_event = ccl::add_signal_event(scheds[s_idx]);
+
+                // prepare buffers
+                ccl_buffer copy_src = rbuf + recv_sched_offset[s_idx];
+                ccl_buffer copy_dst = recv_device_bufs[recv_block_idx] + recv_sched_offset[s_idx];
+                size_t copy_counts = recv_block_thread_counts[s_idx];
+
+                // Submit parallel H2D copy with the next send operation
+                entry_factory::create<ze_copy_entry>(
+                    scheds[s_idx],
+                    copy_src,
+                    copy_dst,
+                    copy_counts,
+                    dtype,
+                    copy_attr(copy_direction::h2d),
+                    std::vector<ze_event_handle_t>{ signaled_event });
+            }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         }
 
         block_idx = (comm_size + block_idx - 1) % comm_size; // move left
@@ -786,7 +815,9 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched,
                                            std::vector<ccl_sched*>& scheds,
                                            const ccl_coll_param& coll_param) {
     size_t chunk_count = ccl::global_data::env().allgatherv_pipe_chunk_count;
-    bool is_pipe = chunk_count > 0 && ccl::global_data::env().enable_ze_single_list;
+    bool is_pipe = chunk_count > 1 && ccl::global_data::env().enable_ze_single_list;
+    bool is_multiworker =
+        ccl::global_data::env().ze_multi_workers && ccl::global_data::env().worker_count > 1;
 
     CCL_THROW_IF_NOT(
         scheds.size() == 1, "size of schedule list must be one, but is ", scheds.size());
@@ -798,20 +829,27 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched,
 
     std::vector<ccl_buffer> recv_bufs{};
     const std::vector<size_t>& recv_counts = coll_param.recv_counts;
+    const std::vector<size_t> main_chunk_counts(recv_counts.size());
     ccl_coll_get_allgatherv_bufs(coll_param, recv_bufs);
 
     const size_t send_count = recv_counts[comm->rank()];
     ccl_buffer send_buf;
+
     if (is_inplace) {
         send_buf = recv_bufs[comm->rank()];
     }
     else {
         send_buf = ccl_buffer(coll_param.get_send_buf(), send_count * dtype.size());
     }
-
-    if (!is_pipe) {
+    if (!is_pipe || is_multiworker) {
         // Fall back to topo algorithm without pipelining
-        LOG_DEBUG("build topo allgatherv - pipe allgatherv disabled");
+        if (!is_pipe) {
+            LOG_DEBUG("build topo allgatherv - pipe allgatherv disabled");
+        }
+        if (is_multiworker) {
+            LOG_INFO(
+                "Running without pipelining because ze_multi_workers was requested with more than one worker");
+        }
 
         ccl_coll_build_topo_allgatherv_fill(
             sched, send_buf, send_count, recv_bufs, recv_counts, dtype, comm, is_inplace);
@@ -823,45 +861,85 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched,
 
     LOG_DEBUG("build pipe allgatherv");
 
+    sched->try_enable_ze_single_list();
+
+    // recalculate size of chunks based on cache alignment
+    // e.g. we might need to have bigger chunks than actually calculated due to cache alignment
+    // chunk_count stays the same, but some chunks might be bigger and others might be zero-sized
+    auto sync_obj = std::make_shared<sync_object>(chunk_count);
+    // count that caused pipelining should be printed => use array of bools
+    std::vector<bool> is_parallelizable_chunks_idx(comm->size(), true);
+    bool is_parallelizable_chunks = true; // convenience variable
+    size_t mem_align = 0;
+    std::vector<size_t> chunked_recv_counts(comm->size(), 0);
+    std::vector<ccl_buffer> chunked_recv_bufs(comm->size());
     for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
+        size_t send_buffer_offset = 0;
+        bool is_empty_total_size = true;
+        for (int rank_idx = 0; rank_idx < comm->size(); rank_idx++) {
+            size_t main_chunk_count = 0;
+            bool pipe_possible =
+                ccl_is_pipe_enabled(recv_counts[rank_idx],
+                                    dtype.size(),
+                                    ccl::global_data::env().allgatherv_pipe_chunk_count,
+                                    main_chunk_count,
+                                    mem_align);
+            if (!pipe_possible) {
+                // only a single buffer might have issues with alignment, but we need to pipeline
+                // either all or none; we split the chunks, but we don't parallelize them
+                is_parallelizable_chunks = false;
+                is_parallelizable_chunks_idx[rank_idx] = false;
+            }
+            // more than two last chunks might have different size
+            size_t already_processed_buffer_size = main_chunk_count * chunk_idx;
+            if (rank_idx == comm->rank()) {
+                // send_buffer is current recv_buffer
+                send_buffer_offset = already_processed_buffer_size;
+                // send_buffer_offset might be > recv_counts, but then - chunked_recv_counts == 0
+            }
+            size_t remaining_to_process =
+                (recv_counts[rank_idx] > already_processed_buffer_size)
+                    ? recv_counts[rank_idx] - already_processed_buffer_size
+                    : 0;
+            bool is_last_chunk = (chunk_idx == (chunk_count - 1));
+            chunked_recv_counts[rank_idx] = remaining_to_process < main_chunk_count || is_last_chunk
+                                                ? remaining_to_process
+                                                : main_chunk_count;
+            // chunked_recv_bufs[rank_idx] might be invalid if chunked_recv_counts[rank_idx] are 0
+            // zero sized buffer should be handled correctly without dereferencing the memory
+            chunked_recv_bufs[rank_idx] =
+                recv_bufs[rank_idx] + chunk_idx * main_chunk_count * dtype.size();
+            if (chunked_recv_counts[rank_idx] > 0) {
+                is_empty_total_size = false;
+            }
+        }
+
+        const size_t chunked_send_count = chunked_recv_counts[comm->rank()];
+        ccl_buffer chunked_send_buf{};
+        if (!is_inplace) {
+            // offset and pointer might be invalid only when sending zero-sized chunk
+            chunked_send_buf = send_buf + send_buffer_offset * dtype.size();
+        }
+        else {
+            chunked_send_buf = chunked_recv_bufs[comm->rank()];
+        }
         entry_factory::create<subsched_entry>(
             sched,
             chunk_idx,
-            [comm,
+            [sched,
+             comm,
              is_inplace,
              dtype,
-             send_buf,
-             send_count,
-             recv_bufs,
-             recv_counts,
+             chunked_send_buf,
+             chunked_send_count,
+             chunked_recv_bufs,
+             chunked_recv_counts,
+             is_empty_total_size,
              chunk_idx,
-             chunk_count](ccl_sched* s) {
-                bool is_empty_total_size = true;
-                std::vector<ccl_buffer> chunked_recv_bufs(comm->size());
-                std::vector<size_t> chunked_recv_counts(comm->size(), 0);
-
-                for (int idx = 0; idx < comm->size(); idx++) {
-                    size_t main_chunk_count = recv_counts[idx] / chunk_count;
-                    size_t last_chunk_count = main_chunk_count + recv_counts[idx] % chunk_count;
-                    chunked_recv_counts[idx] =
-                        (chunk_idx == (chunk_count - 1)) ? last_chunk_count : main_chunk_count;
-                    chunked_recv_bufs[idx] =
-                        recv_bufs[idx] + chunk_idx * main_chunk_count * dtype.size();
-                    if (chunked_recv_counts[idx] > 0) {
-                        is_empty_total_size = false;
-                    }
-                }
-
-                const size_t main_chunked_send_count = send_count / chunk_count;
-                const size_t chunked_send_count = chunked_recv_counts[comm->rank()];
-                ccl_buffer chunked_send_buf{};
-                if (!is_inplace) {
-                    chunked_send_buf =
-                        send_buf + chunk_idx * main_chunked_send_count * dtype.size();
-                }
-                else {
-                    chunked_send_buf = chunked_recv_bufs[comm->rank()];
-                }
+             sync_obj](ccl_sched* s) {
+                s->inherit_ze_managers_from(sched);
+                s->set_init_ze_hook_sync_obj(sync_obj);
+                s->set_ze_commands_bypass_flag(false);
 
                 if (is_empty_total_size) {
                     // TODO: ccl_coll_build_topo_allgatherv_fill should be able to handle 0-sized inputs!
@@ -881,10 +959,43 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched,
                                                            is_inplace);
             },
             ("ALLGATHERV_PIPE" + std::to_string(chunk_idx)).c_str());
-        sched->add_barrier();
+        for (auto& rbuf : chunked_recv_bufs) {
+            // WARNING: previous chunk has part of this chunk's first cache
+            // line. Cannot use pipelining. However, since this is a
+            // "local" decision (i.e., other ranks may decide differently),
+            // we still need to apply chunking. However, we will run one
+            // chunk at a time, without parallelizing them.
+            // Another way to have implemented this would be to link the
+            // last task of the prev chunk with the first of this chunk
+            // with an event.
+            is_parallelizable_chunks &=
+                ccl_is_ptr_aligned(reinterpret_cast<uintptr_t>(rbuf.get_ptr()), mem_align);
+        }
     }
 
-    entry_factory::create<ze_execute_cmdlists_on_init_entry>(sched);
+    static bool is_chunk_mem_align_warning_printed{};
+    if (!is_parallelizable_chunks && !is_chunk_mem_align_warning_printed) {
+        is_chunk_mem_align_warning_printed = true;
+        LOG_WARN(
+            "[allgatherv pipelining]: For best performance, (i) chunk size should be a multiple of a cache line (",
+            mem_align,
+            " bytes), and (ii) buffers in all ranks should be aligned to ",
+            mem_align);
+    }
+    for (int idx = 0; idx < comm->size(); idx++) {
+        if (!is_parallelizable_chunks_idx[idx]) {
+            ccl::global_data::get()
+                .metrics_profiler->allgatherv_pipe.nonparallel_calls_per_count[recv_counts[idx]]++;
+        }
+        else {
+            ccl::global_data::get()
+                .metrics_profiler->allgatherv_pipe.parallel_calls_per_count[recv_counts[idx]]++;
+        }
+    }
+    entry_factory::create<ze_execute_cmdlists_on_start_entry>(
+        sched,
+        sync_obj,
+        is_parallelizable_chunks ? ccl_submit_ze_commands_in_subsched_entries : nullptr);
 
     return ccl::status::success;
 }
diff --git a/src/coll/algorithms/allgatherv/sycl/.clang-format b/src/coll/algorithms/allgatherv/sycl/.clang-format
new file mode 100644
index 000000000..726f33fd2
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/.clang-format
@@ -0,0 +1,145 @@
+---
+Language: Cpp
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: true
+AlignEscapedNewlines: DontAlign
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Empty
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: true
+  BeforeElse: true
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: false
+ColumnLimit: 115
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: true
+DisableFormat: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language: Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+  - Language: TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+ReflowComments: false
+SortIncludes: false
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth: 1
+UseTab: Never
+...
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.cpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.cpp
new file mode 100644
index 000000000..5bf69d929
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.cpp
@@ -0,0 +1,249 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allgatherv/sycl/allgatherv_large_sycl_impl.hpp"
+#include "sched/entry/factory/entry_factory.hpp"
+
+std::pair<ccl_sched*, ze_handle_exchange_entry*> do_ipc_exchange(ccl_comm* comm,
+                                                                 ccl_stream* stream,
+                                                                 std::vector<void*> ptrs) {
+    ccl_comm* node_comm = comm->get_node_comm().get();
+    std::vector<ze_handle_exchange_entry::mem_desc_t> in_buffers;
+
+    for (auto ptr : ptrs) {
+        in_buffers.emplace_back(ptr, ccl::ze::ipc_mem_type::memory);
+    }
+
+    ccl_coll_param param{};
+    param.comm = comm;
+    param.stream = stream;
+    ccl_coll_attr attr{};
+    static ccl_sched* sched = ccl_sched::create(param, attr);
+    ccl::utils::pt2pt_handle_exchange_info info = {};
+    int skip_rank = ccl_comm::invalid_rank;
+
+    ze_handle_exchange_entry* exchange_entry =
+        new ze_handle_exchange_entry(sched, node_comm, in_buffers, skip_rank, info);
+    exchange_entry->update();
+    return { sched, exchange_entry };
+}
+
+template <int N>
+ccl::event allgatherv_large_type(const void* send_buf,
+                                 size_t send_count,
+                                 void* recv_buf,
+                                 const ccl::vector_class<size_t>& recv_counts,
+                                 ccl::datatype dtype,
+                                 const ccl::communicator& comm,
+                                 const ccl::stream& op_stream,
+                                 const ccl::allgatherv_attr& attr,
+                                 const ccl::vector_class<ccl::event>& deps) {
+    ccl::event e;
+    switch (dtype) {
+        case ccl::datatype::float16:
+            e = allgatherv_large_impl<sycl::half, N>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case ccl::datatype::bfloat16:
+            e = allgatherv_large_impl<short, N>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case ccl::datatype::int32:
+            e = allgatherv_large_impl<int, N>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case ccl::datatype::float32:
+            e = allgatherv_large_impl<float, N>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        default: assert(false); break;
+    }
+    return e;
+}
+
+ccl::event allgatherv_large(const void* send_buf,
+                            size_t send_count,
+                            void* recv_buf,
+                            const ccl::vector_class<size_t>& recv_counts,
+                            ccl::datatype dtype,
+                            const ccl::communicator& comm,
+                            const ccl::stream& op_stream,
+                            const ccl::allgatherv_attr& attr,
+                            const ccl::vector_class<ccl::event>& deps) {
+    ccl::impl_dispatch disp;
+    static size_t allgatherv_count = 0;
+
+    sycl::queue q = op_stream.get_native();
+    assert(q.is_in_order());
+
+    const int comm_rank = comm.rank();
+    const int comm_size = comm.size();
+
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    std::shared_ptr<ccl_comm> pair_comm = global_comm->get_pair_comm();
+    std::shared_ptr<ccl_comm> even_comm = global_comm->get_even_comm();
+    std::shared_ptr<ccl_comm> node_comm = global_comm->get_node_comm();
+
+    const int pair_comm_size = pair_comm->size();
+    const int even_comm_size = even_comm->size();
+    const int node_comm_size = node_comm->size();
+
+    allgatherv_count++;
+    const bool is_use_tmp = ccl::global_data::env().allgatherv_use_tmp_buf;
+
+    if (allgatherv_count == 1) {
+        LOG_INFO("invoking allgatherv large kernel first time");
+        const size_t tmp_buffer_size = ccl::global_data::env().allgatherv_chunk_size * node_comm_size * 2;
+        tmp_buf = sycl::aligned_alloc_device<char>(4096, tmp_buffer_size, q);
+
+        sync_remote_ptrs[node_comm->rank()] = sycl::malloc_device<size_t>(MAX_NODE_RANKS, q);
+        q.memset(sync_remote_ptrs[node_comm->rank()], 0, MAX_NODE_RANKS * sizeof(size_t)).wait();
+    }
+    {
+            // ipc exchange needs to be done every time if tmp buffer is not used and
+        // when tmp buffer is used, it needs to be done only for the first time
+        if (!is_use_tmp || allgatherv_count == 1) {
+            void* data_buf_send = is_use_tmp ? tmp_buf : (void*)send_buf;
+            void* data_buf_recv = is_use_tmp ? tmp_buf : recv_buf;
+            bool to_cache = !is_use_tmp;
+
+            std::vector<void*> ptrs{ data_buf_send, data_buf_recv }; // index 0 and 1
+            // need to get sync remote pointers only once
+            if (allgatherv_count == 1) {
+                ptrs.push_back(sync_remote_ptrs[comm_rank]); // index 2
+            }
+
+            auto [sched, exchange_entry] = do_ipc_exchange(global_comm, disp(op_stream).get(), ptrs);
+
+            // kernels are unable to read/write data from the ipc pointers and
+            // as a workaround touching them once with memcpy seems to fix the issue
+            std::vector<sycl::event> dummy_copy_events;
+            static sycl::queue q_worker(q.get_device());
+            // need to get sync remote pointers only once
+            if (allgatherv_count == 1) {
+                for (int i = 1; i < node_comm_size; i++) {
+                    int peer_rank = (node_comm->rank() + i) % node_comm_size;
+                    ccl_buffer tmp_ccl_buf;
+                    sched->get_memory().handle_manager.get(
+                        peer_rank, 2, tmp_ccl_buf, node_comm.get(), false /*pt2pt_op*/, false);
+                    CCL_THROW_IF_NOT(tmp_ccl_buf.get_ptr(), "null IPC buffer is received");
+                    sync_remote_ptrs[peer_rank] = (size_t*)tmp_ccl_buf.get_ptr();
+                    dummy_copy_events.push_back(q_worker.memcpy(tmp_buf, sync_remote_ptrs[peer_rank], 1));
+                }
+            }
+
+            // get xelink remote pointers
+            auto ccl_dtype = ccl::global_data::get().dtypes->get(dtype);
+            const int dsize = ccl_dtype.size();
+            const int global_rank = even_comm->get_global_rank(even_comm->rank());
+            const size_t adjust_offset_count = (is_use_tmp || send_buf == recv_buf) ? 0 : send_count * global_rank;
+
+            // for inplace, send_buf is at an offset of send_count*global_rank in the recv_buf
+            // but for non-inplace send_buf is separate and there is no send_count*global_rank offset
+            // and therefore for non-inplace, subtract send_count*global_rank to the ptr, so that
+            // the algorithm can always access send buffer at an index of send_count*global_rank
+            xelink_ptrs[even_comm->rank()] = (char*)data_buf_send - adjust_offset_count * dsize;
+
+            for (int i = 1; i < even_comm_size; i++) {
+                int peer_rank = (even_comm->rank() + i) % even_comm_size;
+                ccl_buffer tmp_ccl_buf;
+                sched->get_memory().handle_manager.get(
+                    peer_rank, 0, tmp_ccl_buf, even_comm.get(), false /*pt2pt_op*/, to_cache);
+                CCL_THROW_IF_NOT(tmp_ccl_buf.get_ptr(), "null IPC buffer is received");
+                const int global_rank_pr = even_comm->get_global_rank(peer_rank);
+                const size_t adjust_offset_count_pr =
+                    (is_use_tmp || send_buf == recv_buf) ? 0 : send_count * global_rank_pr;
+                xelink_ptrs[peer_rank] = (char*)tmp_ccl_buf.get_ptr() - adjust_offset_count_pr * dsize;
+                if (allgatherv_count == 1)
+                    dummy_copy_events.push_back(q_worker.memcpy(tmp_buf, tmp_ccl_buf.get_ptr(), 1));
+            }
+
+            // get mdfi remote pointers
+            if (pair_comm_size > 1) {
+                int peer_rank = (pair_comm->rank() + 1) % pair_comm_size;
+                ccl_buffer tmp_ccl_buf;
+                sched->get_memory().handle_manager.get(
+                    peer_rank, 1, tmp_ccl_buf, pair_comm.get(), false /*pt2pt_op*/, to_cache);
+                CCL_THROW_IF_NOT(tmp_ccl_buf.get_ptr(), "null IPC buffer is received");
+                mdfi_ptr = tmp_ccl_buf.get_ptr();
+                if (allgatherv_count == 1)
+                    dummy_copy_events.push_back(q_worker.memcpy(tmp_buf, mdfi_ptr, 1));
+            }
+
+            if (allgatherv_count == 1) {
+                // combine the dummy copy events into the inorder queue
+                q.ext_oneapi_submit_barrier(dummy_copy_events);
+            }
+
+            // sycl_barrier is working only if a barrier is there
+            // between ipc exchange and first invocation of sycl_barrier
+            if (allgatherv_count == 1) {
+                ccl_comm* node_comm_ptr = node_comm.get();
+                q.submit([=](sycl::handler& h) {
+                    h.host_task([=]() {
+                        ccl::impl_dispatch disp;
+                        node_comm_ptr->barrier(disp(ccl::default_stream), ccl::default_barrier_attr).wait();
+                    });
+                });
+            }
+
+            delete exchange_entry;
+            sched->clear_memory();
+        }
+    }
+
+    LOG_DEBUG("|CCL_SYCL| allgatherv selects large kernel without ccl scheduler autotune");
+
+    // TODO: which is better as outer switch - comm_size or dtype?
+    ccl::event e;
+    //TODO: for multi-node we need to use node_comm_size
+    switch (comm_size) {
+        case 2:
+            e = allgatherv_large_type<2>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case 4:
+            e = allgatherv_large_type<4>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case 6:
+            e = allgatherv_large_type<6>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case 8:
+            e = allgatherv_large_type<8>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case 10:
+            e = allgatherv_large_type<10>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case 12:
+            e = allgatherv_large_type<12>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case 14:
+            e = allgatherv_large_type<14>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        case 16:
+            e = allgatherv_large_type<16>(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            break;
+        default: assert(false); break;
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.hpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.hpp
new file mode 100644
index 000000000..e5feaa23a
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.hpp
@@ -0,0 +1,27 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include "oneapi/ccl.hpp"
+
+ccl::event allgatherv_large(const void* send_buf,
+                            size_t send_count,
+                            void* recv_buf,
+                            const ccl::vector_class<size_t>& recv_counts,
+                            ccl::datatype dtype,
+                            const ccl::communicator& comm,
+                            const ccl::stream& op_stream,
+                            const ccl::allgatherv_attr& attr,
+                            const ccl::vector_class<ccl::event>& deps);
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl_impl.hpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl_impl.hpp
new file mode 100644
index 000000000..bfef2e45a
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_large_sycl_impl.hpp
@@ -0,0 +1,560 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include "oneapi/ccl.hpp"
+#include "common/global/global.hpp"
+#include "common/api_wrapper/mpi_api_wrapper.hpp"
+
+namespace ccl::v1 {
+struct impl_dispatch {
+    template <class Object>
+    const typename Object::impl_value_t& operator()(const Object& obj) {
+        return obj.get_impl();
+    }
+};
+}; // namespace ccl::v1
+
+static constexpr int MAX_NODE_RANKS = 16;
+static constexpr int MAX_GPUS = 8;
+
+void* tmp_buf = nullptr;
+std::array<size_t*, MAX_NODE_RANKS> sync_remote_ptrs;
+std::array<void*, MAX_GPUS> xelink_ptrs;
+void* mdfi_ptr = nullptr;
+
+constexpr bool use_sycl_kernel_block = true;
+constexpr int vec_size = 1;
+
+template <typename T,
+          sycl::access::address_space Space = sycl::access::address_space::global_space,
+          sycl::access::decorated IsDecorated = sycl::access::decorated::yes>
+sycl::multi_ptr<T, sycl::access::address_space::global_space, sycl::access::decorated::yes> get_multi_ptr(T* ptr) {
+    return sycl::address_space_cast<Space, IsDecorated>(ptr);
+}
+
+template <typename T, int N>
+void inline read_write(std::array<void*, MAX_GPUS> peer_even_ptrs,
+                       std::array<void*, MAX_GPUS> local_ptrs,
+                       std::array<void*, MAX_GPUS> peer_pair_ptrs,
+                       const size_t count,
+                       const sycl::nd_item<1> it) {
+    const size_t idx = it.get_global_linear_id();
+    sycl::sub_group sg = it.get_sub_group();
+    const size_t sgSize = sg.get_local_range()[0];
+
+    int base = (idx / sgSize) * sgSize * vec_size;
+    const long rem_elem_count = count - base;
+
+    if (idx < count) {
+        if (use_sycl_kernel_block && rem_elem_count > 0 && (size_t)rem_elem_count >= sgSize) {
+#pragma unroll
+            for (int i = 0; i < N; i++) {
+                auto val = sg.load<vec_size>(get_multi_ptr(&(((T*)peer_even_ptrs[i])[base])));
+                sg.store<vec_size>(get_multi_ptr(&(((T*)peer_pair_ptrs[i])[base])), val);
+                sg.store<vec_size>(get_multi_ptr(&(((T*)local_ptrs[i])[base])), val);
+            }
+        }
+        else {
+#pragma unroll
+            for (int i = 0; i < N; i++) {
+                const T val = ((T*)peer_even_ptrs[i])[idx];
+                ((T*)peer_pair_ptrs[i])[idx] = val;
+                ((T*)local_ptrs[i])[idx] = val;
+            }
+        }
+    }
+}
+
+template <typename T, int N>
+void inline copy_data(std::array<void*, MAX_GPUS> dst,
+                      std::array<void*, MAX_GPUS> src,
+                      const size_t count,
+                      const sycl::nd_item<1> it) {
+    const size_t idx = it.get_global_linear_id();
+    sycl::sub_group sg = it.get_sub_group();
+    const size_t sgSize = sg.get_local_range()[0];
+
+    int base = (idx / sgSize) * sgSize * vec_size;
+
+    if (idx < count) {
+        if (use_sycl_kernel_block) {
+#pragma unroll
+            for (int i = 0; i < N; i++) {
+                const sycl::vec<T, vec_size> val = sg.load<vec_size>(get_multi_ptr(&(((T*)src[i])[base])));
+                sg.store<vec_size>(get_multi_ptr(&(((T*)dst[i])[base])), val);
+            }
+        }
+        else {
+#pragma unroll
+            for (int i = 0; i < N; i++) {
+                ((T*)dst[i])[idx] = ((T*)src[i])[idx];
+            }
+        }
+    }
+}
+
+sycl::event sycl_barrier(std::shared_ptr<ccl_comm> comm,
+                         sycl::queue q,
+                         std::vector<sycl::event> dep_events,
+                         std::array<size_t*, MAX_NODE_RANKS> local_sync_remote_ptrs) {
+    static size_t barrier_count_val = 0;
+    barrier_count_val++;
+
+    const size_t counter = barrier_count_val;
+    const int comm_rank = comm->rank();
+    const int comm_size = comm->size();
+    auto evt = q.submit([=](sycl::handler& h) {
+        h.depends_on(dep_events);
+
+        //h.parallel_for(1, [=](sycl::item<1> idx) {
+        h.parallel_for(comm_size, [=](sycl::item<1> idx) {
+            //TODO: find out the best consistency parameters to be used for atomic variables
+
+            int i = idx;
+            //for(int i=0; i<comm_size; i++)
+            {
+                sycl::atomic_ref<size_t,
+                                 sycl::memory_order::seq_cst,
+                                 sycl::memory_scope::system,
+                                 sycl::access::address_space::global_space>
+                    atomic_p(local_sync_remote_ptrs[i][comm_rank]);
+                atomic_p += 1;
+            }
+
+            //for(int i=0; i<comm_size; i++)
+            {
+                sycl::atomic_ref<size_t,
+                                 sycl::memory_order::seq_cst,
+                                 sycl::memory_scope::system,
+                                 sycl::access::address_space::global_space>
+                    atomic_p(local_sync_remote_ptrs[comm_rank][i]);
+
+                size_t val = atomic_p.load();
+                while (val < counter) {
+                    val = atomic_p.load();
+                }
+            }
+        });
+    });
+    return evt;
+}
+
+sycl::event invoke_barrier(std::shared_ptr<ccl_comm> comm,
+                           sycl::queue q,
+                           std::vector<sycl::event> dep_events,
+                           std::array<size_t*, MAX_NODE_RANKS> local_sync_remote_ptrs,
+                           const size_t num_chunks) {
+    sycl::event ret_event;
+    const bool use_sycl_barrier = !ccl::global_data::env().use_ccl_barrier;
+    if (ccl::global_data::env().use_sycl_barrier || use_sycl_barrier) {
+        ret_event = sycl_barrier(comm, q, dep_events, local_sync_remote_ptrs);
+    }
+    else {
+        const bool use_sync_barrier = false;
+        if (use_sync_barrier) {
+            for (auto dep_event : dep_events) {
+                dep_event.wait();
+            }
+            //MPI_Barrier(MPI_COMM_WORLD);
+            ccl::impl_dispatch disp;
+            comm->barrier(disp(ccl::default_stream), ccl::default_barrier_attr).wait();
+        }
+        else {
+            ret_event = q.submit([=](sycl::handler& h) {
+                h.depends_on(dep_events);
+                h.host_task([=]() {
+                    //MPI_Barrier(MPI_COMM_WORLD);
+                    ccl::impl_dispatch disp;
+                    comm->barrier(disp(ccl::default_stream), ccl::default_barrier_attr).wait();
+                });
+            });
+        }
+    }
+    return ret_event;
+}
+
+sycl::queue create_main_ce_queue(sycl::queue q) {
+    sycl::device dev = q.get_device();
+    sycl::context ctx = q.get_context();
+    ze_device_handle_t ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(dev);
+    ze_context_handle_t ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(ctx);
+
+    // Create Command Queue
+    ze_command_queue_desc_t Qdescriptor = {};
+    Qdescriptor.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
+    Qdescriptor.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
+    Qdescriptor.ordinal = 1;
+    Qdescriptor.index = 0;
+    Qdescriptor.flags = 0;
+    Qdescriptor.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
+
+    //ze_command_queue_handle_t ze_cmd_queue = nullptr;
+    //ze_result_t result = zeCommandQueueCreate(ze_ctx, ze_dev, &Qdescriptor, &ze_cmd_queue);
+
+    ze_command_list_handle_t ze_imm_cmd_list = nullptr;
+    ze_result_t result = zeCommandListCreateImmediate(ze_ctx, ze_dev, &Qdescriptor, &ze_imm_cmd_list);
+    if (result != ZE_RESULT_SUCCESS) {
+        std::cerr << "zeCommandQueueCreate failed\n";
+        return q;
+    }
+
+    sycl::backend_input_t<sycl::backend::ext_oneapi_level_zero, sycl::device> InteropDeviceInput{ ze_dev };
+    sycl::device InteropDevice = sycl::make_device<sycl::backend::ext_oneapi_level_zero>(InteropDeviceInput);
+
+    sycl::backend_input_t<sycl::backend::ext_oneapi_level_zero, sycl::context> InteropContextInput{
+        ze_ctx, std::vector<sycl::device>(1, InteropDevice), sycl::ext::oneapi::level_zero::ownership::keep
+    };
+    sycl::context InteropContext = sycl::make_context<sycl::backend::ext_oneapi_level_zero>(InteropContextInput);
+
+    //sycl::backend_input_t<sycl::backend::ext_oneapi_level_zero, sycl::queue> InteropQueueInputCQ{
+    //  ze_cmd_queue, InteropDevice, sycl::ext::oneapi::level_zero::ownership::keep};
+
+    sycl::backend_input_t<sycl::backend::ext_oneapi_level_zero, sycl::queue> InteropQueueInputCL{
+        ze_imm_cmd_list, InteropDevice, sycl::ext::oneapi::level_zero::ownership::keep
+    };
+
+    //sycl::queue q_mce = sycl::make_queue<sycl::backend::ext_oneapi_level_zero>(InteropQueueInputCQ, InteropContext);
+    sycl::queue q_mce =
+        sycl::make_queue<sycl::backend::ext_oneapi_level_zero>(InteropQueueInputCL, InteropContext);
+
+    return q_mce;
+}
+
+sycl::queue get_main_ce_queue(sycl::queue q) {
+    static sycl::queue q_mce = create_main_ce_queue(q);
+    return q_mce;
+}
+
+template <typename T, int N>
+ccl::event allgatherv_large_impl_ipc(const void* send_buf,
+                                     size_t send_count,
+                                     void* recv_buf,
+                                     const ccl::vector_class<size_t>& recv_counts,
+                                     ccl::datatype dtype,
+                                     const ccl::communicator& comm,
+                                     const ccl::stream& op_stream,
+                                     const ccl::allgatherv_attr& attr,
+                                     const ccl::vector_class<ccl::event>& deps) {
+    ccl::impl_dispatch disp;
+    auto ccl_dtype = ccl::global_data::get().dtypes->get(dtype);
+    const int dsize = ccl_dtype.size();
+    sycl::queue q = op_stream.get_native();
+    assert(q.is_in_order());
+
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    std::shared_ptr<ccl_comm> even_comm = global_comm->get_even_comm();
+    std::shared_ptr<ccl_comm> node_comm = global_comm->get_node_comm();
+
+    std::array<void*, MAX_GPUS> local_peer_even_ptrs, local_local_ptrs, local_peer_pair_ptrs;
+    for (int i = 0; i < even_comm->size(); i++) {
+        // offsets for read_write kernel
+        const int global_rank = even_comm->get_global_rank(i);
+        const size_t offset_bytes = send_count * global_rank * dsize;
+        local_peer_even_ptrs[i] = (char*)xelink_ptrs[i] + offset_bytes;
+        local_local_ptrs[i] = (char*)recv_buf + offset_bytes;
+        local_peer_pair_ptrs[i] = (char*)mdfi_ptr + offset_bytes;
+    }
+
+    invoke_barrier(node_comm, q, {}, sync_remote_ptrs, 1);
+
+    assert(vec_size == 1);
+    const size_t work_group_size = 32;
+    const size_t kernel_size = ((send_count + work_group_size - 1) / work_group_size) * work_group_size;
+
+    q.submit([=](sycl::handler& h) {
+        h.parallel_for(
+            sycl::nd_range(sycl::range{ kernel_size }, sycl::range{ work_group_size }), [=](sycl::nd_item<1> it) {
+                read_write<T, N / 2>(local_peer_even_ptrs, local_local_ptrs, local_peer_pair_ptrs, send_count, it);
+            });
+    });
+
+    sycl::event barrier_event = invoke_barrier(node_comm, q, {}, sync_remote_ptrs, 1);
+    return ccl::event::create_from_native(barrier_event);
+}
+
+// Chunked, pipelined allgatherv for large messages across a multi-GPU node.
+// Data is moved in chunk_size pieces through two alternating temp buffers:
+// while the read_write kernel exchanges chunk nc over XeLink/MDFI, copy
+// engines stage chunk nc+1 into the other temp buffer and drain chunk nc-1
+// into recv_buf. Falls back to the IPC (no temp buffer) path when
+// allgatherv_use_tmp_buf is disabled in the environment.
+// NOTE(review): relies on file-scope state initialized elsewhere (tmp_buf,
+// xelink_ptrs, mdfi_ptr, sync_remote_ptrs, vec_size) — not visible here.
+template <typename T, int N>
+ccl::event allgatherv_large_impl(const void* send_buf,
+                                 size_t send_count,
+                                 void* recv_buf,
+                                 const ccl::vector_class<size_t>& recv_counts,
+                                 ccl::datatype dtype,
+                                 const ccl::communicator& comm,
+                                 const ccl::stream& op_stream,
+                                 const ccl::allgatherv_attr& attr,
+                                 const ccl::vector_class<ccl::event>& deps) {
+    if (!ccl::global_data::env().allgatherv_use_tmp_buf) {
+        ccl::event e = allgatherv_large_impl_ipc<T, N>(
+            send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+        return e;
+    }
+
+    ccl::impl_dispatch disp;
+    auto ccl_dtype = ccl::global_data::get().dtypes->get(dtype);
+    const int dsize = ccl_dtype.size();
+    sycl::queue q = op_stream.get_native();
+    assert(q.is_in_order());
+
+    // function-local statics: created on first call and reused afterwards
+    static sycl::queue q_worker(q.get_device());
+    static sycl::queue q_copy = get_main_ce_queue(q);
+
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    std::shared_ptr<ccl_comm> pair_comm = global_comm->get_pair_comm();
+    std::shared_ptr<ccl_comm> even_comm = global_comm->get_even_comm();
+    std::shared_ptr<ccl_comm> node_comm = global_comm->get_node_comm();
+
+    const int pair_comm_size = pair_comm->size();
+
+    const int even_comm_size = even_comm->size();
+
+    const int node_comm_size = node_comm->size();
+
+    const int comm_rank = comm.rank();
+
+    constexpr int pipeline_size = 2;
+    const size_t tmp_chunk_size = ccl::global_data::env().allgatherv_chunk_size * node_comm_size;
+    void* tmp_bufs[pipeline_size];
+    tmp_bufs[0] = tmp_buf;
+    tmp_bufs[1] = ((char*)tmp_buf) + tmp_chunk_size;
+
+    std::array<void*, MAX_GPUS> local_peer_even_ptrs, local_local_ptrs, local_peer_pair_ptrs;
+    std::array<void*, MAX_GPUS> recv_buf_dst_ptrs, tmp_buf_src_ptrs;
+    std::array<void*, MAX_GPUS> recv_buf_dst_ptrs_prev, tmp_buf_src_ptrs_prev;
+    std::array<void*, MAX_GPUS> tmp_send_buf_next, my_send_buf_next;
+
+    const size_t chunk_size = ccl::global_data::env().allgatherv_chunk_size;
+    const size_t chunk_count = chunk_size / dsize;
+    const size_t num_chunks = send_count / chunk_count + (send_count % chunk_count != 0);
+
+    std::vector<sycl::event> work_events;
+    sycl::event output_event;
+    for (size_t nc = 0; nc < num_chunks; nc++) {
+        // setup pointers
+
+        // alternate between tmp buffers since we use a pipeline of size 2
+        // i.e. copy previous output from one tmp_buffer when allgatherv
+        // is operating on the second tmp_buffer
+        const int tmp_chunk_id = nc % pipeline_size;
+        void* tmp_buf_use = tmp_bufs[tmp_chunk_id];
+        void* tmp_buf_other = tmp_bufs[!tmp_chunk_id];
+
+        // TODO: which rank to be used for scaleout, node_comm or global_comm
+
+        // offset on send buffer
+        const size_t my_offset_count_send = chunk_count * nc;
+        // offset on recv buffer
+        const size_t my_offset_count = send_count * comm_rank + my_offset_count_send;
+        // offset on tmp buffer
+        const size_t my_offset_count_tmp = chunk_count * comm_rank;
+
+        void* my_send_buf = (char*)send_buf + my_offset_count_send * dsize;
+        // in-place operation: own data lives at this rank's slot of recv_buf
+        if (send_buf == recv_buf) {
+            my_send_buf = (char*)recv_buf + my_offset_count * dsize;
+        }
+        void* tmp_send_buf = (char*)tmp_buf_use + my_offset_count_tmp * dsize;
+
+        my_send_buf_next[0] = (char*)my_send_buf + chunk_count * dsize;
+        tmp_send_buf_next[0] = (char*)tmp_buf_other + my_offset_count_tmp * dsize;
+
+        for (int i = 0; i < even_comm_size; i++) {
+            // offsets for read_write kernel
+            int global_rank = even_comm->get_global_rank(i);
+            const size_t offset_bytes = (send_count * global_rank + chunk_count * nc) * dsize;
+            const size_t offset_bytes_tmp = chunk_count * global_rank * dsize;
+
+            // xelink and mdfi ptrs are the tmp buffers in the other ranks
+            const size_t tmp_chunk_offset = tmp_chunk_id * tmp_chunk_size;
+            local_peer_even_ptrs[i] = (char*)xelink_ptrs[i] + offset_bytes_tmp + tmp_chunk_offset;
+            local_local_ptrs[i] = (char*)recv_buf + offset_bytes;
+            local_peer_pair_ptrs[i] = (char*)mdfi_ptr + offset_bytes_tmp + tmp_chunk_offset;
+
+            // offsets for copy kernel
+            // TODO: is there a better way to find the pair_neighbor global rank
+            int global_rank_neighbor = (global_rank / pair_comm_size) * pair_comm_size;
+            if (global_rank % pair_comm_size == 0) {
+                global_rank_neighbor = global_rank_neighbor + 1;
+            }
+            const size_t offset_bytes_c = (send_count * global_rank_neighbor + chunk_count * nc) * dsize;
+            const size_t offset_bytes_c_tmp = chunk_count * global_rank_neighbor * dsize;
+            recv_buf_dst_ptrs[i] = (char*)recv_buf + offset_bytes_c;
+            tmp_buf_src_ptrs[i] = (char*)tmp_buf_use + offset_bytes_c_tmp;
+
+            recv_buf_dst_ptrs_prev[i] = (char*)recv_buf_dst_ptrs[i] - chunk_count * dsize;
+            // offset of prev tmp buffer is same but use tmp_buf_other instead of tmp_buf_use
+            tmp_buf_src_ptrs_prev[i] = (char*)tmp_buf_other + offset_bytes_c_tmp;
+        }
+
+        // start the collective
+
+        // if send_count is not a multiple of chunk_count, then last chunk will contain only remainder data
+        const size_t data_count = (nc < send_count / chunk_count) ? chunk_count : send_count % chunk_count;
+        const size_t data_count_next =
+            (nc < (send_count / chunk_count) - 1) ? chunk_count : send_count % chunk_count;
+
+        // TODO: move this outside the loop
+        // pipeline prologue - copy first chunk from send_buf to tmp_buf using in-order queue
+        if (nc == 0) {
+            sycl::event e = q.submit([=](sycl::handler& h) {
+                h.memcpy(tmp_send_buf, my_send_buf, dsize * data_count);
+            });
+            work_events.push_back(e);
+        }
+
+        sycl::queue q_use = q_worker;
+
+        sycl::event barrier_event1 = invoke_barrier(node_comm, q_use, work_events, sync_remote_ptrs, num_chunks);
+        work_events.clear();
+
+        // use kernel for local pipeline copies of next and prev buffers,
+        // by default we use the main copy engine via memcpy
+        constexpr bool use_kernel_copy = false;
+
+        const size_t work_group_size = 32;
+
+        // one work-item per vec_size-element vector plus one per remainder element
+        const size_t kernel_threads = data_count / vec_size + data_count % vec_size;
+        const size_t kernel_size = ((kernel_threads + work_group_size - 1) / work_group_size) * work_group_size;
+
+        sycl::event kernel_event = q_use.submit([=](sycl::handler& h) {
+            h.depends_on(barrier_event1);
+            h.parallel_for(sycl::nd_range(sycl::range{ kernel_size }, sycl::range{ work_group_size }),
+                           [=](sycl::nd_item<1> it) {
+                               read_write<T, N / 2>(
+                                   local_peer_even_ptrs, local_local_ptrs, local_peer_pair_ptrs, data_count, it);
+                               // copy next input chunk
+                               if (use_kernel_copy && nc < num_chunks - 1) {
+                                   copy_data<T, 1>(tmp_send_buf_next, my_send_buf_next, data_count_next, it);
+                               }
+                               // copy prev output chunk
+                               if (use_kernel_copy && nc > 0) {
+                                   copy_data<T, N / 2>(
+                                       recv_buf_dst_ptrs_prev, tmp_buf_src_ptrs_prev, data_count, it);
+                               }
+                           });
+        });
+        work_events.push_back(kernel_event);
+
+        std::vector<sycl::event> copy_events;
+        if (!use_kernel_copy) {
+            // copy next input chunk
+            if (nc < num_chunks - 1) {
+                sycl::event e = q_copy.submit([=](sycl::handler& h) {
+                    h.depends_on(barrier_event1);
+                    const size_t data_count_next =
+                        (nc < (send_count / chunk_count) - 1) ? chunk_count : send_count % chunk_count;
+                    h.memcpy(tmp_send_buf_next[0], my_send_buf_next[0], dsize * data_count_next);
+                });
+                copy_events.push_back(e);
+            }
+
+            // copy prev output chunk
+            if (nc > 0) {
+                // for last iteration, if read_write kernel is small, then use
+                // compute engine for copying since it is faster than copy engine
+                // and there is little overlap with read_write since it is small
+                const size_t small_size_threshold = ccl::global_data::env().allgatherv_small_size_threshold;
+
+                //TODO: should we use single kernel copy when q_use is used
+                sycl::queue q_copy_use =
+                    (nc == num_chunks - 1 && data_count < small_size_threshold) ? q_use : q_copy;
+                for (int i = 0; i < even_comm_size; i++) {
+                    sycl::event e = q_copy_use.submit([=](sycl::handler& h) {
+                        h.depends_on(barrier_event1);
+                        const size_t data_count_prev = chunk_count;
+                        h.memcpy(recv_buf_dst_ptrs_prev[i], tmp_buf_src_ptrs_prev[i], dsize * data_count_prev);
+                    });
+                    copy_events.push_back(e);
+                }
+            }
+        }
+        else if (use_kernel_copy && chunk_count > data_count && nc > 0) {
+            // in case the last chunk is smaller than chunk_count,
+            // we still need to copy the rest of prev output chunk
+            assert(nc == num_chunks - 1);
+            const size_t copy_count = chunk_count - data_count;
+            for (int i = 0; i < even_comm_size; i++) {
+                sycl::event e = q_copy.submit([=](sycl::handler& h) {
+                    h.depends_on(barrier_event1);
+                    void* src = (char*)(tmp_buf_src_ptrs_prev[i]) + data_count * dsize;
+                    void* dst = (char*)(recv_buf_dst_ptrs_prev[i]) + data_count * dsize;
+                    h.memcpy(dst, src, dsize * copy_count);
+                });
+                copy_events.push_back(e);
+            }
+        }
+
+        // WA: directly connecting the output event of q_copy to gpu kernels
+        // cause failure when MPI binding is used - I_MPI_PIN_PROCESSOR_LIST
+        if (!copy_events.empty()) {
+            sycl::event e = q_use.submit([=](sycl::handler& h) {
+                h.depends_on(copy_events);
+                h.host_task([]() {});
+            });
+            work_events.push_back(e);
+        }
+
+        // TODO: move this outside of the loop
+        // pipeline epilogue - copy the final output chunk from tmp_buffer to recv_buffer
+        if (nc == num_chunks - 1) {
+            sycl::event barrier_event2;
+            barrier_event2 = invoke_barrier(
+                node_comm, q_use, work_events, /*ccl::global_data::get().*/ sync_remote_ptrs, num_chunks);
+            work_events.clear();
+
+            // TODO: find when to use single kernel copy vs memcpys
+            constexpr bool use_single_kernel_copy = true;
+            // use a single kernel to copy from tmp_buffer to recv_buffer
+            if (use_single_kernel_copy) {
+                output_event = q.submit([=](sycl::handler& h) {
+                    h.depends_on(barrier_event2);
+                    // vectorize the copy: packed_size vectors of 'mult' elements,
+                    // plus rem_size scalar elements at the tail
+                    constexpr int mult = 4;
+                    const size_t packed_size = data_count / mult;
+                    const size_t rem_size = data_count % mult;
+                    const size_t kernel_size = packed_size + rem_size;
+                    using AT = sycl::vec<T, mult>;
+                    h.parallel_for(kernel_size, [=](sycl::item<1> idx) {
+                        if (idx < packed_size) {
+#pragma unroll
+                            for (int i = 0; i < N / 2; i++) {
+                                ((AT*)recv_buf_dst_ptrs[i])[idx] = ((AT*)tmp_buf_src_ptrs[i])[idx];
+                            }
+                        }
+                        else {
+#pragma unroll
+                            for (int i = 0; i < N / 2; i++) {
+                                const size_t new_idx = idx + (mult - 1) * packed_size;
+                                ((T*)recv_buf_dst_ptrs[i])[new_idx] = ((T*)tmp_buf_src_ptrs[i])[new_idx];
+                            }
+                        }
+                    });
+                });
+            }
+            // use memcpys to copy from tmp_buffer to recv_buffer
+            else {
+                for (int i = 0; i < even_comm_size; i++) {
+                    sycl::event e = q_use.submit([=](sycl::handler& h) {
+                        h.depends_on(barrier_event2);
+                        h.memcpy(recv_buf_dst_ptrs[i], tmp_buf_src_ptrs[i], dsize * data_count);
+                    });
+                    work_events.push_back(e);
+                }
+                output_event = q.ext_oneapi_submit_barrier(work_events);
+            }
+        }
+    } // nc
+
+    return ccl::event::create_from_native(output_event);
+}
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_medium_sycl.cpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_medium_sycl.cpp
new file mode 100644
index 000000000..75e6d164e
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_medium_sycl.cpp
@@ -0,0 +1,67 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allgatherv/sycl/allgatherv_medium_sycl.hpp"
+
+// One kernel object per supported datatype; each is initialized lazily on
+// first use via init_allgatherv_medium().
+sycl_allgatherv_medium<sycl::half> agv_medium_fp16;
+sycl_allgatherv_medium<sycl::_V1::ext::oneapi::bfloat16> agv_medium_bf16;
+sycl_allgatherv_medium<float> agv_medium_fp32;
+sycl_allgatherv_medium<int> agv_medium_int32;
+
+// Switch-case body for init_allgatherv_medium(): initializes the matching
+// per-datatype kernel object exactly once (guarded by inited()).
+#define SWITCH_INIT_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        if (!agv_medium_##TYPE.inited()) { \
+            LOG_INFO("invoking allgatherv medium kernel first time for datatype: ", ccl_type); \
+            agv_medium_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+        break;
+
+// Lazily initialize the medium-allgatherv kernel state for the given
+// datatype (no-op if that datatype's object is already initialized).
+// Throws CCL_THROW on unsupported datatypes.
+// NOTE(review): rank_in/world_in are forwarded verbatim to init() —
+// presumably the rank/world size used for kernel setup; confirm against
+// sycl_allgatherv_medium in the hpp.
+void init_allgatherv_medium(ccl::datatype dtype,
+                            sycl::queue &queue,
+                            ccl_comm *comm,
+                            ccl_stream *stream,
+                            uint32_t rank_in,
+                            uint32_t world_in) {
+    switch (dtype) {
+        SWITCH_INIT_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_INIT_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_INIT_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_INIT_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for allgatherv"); assert(0);
+    }
+}
+
+// Switch-case body for run_allgatherv_medium(): dispatches to the matching
+// per-datatype kernel object's allgatherv().
+#define SWITCH_RUN_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        e = agv_medium_##TYPE.allgatherv(queue, send_buf, send_count, recv_buf, recv_counts, done); \
+        break;
+
+// Run the medium-size allgatherv kernel for the given datatype and return
+// its completion event. 'done' is an out-parameter set by the underlying
+// allgatherv() call (presumably whether this specialized path handled the
+// operation — confirm in the hpp). Throws CCL_THROW on unsupported datatypes.
+ccl::event run_allgatherv_medium(ccl::datatype dtype,
+                                 sycl::queue queue,
+                                 const void *send_buf,
+                                 size_t send_count,
+                                 void *recv_buf,
+                                 const ccl::vector_class<size_t> &recv_counts,
+                                 bool &done) {
+    ccl::event e;
+    switch (dtype) {
+        SWITCH_RUN_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_RUN_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_RUN_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_RUN_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for allgatherv"); assert(0);
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_medium_sycl.hpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_medium_sycl.hpp
new file mode 100644
index 000000000..21c23b6e7
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_medium_sycl.hpp
@@ -0,0 +1,1257 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+// Tuning constants for the medium-size allgatherv ESIMD kernels.
+// NOTE(review): data_type is the enclosing template parameter at each use
+// site; several of these macros only make sense inside those templates.
+#define MAX_RANK            16 // max ranks supported (presumably; confirm vs. kernel arrays)
+#define SIMD_COMPUTE_MAX    256 // bytes handled per SIMD block
+#define SIMD_COMPUTE        (SIMD_COMPUTE_MAX / sizeof(data_type)) // elements per SIMD block for data_type
+#define SIMD_SYNC           32
+#define UNROLL_SIZE         1 // unroll factor of the per-thread load/store loops
+#define BUFFER_COUNT        2 // number of temp buffers (double buffering)
+#define SYNC_BYTE           (SIMD_SYNC * sizeof(int) * 2)
+#define MAX_COUNT           (32 * 1024 * 1024 / sizeof(data_type))
+#define ALIGNMENT_BYTE      256
+#define EU_COUNT_PER_RANK   512
+#define THREAD_COUNT_PER_EU 8
+#define HW_THREAD_COUNT     (EU_COUNT_PER_RANK * THREAD_COUNT_PER_EU) // total HW threads per rank
+#define RANKS_PER_GPU       2
+
+// Copy one work-item's slice (SIMD_COMPUTE * UNROLL_SIZE elements) of the
+// local send buffer into this rank's temp buffer. idx is the work-item index
+// within the current chunk; threads_already_processed shifts it to the
+// absolute position in send_buf; buffer_index_kernel selects the active
+// double-buffer slot inside the temp buffer.
+template <typename data_type>
+void local_copy(int idx,
+                const void *send_buf,
+                int myoffset,
+                uint32_t size,
+                int threads_already_processed,
+                void *temp_buffer[],
+                uint32_t temp_rank,
+                int size_per_buffer_kernel,
+                int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    int read_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+
+    //do copy from input buffer to temp buffer.
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE> buffer;
+#pragma unroll
+    for (int i = 0; i < UNROLL_SIZE; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(i * SIMD_COMPUTE) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::cached,
+                           cache_hint::cached>((data_type *)send_buf + myoffset + read_offset + i * SIMD_COMPUTE);
+    }
+
+    // write to my rank's temp buffer's first chunk
+    data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+    // advance to the double-buffer slot currently in use
+    local_temp_ptr += size_per_buffer_kernel * buffer_index_kernel;
+    local_temp_ptr += idx * SIMD_COMPUTE * UNROLL_SIZE;
+#pragma unroll
+    for (int i = 0; i < UNROLL_SIZE; i++) {
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::uncached>(local_temp_ptr + i * SIMD_COMPUTE,
+                                              buffer.template select<SIMD_COMPUTE, 1>(i * SIMD_COMPUTE));
+    }
+    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+}
+
+// Per-work-item step of the medium allgatherv:
+//  1) gather this thread's slice from the temp buffers of the TEMP_WORLD/2
+//     ranks listed in even_ranks (over XeLink),
+//  2) forward the gathered data into the paired rank's temp buffer
+//     (temp_rank ^ 1), offset by chunk_size, and
+//  3) store it into the local output buffer at each source rank's offset.
+// A tail of fewer than SIMD_COMPUTE * UNROLL_SIZE elements is written
+// element-by-element. In-place ranks (send_buf == out_buffer) skip their
+// own output slot.
+template <uint32_t TEMP_WORLD, typename data_type>
+void read_write(int *even_ranks,
+                int my_rank_index,
+                int idx,
+                const void *send_buf,
+                int myoffset,
+                void *out_buffer,
+                uint32_t size,
+                int threads_already_processed,
+                void *temp_buffer[],
+                uint32_t temp_rank,
+                int outer_iter,
+                int size_per_buffer_kernel,
+                int buffer_index_kernel,
+                int chunk_size) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    // read from xelinks of all odd/even ranks
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> xe_buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r];
+        data_type *temp_ptr = (data_type *)temp_buffer[rr];
+        temp_ptr += size_per_buffer_kernel * buffer_index_kernel;
+        temp_ptr += idx * SIMD_COMPUTE * UNROLL_SIZE;
+#pragma unroll
+        for (int i = 0; i < UNROLL_SIZE; i++) {
+            xe_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::cached,
+                               cache_hint::cached>(temp_ptr + i * SIMD_COMPUTE);
+        }
+    }
+
+    // write to mdfi buffer
+    data_type *mdfi_buffer = (data_type *)temp_buffer[temp_rank ^ 1];
+    mdfi_buffer += size_per_buffer_kernel * buffer_index_kernel;
+    mdfi_buffer += chunk_size + idx * SIMD_COMPUTE * UNROLL_SIZE;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                mdfi_buffer + r * chunk_size + i * SIMD_COMPUTE,
+                xe_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+        }
+    }
+
+    // write to output buffer
+    data_type *out_ptr = (data_type *)out_buffer;
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    int write_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    if (write_offset + SIMD_COMPUTE * UNROLL_SIZE <= size) {
+        //#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            // in-place: own slot of out_buffer already holds the data
+            if ((int)r == my_rank_index && send_buf == out_buffer) {
+                continue;
+            }
+            int rr = even_ranks[r];
+#pragma unroll
+            for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+                lsc_block_store<data_type,
+                                SIMD_COMPUTE,
+                                lsc_data_size::default_size,
+                                cache_hint::uncached,
+                                cache_hint::uncached>(
+                    out_ptr + rr * size + write_offset + i * SIMD_COMPUTE,
+                    xe_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+            }
+        }
+    }
+    else {
+        // remainder tail: write the leftover elements one at a time
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            if ((int)r == my_rank_index && send_buf == out_buffer) {
+                continue;
+            }
+            int rr = even_ranks[r];
+#if 0
+            int count = (size - write_offset + SIMD_COMPUTE - 1) / SIMD_COMPUTE;
+            for (int i = 0; i < count; i++)
+            {
+                lsc_block_store<data_type, SIMD_COMPUTE, lsc_data_size::default_size, cache_hint::uncached, cache_hint::uncached>
+                    (out_ptr + rr * size + write_offset + i * SIMD_COMPUTE, xe_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+            }
+#else
+            int count = size - write_offset;
+            for (int i = 0; i < count; i++) {
+                out_ptr[rr * size + write_offset + i] = xe_buffer[r * SIMD_COMPUTE * UNROLL_SIZE + i];
+            }
+#endif
+        }
+    }
+}
+
+// Per-work-item: read the data that the paired rank staged into this rank's
+// temp buffer (second half, offset by chunk_size) and scatter it to the
+// output buffer at the pair-neighbor ranks' offsets (even_ranks[r] ^ 1).
+// A tail of fewer than SIMD_COMPUTE * UNROLL_SIZE elements is written
+// element-by-element.
+template <uint32_t TEMP_WORLD, typename data_type>
+void write_output(int *even_ranks,
+                  int my_rank_index,
+                  int idx,
+                  void *out_buffer,
+                  uint32_t size,
+                  int threads_already_processed,
+                  void *temp_buffer[],
+                  uint32_t temp_rank,
+                  int outer_iter,
+                  int size_per_buffer_kernel,
+                  int buffer_index_kernel,
+                  int chunk_size) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> buffer;
+    data_type *read_ptr = (data_type *)temp_buffer[temp_rank];
+    read_ptr += size_per_buffer_kernel * buffer_index_kernel;
+    read_ptr += chunk_size + idx * SIMD_COMPUTE * UNROLL_SIZE;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#pragma unroll
+        for (int i = 0; i < UNROLL_SIZE; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::cached,
+                               cache_hint::cached>(read_ptr + i * SIMD_COMPUTE);
+        }
+        read_ptr += chunk_size;
+    }
+
+    data_type *write_ptr = (data_type *)out_buffer;
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    int write_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    if (write_offset + SIMD_COMPUTE * UNROLL_SIZE <= size) {
+        //#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            int rr = even_ranks[r] ^ 1;
+            //sycl::_V1::ext::oneapi::experimental::printf("write_output [%d] write to %d\n", temp_rank, rr);
+#pragma unroll
+            for (int i = 0; i < UNROLL_SIZE; i++) {
+                lsc_block_store<data_type,
+                                SIMD_COMPUTE,
+                                lsc_data_size::default_size,
+                                cache_hint::uncached,
+                                cache_hint::uncached>(
+                    write_ptr + rr * size + write_offset + i * SIMD_COMPUTE,
+                    buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+            }
+        }
+    }
+    else {
+        // remainder tail: write the leftover elements one at a time
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            int rr = even_ranks[r] ^ 1;
+#if 0
+            int count = (size - write_offset + SIMD_COMPUTE - 1) / SIMD_COMPUTE;
+            for (int i = 0; i < count; i++)
+            {
+                lsc_block_store<data_type, SIMD_COMPUTE, lsc_data_size::default_size, cache_hint::uncached, cache_hint::uncached>
+                    (write_ptr + rr * size + write_offset + i * SIMD_COMPUTE, buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+            }
+#else
+            int count = size - write_offset;
+            for (int i = 0; i < count; i++) {
+                write_ptr[rr * size + write_offset + i] = buffer[r * SIMD_COMPUTE * UNROLL_SIZE + i];
+            }
+#endif
+        }
+    }
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_read_write(int *even_ranks,
+                       int my_rank_index,
+                       int idx,
+                       void **send_bufs,
+                       void **out_buffers,
+                       uint32_t size,
+                       uint32_t temp_rank) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int offset = idx * SIMD_COMPUTE * UNROLL_SIZE;
+
+    //sycl::ext::intel::esimd::properties props{alignment<sizeof(T)>};
+    //sycl::ext::oneapi::experimental::detail::empty_properties_t  PropertyListT;
+    //constexpr size_t Alignment = sycl::ext::oneapi::experimental::detail::getPropertyValue<PropertyListT, alignment_key>(sizeof(data_type));
+    //constexpr size_t Alignment = 2;
+    //sycl::_V1::ext::intel::esimd::properties props{cache_hint_L1<cached>};
+    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::flushl3, lsc_scope::gpus>();
+    // read from xelinks of all odd/even ranks
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> in_buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int r0 = (my_rank_index + r) % (TEMP_WORLD / 2);
+        int rr = even_ranks[r0];
+        data_type *read_ptr = (data_type *)send_bufs[rr];
+        int in_place = send_bufs[rr] == out_buffers[rr];
+        int myoffset = in_place ? rr * size : 0;
+        read_ptr += myoffset + offset;
+#pragma unroll
+        for (int i = 0; i < UNROLL_SIZE; i++) {
+#if 1
+            in_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::cached,
+                               cache_hint::cached>(read_ptr + i * SIMD_COMPUTE);
+#else
+            in_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE) =
+                block_load<data_type, SIMD_COMPUTE>(read_ptr + i * SIMD_COMPUTE, overaligned_tag<Alignment>{});
+#endif
+        }
+    }
+
+    // write to mdfi buffer and output buffer
+    data_type *mdfi_buffer = (data_type *)out_buffers[temp_rank ^ 1];
+    data_type *out_ptr = (data_type *)out_buffers[temp_rank];
+    if (offset + SIMD_COMPUTE * UNROLL_SIZE <= size) {
+        //#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            uint32_t r0 = (my_rank_index + r) % (TEMP_WORLD / 2);
+            uint32_t rr = even_ranks[r0];
+            int in_place = send_bufs[rr] == out_buffers[rr];
+#pragma unroll
+            for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+#if 1
+                if (rr != (temp_rank ^ 1) || !in_place) {
+                    lsc_block_store<data_type,
+                                    SIMD_COMPUTE,
+                                    lsc_data_size::default_size,
+                                    cache_hint::uncached,
+                                    cache_hint::uncached>(mdfi_buffer + rr * size + offset + i * SIMD_COMPUTE,
+                                                          in_buffer.template select<SIMD_COMPUTE, 1>(
+                                                              r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+                }
+                if (rr != temp_rank || !in_place) {
+                    lsc_block_store<data_type,
+                                    SIMD_COMPUTE,
+                                    lsc_data_size::default_size,
+                                    cache_hint::uncached,
+                                    cache_hint::uncached>(out_ptr + rr * size + offset + i * SIMD_COMPUTE,
+                                                          in_buffer.template select<SIMD_COMPUTE, 1>(
+                                                              r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+                }
+#else
+                block_store<data_type, SIMD_COMPUTE>(
+                    mdfi_buffer + rr * size + offset + i * SIMD_COMPUTE,
+                    in_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE),
+                    overaligned_tag<Alignment>{});
+                block_store<data_type, SIMD_COMPUTE>(
+                    out_ptr + rr * size + offset + i * SIMD_COMPUTE,
+                    in_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE),
+                    overaligned_tag<Alignment>{});
+#endif
+            }
+        }
+    }
+    else {
+        int vc_count = (size - offset) / SIMD_COMPUTE;
+        int count = size - offset - SIMD_COMPUTE * vc_count;
+        //sycl::_V1::ext::oneapi::experimental::printf("offset [%d] size %d vc_count:%d count:%d \n", offset, size, vc_count, count);
+        //#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            uint32_t r0 = (my_rank_index + r) % (TEMP_WORLD / 2);
+            uint32_t rr = even_ranks[r0];
+            int in_place = send_bufs[rr] == out_buffers[rr];
+#if 0
+            int count = (size - offset + SIMD_COMPUTE - 1) / SIMD_COMPUTE;
+            for (int i = 0; i < count; i++)
+            {
+                lsc_block_store<data_type, SIMD_COMPUTE, lsc_data_size::default_size, cache_hint::uncached, cache_hint::uncached>
+                    (mdfi_buffer + rr * size + offset + i * SIMD_COMPUTE, in_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+                lsc_block_store<data_type, SIMD_COMPUTE, lsc_data_size::default_size, cache_hint::uncached, cache_hint::uncached>
+                    (out_ptr + rr * size + offset + i * SIMD_COMPUTE, in_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+            }
+#else
+            for (int i = 0; i < vc_count; i++) {
+                lsc_block_store<data_type,
+                                SIMD_COMPUTE,
+                                lsc_data_size::default_size,
+                                cache_hint::uncached,
+                                cache_hint::uncached>(
+                    mdfi_buffer + rr * size + offset + i * SIMD_COMPUTE,
+                    in_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+                lsc_block_store<data_type,
+                                SIMD_COMPUTE,
+                                lsc_data_size::default_size,
+                                cache_hint::uncached,
+                                cache_hint::uncached>(
+                    out_ptr + rr * size + offset + i * SIMD_COMPUTE,
+                    in_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+            }
+            for (int i = 0; i < count; i++) {
+                if (rr != (temp_rank ^ 1) || !in_place) {
+                    mdfi_buffer[rr * size + offset + vc_count * SIMD_COMPUTE + i] =
+                        in_buffer[r * SIMD_COMPUTE * UNROLL_SIZE + vc_count * SIMD_COMPUTE + i];
+                }
+                if (rr != temp_rank || !in_place) {
+                    out_ptr[rr * size + offset + vc_count * SIMD_COMPUTE + i] =
+                        in_buffer[r * SIMD_COMPUTE * UNROLL_SIZE + vc_count * SIMD_COMPUTE + i];
+                }
+            }
+#endif
+        }
+    }
+    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::flushl3, lsc_scope::system>();
+}
+
+//constexpr sycl::specialization_id<uint32_t> temp_world_const;
+
+template <typename dtype>
+class AllgatherMediumKernel_local_copy;
+template <typename dtype>
+class AllgatherMediumKernel_read_write;
+template <typename dtype>
+class AllgatherMediumKernel_write_output;
+
+template <typename dtype>
+class AllgatherMediumKernel_nocopy_read_write;
+
+template <typename dtype>
+class AllgathervMediumKernel_GlobalSync;
+template <typename dtype>
+class AllgathervMediumKernel_LocalSync;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK>
+class sycl_allgatherv_medium : public sycl_coll_base<data_type> {
+public:
+    sycl_allgatherv_medium() : sycl_coll_base<data_type>() {
+        buffer_index = 0;
+        size_per_buffer = 0;
+    }
+
+    void init(sycl::queue &queue, ccl_comm *comm, ccl_stream *stream, uint32_t rank_in, uint32_t world_in) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+        rank = rank_in;
+        world = world_in;
+        max_count_per_rank = (MAX_COUNT + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE) *
+                             SIMD_COMPUTE * UNROLL_SIZE;
+        data_size_per_buffer = max_count_per_rank * (world / 2 + 1);
+        size_per_buffer = data_size_per_buffer * sizeof(data_type) + SYNC_BYTE;
+
+        void *local_triple_buffer = sycl::malloc_device(size_per_buffer * BUFFER_COUNT, queue);
+        auto e = queue.memset(local_triple_buffer, 0, size_per_buffer * BUFFER_COUNT);
+        e.wait();
+
+        this->exchange_peer_ipc_mem(queue,
+                                    comm,
+                                    stream,
+                                    local_triple_buffer,
+                                    NULL,
+                                    rank,
+                                    world,
+                                    data_size_per_buffer * sizeof(data_type),
+                                    (void **)buffers,
+                                    (void **)sync_buffer,
+                                    offsets,
+                                    ipc_handle,
+                                    NULL,
+                                    NULL /* mmap_buffers */,
+                                    false /* to_cache */);
+        this->initialized = true;
+
+        global_stream = stream;
+        global_comm = comm;
+        even_comm = global_comm->get_even_comm().get();
+    }
+
+    ccl::event allgatherv(sycl::queue &queue,
+                          const void *send_buf,
+                          size_t send_count,
+                          void *recv_buf,
+                          const ccl::vector_class<size_t> &recv_counts,
+                          bool &done) {
+        done = true;
+        if ((send_count * sizeof(data_type)) % 4 != 0) {
+            done = false;
+            return ccl::event();
+        }
+        if (ccl::global_data::env().allgatherv_use_tmp_buf) {
+            return allgatherv_copy(queue, send_buf, send_count, recv_buf, recv_counts);
+        }
+        else {
+            return allgatherv_nocopy(queue, send_buf, send_count, recv_buf, recv_counts);
+        }
+    }
+
+private:
+    ccl::event allgatherv_copy(sycl::queue &queue,
+                               const void *send_buf,
+                               size_t send_count,
+                               void *recv_buf,
+                               const ccl::vector_class<size_t> &recv_counts) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+
+        for (uint32_t i = 0; i < recv_counts.size(); i++) {
+            if (recv_counts[i] != send_count) {
+                CCL_THROW("not all recv_counts are the same as send_count\n");
+            }
+        }
+
+        uint32_t myoffset = 0;
+        if (send_buf == recv_buf)
+            myoffset = send_count * temp_rank;
+
+        int even_ranks[max_rank];
+        int myrank;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank)
+                myrank = i;
+            //printf("even rank %d: %d neighbor: %d\n", i, even_ranks[i], even_ranks[i] ^ 1);
+        }
+
+        int chunk_size = max_count_per_rank;
+        int buffer_index_kernel = buffer_index;
+
+        int size_per_buffer_kernel __attribute__((unused)) = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel __attribute__((unused)) =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+
+        int threads_already_processed = 0;
+        //uint32_t total_threads_needed_sync __attribute__((unused)) = 1;
+        int outerloop_iter_count = (send_count + max_count_per_rank - 1) / max_count_per_rank;
+
+        for (int outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++) {
+            uint32_t threads_needed_per_chunk;
+            uint32_t total_threads_needed __attribute__((unused));
+            if ((outer_iter + 1) * max_count_per_rank < (int)send_count) {
+                threads_needed_per_chunk = max_count_per_rank / (SIMD_COMPUTE * UNROLL_SIZE);
+            }
+            else {
+                uint32_t leftover = send_count - outer_iter * max_count_per_rank;
+                threads_needed_per_chunk =
+                    (leftover + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE);
+            }
+            int wg_size __attribute__((unused)) = 1;
+            total_threads_needed = threads_needed_per_chunk;
+
+            int innerloop_iter_count __attribute__((unused)) =
+                (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+
+            // FIRST KERNEL
+            e = queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for<class AllgatherMediumKernel_local_copy<data_type>>(
+                    sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                    {
+                    uint32_t idx = idx2.get_global_id();
+                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                        if ((uint32_t)index >= total_threads_needed)
+                            break;
+
+                        local_copy<data_type>(index,
+                                              send_buf,
+                                              myoffset,
+                                              send_count,
+                                              threads_already_processed,
+                                              (void **)temp_buffer,
+                                              temp_rank,
+                                              size_per_buffer_kernel,
+                                              buffer_index_kernel);
+                    }
+                    });
+            });
+
+            e = global_sync(
+                queue, temp_rank, temp_world, size_per_buffer_for_sync_kernel * buffer_index_kernel, 1, 0);
+
+            // SECOND KERNEL
+            e = queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for<class AllgatherMediumKernel_read_write<data_type>>(
+                    sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                    {
+                    //ESIMD kernel
+                    uint32_t idx = idx2.get_global_id();
+                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                        if ((uint32_t)index >= total_threads_needed)
+                            break;
+
+                        switch (temp_world) {
+                            case 2:
+                                read_write<2, data_type>((int *)even_ranks,
+                                                         myrank,
+                                                         index,
+                                                         send_buf,
+                                                         myoffset,
+                                                         recv_buf,
+                                                         send_count,
+                                                         threads_already_processed,
+                                                         (void **)temp_buffer,
+                                                         temp_rank,
+                                                         outer_iter,
+                                                         size_per_buffer_kernel,
+                                                         buffer_index_kernel,
+                                                         chunk_size);
+                                break;
+                            case 4:
+                                read_write<4, data_type>((int *)even_ranks,
+                                                         myrank,
+                                                         index,
+                                                         send_buf,
+                                                         myoffset,
+                                                         recv_buf,
+                                                         send_count,
+                                                         threads_already_processed,
+                                                         (void **)temp_buffer,
+                                                         temp_rank,
+                                                         outer_iter,
+                                                         size_per_buffer_kernel,
+                                                         buffer_index_kernel,
+                                                         chunk_size);
+                                break;
+                            case 6:
+                                read_write<6, data_type>((int *)even_ranks,
+                                                         myrank,
+                                                         index,
+                                                         send_buf,
+                                                         myoffset,
+                                                         recv_buf,
+                                                         send_count,
+                                                         threads_already_processed,
+                                                         (void **)temp_buffer,
+                                                         temp_rank,
+                                                         outer_iter,
+                                                         size_per_buffer_kernel,
+                                                         buffer_index_kernel,
+                                                         chunk_size);
+                                break;
+                            case 8:
+                                read_write<8, data_type>((int *)even_ranks,
+                                                         myrank,
+                                                         index,
+                                                         send_buf,
+                                                         myoffset,
+                                                         recv_buf,
+                                                         send_count,
+                                                         threads_already_processed,
+                                                         (void **)temp_buffer,
+                                                         temp_rank,
+                                                         outer_iter,
+                                                         size_per_buffer_kernel,
+                                                         buffer_index_kernel,
+                                                         chunk_size);
+                                break;
+                            case 10:
+                                read_write<10, data_type>((int *)even_ranks,
+                                                          myrank,
+                                                          index,
+                                                          send_buf,
+                                                          myoffset,
+                                                          recv_buf,
+                                                          send_count,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          chunk_size);
+                                break;
+                            case 12:
+                                read_write<12, data_type>((int *)even_ranks,
+                                                          myrank,
+                                                          index,
+                                                          send_buf,
+                                                          myoffset,
+                                                          recv_buf,
+                                                          send_count,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          chunk_size);
+                                break;
+                            case 14:
+                                read_write<14, data_type>((int *)even_ranks,
+                                                          myrank,
+                                                          index,
+                                                          send_buf,
+                                                          myoffset,
+                                                          recv_buf,
+                                                          send_count,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          chunk_size);
+                                break;
+                            case 16:
+                                read_write<16, data_type>((int *)even_ranks,
+                                                          myrank,
+                                                          index,
+                                                          send_buf,
+                                                          myoffset,
+                                                          recv_buf,
+                                                          send_count,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          chunk_size);
+                                break;
+                            default: break;
+                        }
+                    }
+                });
+            });
+
+            e = global_sync(
+                queue, temp_rank, temp_world, size_per_buffer_for_sync_kernel * buffer_index_kernel, 2, 1);
+
+            // THIRD KERNEL
+            e = queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for<class AllgatherMediumKernel_write_output<data_type>>(
+                    sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                    {
+                    uint32_t idx = idx2.get_global_id();
+                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                        if ((uint32_t)index >= total_threads_needed)
+                            break;
+
+                        switch (temp_world) {
+                            case 2:
+                                write_output<2, data_type>((int *)even_ranks,
+                                                           myrank,
+                                                           index,
+                                                           recv_buf,
+                                                           send_count,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           chunk_size);
+                                break;
+                            case 4:
+                                write_output<4, data_type>((int *)even_ranks,
+                                                           myrank,
+                                                           index,
+                                                           recv_buf,
+                                                           send_count,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           chunk_size);
+                                break;
+                            case 6:
+                                write_output<6, data_type>((int *)even_ranks,
+                                                           myrank,
+                                                           index,
+                                                           recv_buf,
+                                                           send_count,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           chunk_size);
+                                break;
+                            case 8:
+                                write_output<8, data_type>((int *)even_ranks,
+                                                           myrank,
+                                                           index,
+                                                           recv_buf,
+                                                           send_count,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           chunk_size);
+                                break;
+                            case 10:
+                                write_output<10, data_type>((int *)even_ranks,
+                                                            myrank,
+                                                            index,
+                                                            recv_buf,
+                                                            send_count,
+                                                            threads_already_processed,
+                                                            (void **)temp_buffer,
+                                                            temp_rank,
+                                                            outer_iter,
+                                                            size_per_buffer_kernel,
+                                                            buffer_index_kernel,
+                                                            chunk_size);
+                                break;
+                            case 12:
+                                write_output<12, data_type>((int *)even_ranks,
+                                                            myrank,
+                                                            index,
+                                                            recv_buf,
+                                                            send_count,
+                                                            threads_already_processed,
+                                                            (void **)temp_buffer,
+                                                            temp_rank,
+                                                            outer_iter,
+                                                            size_per_buffer_kernel,
+                                                            buffer_index_kernel,
+                                                            chunk_size);
+                                break;
+                            case 14:
+                                write_output<14, data_type>((int *)even_ranks,
+                                                            myrank,
+                                                            index,
+                                                            recv_buf,
+                                                            send_count,
+                                                            threads_already_processed,
+                                                            (void **)temp_buffer,
+                                                            temp_rank,
+                                                            outer_iter,
+                                                            size_per_buffer_kernel,
+                                                            buffer_index_kernel,
+                                                            chunk_size);
+                                break;
+                            case 16:
+                                write_output<16, data_type>((int *)even_ranks,
+                                                            myrank,
+                                                            index,
+                                                            recv_buf,
+                                                            send_count,
+                                                            threads_already_processed,
+                                                            (void **)temp_buffer,
+                                                            temp_rank,
+                                                            outer_iter,
+                                                            size_per_buffer_kernel,
+                                                            buffer_index_kernel,
+                                                            chunk_size);
+                                break;
+                            default: break;
+                        }
+                    }
+                });
+            });
+            //e.wait();
+
+            threads_already_processed += total_threads_needed;
+            buffer_index++;
+            buffer_index %= BUFFER_COUNT;
+            buffer_index_kernel = buffer_index;
+        } // end of for outer_iter
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // No-copy allgatherv: exchanges IPC pointers to the user buffers so each
+    // rank's kernel reads peers' send buffers and writes peers' recv buffers
+    // directly, without staging through the local temp buffer.
+    // Requires every entry of recv_counts to equal send_count.
+    ccl::event allgatherv_nocopy(sycl::queue &queue,
+                                 const void *send_buf,
+                                 size_t send_count,
+                                 void *recv_buf,
+                                 const ccl::vector_class<size_t> &recv_counts) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+
+        // NOTE(review): temp_buffer/temp_sync_buffer are filled here but never
+        // read anywhere below — looks like dead code carried over from the
+        // copy-based path; consider removing.
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+
+        // homogeneous-count precondition for this kernel
+        for (uint32_t i = 0; i < recv_counts.size(); i++) {
+            if (recv_counts[i] != send_count) {
+                CCL_THROW("not all recv_counts are the same as send_count\n");
+            }
+        }
+
+        // Map even_comm's local ranks to global ranks; myrank is this rank's
+        // index within that mapping.
+        // NOTE(review): myrank stays uninitialized if temp_rank never appears in
+        // even_ranks — presumably guaranteed by the caller; confirm.
+        int even_ranks[max_rank];
+        int myrank;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank)
+                myrank = i;
+            //printf("[%d] even rank %d: %d neighbor: %d\n", temp_rank, i, even_ranks[i], even_ranks[i] ^ 1);
+        }
+
+        // in_buffers/out_buffers receive IPC-mapped pointers to every rank's
+        // send_buf/recv_buf respectively.
+        void *in_buffers[max_rank];
+        void *out_buffers[max_rank];
+        LOG_DEBUG("No-copy kernel calling exchange_peer_ipc_mem");
+        //printf("[%d] before exchange: %p %p \n", rank, send_buf, recv_buf);
+        this->exchange_peer_ipc_mem(queue,
+                                    global_comm,
+                                    global_stream,
+                                    (void **)send_buf,
+                                    recv_buf,
+                                    rank,
+                                    world,
+                                    0,
+                                    (void **)in_buffers,
+                                    NULL,
+                                    NULL,
+                                    NULL,
+                                    (void **)out_buffers);
+
+        int buffer_index_kernel __attribute__((unused)) = buffer_index;
+        int size_per_buffer_kernel __attribute__((unused)) = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel __attribute__((unused)) =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+
+        //int threads_already_processed = 0;
+        uint32_t total_threads_needed_sync __attribute__((unused)) = 1;
+        //outerloop_iter_count = (send_count + max_count_per_rank - 1) / max_count_per_rank;
+
+        // one thread handles SIMD_COMPUTE * UNROLL_SIZE elements
+        uint32_t threads_needed_per_chunk;
+        uint32_t total_threads_needed __attribute__((unused));
+        threads_needed_per_chunk = (send_count + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE);
+        int wg_size __attribute__((unused)) = 1;
+        total_threads_needed = threads_needed_per_chunk;
+
+        int innerloop_iter_count __attribute__((unused)) =
+            (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+
+        //        uint32_t total_threads_dispatched = (total_threads_needed + wg_size - 1) / wg_size * wg_size;
+        //        uint32_t total_wg_count = total_threads_dispatched / wg_size;
+
+        // persistent-kernel launch: cap at HW_THREAD_COUNT and let each thread
+        // stride over its chunks (see the inner_iter loop below)
+        uint32_t persist_threads_needed = total_threads_needed;
+        if (persist_threads_needed > HW_THREAD_COUNT)
+            persist_threads_needed = HW_THREAD_COUNT;
+
+            //printf("innerloop_iter_count: %d total_threads_needed:%d persist_threads_needed:%d \n", innerloop_iter_count, total_threads_needed, persist_threads_needed);
+
+#if 0
+        persist_threads_needed = (persist_threads_needed + wg_size - 1) / wg_size * wg_size;
+        uint32_t total_wg_count = persist_threads_needed / wg_size;
+#endif
+
+        // a GPU barrier to make sure all ranks are ready
+        e = global_sync(queue, temp_rank, temp_world, size_per_buffer_for_sync_kernel * buffer_index_kernel, 0, 0);
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            // The specialization-constant path below is disabled; the world size
+            // is dispatched through the switch inside the kernel instead.
+            //            cgh.set_specialization_constant<temp_world_const>(temp_world);
+            cgh.parallel_for<class AllgatherMediumKernel_nocopy_read_write<data_type>>(
+                sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                {
+                uint32_t idx = idx2.get_global_id();
+
+                //uint32_t temp_world_kernel = h.get_specialization_constant<temp_world_const>();
+
+                //ESIMD kernel: each HW thread processes chunks idx, idx + HW_THREAD_COUNT, ...
+                for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                    int index = idx + inner_iter * HW_THREAD_COUNT;
+                    if ((uint32_t)index >= total_threads_needed)
+                        break;
+
+                    // world size must be a template constant for nocopy_read_write
+                    switch (temp_world) {
+                        case 2:
+                            nocopy_read_write<2, data_type>((int *)even_ranks,
+                                                            myrank,
+                                                            index,
+                                                            (void **)in_buffers,
+                                                            (void **)out_buffers,
+                                                            send_count,
+                                                            temp_rank);
+                            break;
+                        case 4:
+                            nocopy_read_write<4, data_type>((int *)even_ranks,
+                                                            myrank,
+                                                            index,
+                                                            (void **)in_buffers,
+                                                            (void **)out_buffers,
+                                                            send_count,
+                                                            temp_rank);
+                            break;
+                        case 6:
+                            nocopy_read_write<6, data_type>((int *)even_ranks,
+                                                            myrank,
+                                                            index,
+                                                            (void **)in_buffers,
+                                                            (void **)out_buffers,
+                                                            send_count,
+                                                            temp_rank);
+                            break;
+                        case 8:
+                            nocopy_read_write<8, data_type>((int *)even_ranks,
+                                                            myrank,
+                                                            index,
+                                                            (void **)in_buffers,
+                                                            (void **)out_buffers,
+                                                            send_count,
+                                                            temp_rank);
+                            break;
+                        case 10:
+                            nocopy_read_write<10, data_type>((int *)even_ranks,
+                                                             myrank,
+                                                             index,
+                                                             (void **)in_buffers,
+                                                             (void **)out_buffers,
+                                                             send_count,
+                                                             temp_rank);
+                            break;
+                        case 12:
+                            nocopy_read_write<12, data_type>((int *)even_ranks,
+                                                             myrank,
+                                                             index,
+                                                             (void **)in_buffers,
+                                                             (void **)out_buffers,
+                                                             send_count,
+                                                             temp_rank);
+                            break;
+                        case 14:
+                            nocopy_read_write<14, data_type>((int *)even_ranks,
+                                                             myrank,
+                                                             index,
+                                                             (void **)in_buffers,
+                                                             (void **)out_buffers,
+                                                             send_count,
+                                                             temp_rank);
+                            break;
+                        case 16:
+                            nocopy_read_write<16, data_type>((int *)even_ranks,
+                                                             myrank,
+                                                             index,
+                                                             (void **)in_buffers,
+                                                             (void **)out_buffers,
+                                                             send_count,
+                                                             temp_rank);
+                            break;
+                        default: break;
+                    }
+                }
+            });
+        });
+
+#if 1
+        // all ranks must finish before anyone consumes recv_buf
+        e = global_sync(queue, temp_rank, temp_world, size_per_buffer_for_sync_kernel * buffer_index_kernel, 1, 1);
+#else
+        // sync two tiles of a same GPU before exiting the call
+        e = local_sync(queue, temp_rank, temp_world, size_per_buffer_for_sync_kernel * buffer_index_kernel, 1, 1);
+#endif
+
+        //threads_already_processed += total_threads_needed;
+        // advance the sync-slot ring for the next call
+        buffer_index++;
+        buffer_index %= BUFFER_COUNT;
+        //buffer_index_kernel = buffer_index;
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // Global barrier across all ranks, used before consuming results.
+    // Protocol: every rank atomically increments counter lane `index` at
+    // int-offset `offset` in EVERY rank's sync buffer, then spins on its own
+    // counter until it reaches temp_world. When `reset` is nonzero the counter
+    // is zeroed afterwards so the slot can be reused on the next call.
+    // offset = size_per_buffer_for_sync_kernel * buffer_index_kernel
+    sycl::event global_sync(sycl::queue queue,
+                            int temp_rank,
+                            uint32_t temp_world,
+                            int offset,
+                            int index,
+                            int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        // copy member pointers to a local array so the device lambda captures by value
+        void *temp_sync_buffer[max_rank];
+        for (uint32_t i = 0; i < temp_world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1;
+        int wg_size = 1;
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class AllgathervMediumKernel_GlobalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                //ESIMD kernel
+                simd<ushort, SIMD_SYNC> ramp;
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //other ranks might still be computing, so all ranks must rendezvous here
+                //before the results in the temp buffers are consumed.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true; // only lane `index` participates in the atomics
+
+                //signal arrival: bump the counter on every rank's sync buffer
+                for (uint32_t i = 0; i < temp_world; i++) {
+                    int *sync_ptr = (int *)temp_sync_buffer[i] + offset;
+                    ////never true. Used to force dependency with prev kernel
+                    //if (total_threads_needed_sync == 0x7fffffff)
+                    //    sync_ptr = temp_buffer[0];
+                    lsc_atomic_update<atomic_op::inc,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(sync_ptr, ramp, pred);
+                }
+
+                //spin on this rank's own counter until all temp_world ranks have arrived
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                while (status0[index] != temp_world) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true;
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    // Barrier between the two tiles (rank pair) of one GPU: each tile bumps
+    // counter lane `index` at int-offset `offset` on both its own and its
+    // neighbor's (temp_rank ^ 1) sync buffer, then spins on its own counter
+    // until both tiles (RANKS_PER_GPU) have arrived. `reset` re-zeroes the
+    // counter afterwards for reuse.
+    sycl::event local_sync(sycl::queue queue,
+                           int temp_rank,
+                           uint32_t temp_world,
+                           int offset,
+                           int index,
+                           int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        // copy member pointers to a local array so the device lambda captures by value
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1;
+        int wg_size = 1;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class AllgathervMediumKernel_LocalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                //ESIMD kernel
+                simd<ushort, SIMD_SYNC> ramp;
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //sync only the rank pair within the same gpu.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true; // only lane `index` participates in the atomics
+
+                //signal arrival on both tiles' counters (peer tile = temp_rank ^ 1)
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank ^ 1] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+                sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+
+                //spin on this tile's own counter until both tiles of the GPU arrived
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                while (status0[index] != RANKS_PER_GPU) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true;
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    // Tears down the IPC state: closes every peer's mapped handle (rebasing the
+    // pointer by the stored offset first), frees this rank's local buffer, and
+    // marks the object uninitialized so init() must run again before use.
+    void release(sycl::queue &queue) {
+        // Clean up, close/put ipc handles, free memory, etc.
+        auto l0_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(queue.get_context());
+        for (int i = 0; i < world; i++) {
+            if (i != rank) {
+                // buffers[i] was offset into the mapped allocation; subtract to get the base
+                ZE_CALL(zeMemCloseIpcHandle, (l0_ctx, (char *)buffers[i] - offsets[i]));
+            }
+        }
+
+        sycl::free(buffers[rank], queue);
+        this->initialized = false;
+    }
+
+private:
+    void *buffers[max_rank]; // per-rank temp buffer pointers (peers are IPC-mapped; see release())
+    void *sync_buffer[max_rank]; // per-rank pointers to the atomic sync-counter area
+    size_t offsets[max_rank]; // offset of buffers[i] from its IPC allocation base
+    ze_ipc_mem_handle_t ipc_handle[max_rank]; // Level Zero IPC handles received from peers
+    int rank{ ccl::utils::invalid_rank }, world{ ccl::utils::invalid_err_code };
+    int buffer_index{ ccl::utils::invalid_err_code }; // current slot in the BUFFER_COUNT ring
+    int size_per_buffer{ ccl::utils::invalid_bytes_value }; // bytes per ring slot
+    int max_count_per_rank{ ccl::utils::initial_count_value };
+    int data_size_per_buffer{ ccl::utils::invalid_bytes_value };
+    ccl_stream *global_stream{};
+    ccl_comm *global_comm{};
+    ccl_comm *even_comm{}; // sub-communicator used to build the even_ranks mapping
+};
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_small_sycl.cpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_small_sycl.cpp
new file mode 100644
index 000000000..2463fe01a
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_small_sycl.cpp
@@ -0,0 +1,67 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allgatherv/sycl/allgatherv_small_sycl.hpp"
+
+// Process-wide kernel objects, one per supported datatype.
+sycl_allgatherv_small<sycl::half> agv_small_fp16;
+sycl_allgatherv_small<sycl::_V1::ext::oneapi::bfloat16> agv_small_bf16;
+sycl_allgatherv_small<float> agv_small_fp32;
+sycl_allgatherv_small<int> agv_small_int32;
+
+// Case helper: lazily initialize the matching global object exactly once.
+#define SWITCH_INIT_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        if (!agv_small_##TYPE.inited()) { \
+            LOG_INFO("invoking allgatherv small kernel first time for datatype: ", ccl_type); \
+            agv_small_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+        break;
+
+// Ensures the small-allgatherv kernel object for `dtype` is initialized
+// (allocates + IPC-exchanges the staging buffers on first use).
+// NOTE(review): assert(0) compiles out under NDEBUG, so an unsupported
+// datatype is silently ignored in release builds.
+void init_allgatherv_small(ccl::datatype dtype,
+                           sycl::queue &queue,
+                           ccl_comm *comm,
+                           ccl_stream *stream,
+                           uint32_t rank_in,
+                           uint32_t world_in) {
+    switch (dtype) {
+        SWITCH_INIT_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_INIT_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_INIT_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_INIT_TYPE(int32, ccl::datatype::int32)
+        default: assert(0);
+    }
+}
+
+// Case helper: run the collective via the matching global kernel object.
+#define SWITCH_RUN_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        e = agv_small_##TYPE.allgatherv(queue, send_buf, send_count, recv_buf, recv_counts, done); \
+        break;
+
+// Runs the small-count allgatherv for `dtype`. `done` is set to false when the
+// kernel cannot handle this size/alignment and the caller must fall back to
+// another path. Asserts on unsupported datatypes (no-op under NDEBUG).
+ccl::event run_allgatherv_small(ccl::datatype dtype,
+                                sycl::queue queue,
+                                const void *send_buf,
+                                size_t send_count,
+                                void *recv_buf,
+                                const ccl::vector_class<size_t> &recv_counts,
+                                bool &done) {
+    ccl::event e;
+    switch (dtype) {
+        SWITCH_RUN_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_RUN_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_RUN_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_RUN_TYPE(int32, ccl::datatype::int32)
+        default: assert(0);
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_small_sycl.hpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_small_sycl.hpp
new file mode 100644
index 000000000..253a695a7
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_small_sycl.hpp
@@ -0,0 +1,649 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+// Tuning parameters for the small-allgatherv kernels.
+#define SIMD_MAX       256
+#define SIMD           (SIMD_MAX / sizeof(data_type)) // elements per SIMD op; expects data_type in scope
+#define SIMD_ATOMIC    16
+#define MAX_RANK       16
+#define UNROLL_SIZE    1
+#define TRIPLE_BUFFER  3 // ring of 3 staging slots
+#define SYNC_BYTE      (SIMD_ATOMIC * sizeof(int) * 2) // sync-counter area appended to each slot
+#define ALIGNMENT_BYTE 256
+#define EU_COUNT       512
+#define THREADS_PER_EU 8
+#define MAX_THREAD     (EU_COUNT * THREADS_PER_EU)
+#define MAX_COUNT      (SIMD * UNROLL_SIZE * MAX_THREAD)
+#define INIT_SIZE      64
+#define INIT_COUNT     1
+#define SIMD_INIT      (INIT_SIZE * INIT_COUNT)
+
+// Device helper: loads one SIMD*UNROLL_SIZE chunk at element `offset` from
+// every rank's staging buffer (slot buffer_index_kernel of the triple buffer)
+// and stores it into recv_buf at that rank's section (send_count * r + offset).
+// The local rank is skipped for in-place calls (send_buf == recv_buf),
+// presumably because its section already holds the data — confirm with callers.
+// NOTE(review): parameter `myoffset` is never used in this function.
+template <uint32_t TEMP_WORLD, typename data_type>
+ESIMD_INLINE void gather_write(int offset,
+                               const void *send_buf,
+                               int myoffset,
+                               void *recv_buf,
+                               uint32_t send_count,
+                               void *temp_buffer[],
+                               uint32_t temp_rank,
+                               int size_per_buffer_kernel,
+                               int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    // gather phase: read each peer's chunk into one wide register
+    simd<data_type, SIMD * UNROLL_SIZE * TEMP_WORLD> buffer;
+    for (uint32_t r = 0; r < TEMP_WORLD; r++) {
+        if (r == temp_rank && send_buf == recv_buf)
+            continue;
+        data_type *peer_ptr = (data_type *)temp_buffer[r] + (buffer_index_kernel * size_per_buffer_kernel);
+#pragma unroll
+        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+            buffer.template select<SIMD, 1>(r * UNROLL_SIZE * SIMD + unroll_i * SIMD) =
+                lsc_block_load<data_type,
+                               SIMD,
+                               lsc_data_size::default_size,
+                               cache_hint::cached,
+                               cache_hint::cached>(peer_ptr + offset + unroll_i * SIMD);
+        }
+    }
+
+    // scatter phase: write each peer's chunk to its section of recv_buf
+    for (uint32_t r = 0; r < TEMP_WORLD; r++) {
+        if (r == temp_rank && send_buf == recv_buf)
+            continue;
+        uint32_t r_offset = send_count * r + offset;
+#pragma unroll
+        for (int i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                (data_type *)recv_buf + r_offset + i * SIMD,
+                buffer.template select<SIMD, 1>(r * UNROLL_SIZE * SIMD + i * SIMD));
+        }
+    }
+}
+
+template <typename dtype>
+class Allgatherv_small_kernel_esimd;
+template <typename dtype, int wg_size>
+class Allgatherv_small_kernel_scalar;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK, uint32_t max_buffer = 1024 /*KB*/>
+class sycl_allgatherv_small : public sycl_coll_base<data_type> {
+public:
+    // Starts uninitialized; init() must run before allgatherv() (which asserts
+    // on this->initialized).
+    sycl_allgatherv_small() : sycl_coll_base<data_type>() {
+        buffer_index = 0;
+        size_per_buffer = 0;
+    }
+
+    // One-time setup: allocates this rank's triple staging buffer, zeroes it,
+    // and exchanges IPC handles so every rank can map every peer's buffer.
+    void init(sycl::queue &queue, ccl_comm *comm, ccl_stream *stream, uint32_t rank_in, uint32_t world_in) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+        rank = rank_in;
+        world = world_in;
+        // size of one staging slot: rounded up to SIMD*UNROLL granularity, then
+        // to ALIGNMENT_BYTE (expressed in data_type elements), plus the sync area.
+        data_size_per_buffer = ((MAX_COUNT + SIMD * UNROLL_SIZE - 1) / (SIMD * UNROLL_SIZE)) * SIMD * UNROLL_SIZE;
+        data_size_per_buffer = ((data_size_per_buffer * sizeof(data_type) + ALIGNMENT_BYTE - 1) / ALIGNMENT_BYTE) *
+                               ALIGNMENT_BYTE / sizeof(data_type); //aligned size
+        size_per_buffer = data_size_per_buffer * sizeof(data_type) + SYNC_BYTE;
+
+        void *local_triple_buffer = sycl::malloc_device(size_per_buffer * TRIPLE_BUFFER, queue);
+        auto e = queue.memset(local_triple_buffer, 0, size_per_buffer * TRIPLE_BUFFER);
+        e.wait(); // buffers must be zeroed before peers can map them
+        this->exchange_peer_ipc_mem(queue,
+                                    comm,
+                                    stream,
+                                    local_triple_buffer,
+                                    NULL,
+                                    rank,
+                                    world,
+                                    data_size_per_buffer * sizeof(data_type),
+                                    (void **)buffers,
+                                    (void **)sync_buffer,
+                                    offsets,
+                                    ipc_handle,
+                                    NULL,
+                                    NULL /* mmap_buffers */,
+                                    false /* to_cache */);
+        this->initialized = true;
+
+        int wg_size = 1;
+        //dummy kernel to avoid hang. The hang happens when there is no dummy kernel and allreduce() is called right after init().
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<1>({ 1 }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL {
+
+            });
+        });
+        e.wait();
+    }
+
+    // Entry point: validates that all recv_counts equal send_count, then picks
+    // a kernel variant by total element count. `done` is set to false when no
+    // variant applies and the caller must fall back to another algorithm.
+    ccl::event allgatherv(sycl::queue &queue,
+                          const void *send_buf,
+                          size_t send_count,
+                          void *recv_buf,
+                          const ccl::vector_class<size_t> &recv_counts,
+                          bool &done) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        assert(this->initialized == true);
+
+        for (uint32_t i = 0; i < recv_counts.size(); i++) {
+            if (recv_counts[i] != send_count) {
+                // NOTE(review): sibling code reports this condition via CCL_THROW;
+                // consider matching that instead of fprintf + abort.
+                fprintf(stderr, "not all recv_counts are the same as send_count\n");
+                abort();
+            }
+        }
+
+        done = true;
+        // small totals: scalar kernel, work-group size 8 below 3000 elements, else 16
+        if (send_count * world <= MAX_THREAD) {
+            if (send_count * world < 3000)
+                e = allgatherv_scalar<8>(queue, send_buf, send_count, recv_buf, recv_counts);
+            else
+                e = allgatherv_scalar<16>(queue, send_buf, send_count, recv_buf, recv_counts);
+        }
+        else {
+            // ESIMD path requires the per-rank byte count to be 4-byte aligned
+            if ((send_count * sizeof(data_type)) % 4 == 0) {
+                e = allgatherv_esimd(queue, send_buf, send_count, recv_buf, recv_counts);
+            }
+            else {
+                done = false;
+            }
+        }
+
+        return ccl::event::create_from_native(e);
+    }
+
+    sycl::event allgatherv_esimd(sycl::queue &queue,
+                                 const void *send_buf,
+                                 size_t send_count,
+                                 void *recv_buf,
+                                 const ccl::vector_class<size_t> &recv_counts) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+
+        uint32_t myoffset = 0;
+        if (send_buf == recv_buf)
+            myoffset = send_count * temp_rank;
+
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel = size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+
+        int buffer_index_kernel = buffer_index;
+        buffer_index++;
+        buffer_index %= TRIPLE_BUFFER;
+
+        uint32_t total_threads_needed = (send_count + SIMD * UNROLL_SIZE - 1) / (SIMD * UNROLL_SIZE); //ceiling
+        int wg_size = 8;
+        uint32_t total_threads_dispatched = (total_threads_needed + wg_size - 1) / wg_size * wg_size;
+        uint32_t total_wg_count = total_threads_dispatched / wg_size;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<Allgatherv_small_kernel_esimd<data_type>>(
+                sycl::nd_range<1>({ total_threads_dispatched }, wg_size),
+                [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL {
+                    uint32_t idx = idx2.get_global_id();
+
+                    //ESIMD kernel
+                    uint32_t offset = idx * SIMD * UNROLL_SIZE;
+                    simd<data_type, SIMD * UNROLL_SIZE> buffer_small;
+                    simd<ushort, SIMD_ATOMIC> ramp;
+                    simd_mask<SIMD_ATOMIC> pred;
+                    simd<int, SIMD_ATOMIC> status0;
+                    int *local_sync_ptr;
+
+#pragma unroll
+                    for (uint32_t i = 0; i < SIMD_ATOMIC; i++) {
+                        ramp[i] = i * sizeof(int);
+                    }
+
+                    //use the temp buffer for the current rank to copy the data to.
+                    data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+                    local_temp_ptr +=
+                        buffer_index_kernel *
+                        size_per_buffer_kernel; //point to the correct buffer inside the triple buffer
+
+                    //process the input only if the thread is useful
+                    if (idx < total_threads_needed) {
+                    //do copy from input buffer to temp buffer.
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            buffer_small.template select<SIMD, 1>(unroll_i * SIMD) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::cached,
+                                               cache_hint::cached>((data_type *)send_buf + myoffset + offset +
+                                                                   unroll_i * SIMD);
+                        }
+
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            lsc_block_store<data_type,
+                                            SIMD,
+                                            lsc_data_size::default_size,
+                                            cache_hint::uncached,
+                                            cache_hint::uncached>(
+                                local_temp_ptr + offset + unroll_i * SIMD,
+                                buffer_small.template select<SIMD, 1>(unroll_i * SIMD));
+                        }
+                        //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+
+                        //since each threads are copying small chunks of data to temp buffer, all the threads needs to sync globally using atomics within this rank
+                    }
+
+                    //sync locally within local GPU first.
+                    local_sync_ptr = (int *)temp_sync_buffer
+                        [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                    local_sync_ptr += (buffer_index_kernel * size_per_buffer_for_sync_kernel);
+
+                    //if there are more than 1 threads required per rank, then do the local sync within the rank first.
+                    if (total_threads_needed > 1) {
+                        //do local sync in two steps. First using TG barrier. Then global L3 atomics.
+                        uint32_t local_tid = idx2.get_local_linear_id();
+
+                        pred = false;
+                        pred[0] = true;
+                        if (local_tid == 0) {
+                            status0 = lsc_atomic_update<atomic_op::inc,
+                                                        int,
+                                                        SIMD_ATOMIC,
+                                                        lsc_data_size::default_size,
+                                                        cache_hint::none,
+                                                        cache_hint::none>(local_sync_ptr, ramp, pred);
+                            //wait for all the local TG to sync. Then sync the other remote GPUs
+                            while (status0[0] != total_wg_count) {
+                                status0 = lsc_atomic_update<atomic_op::load,
+                                                            int,
+                                                            SIMD_ATOMIC,
+                                                            lsc_data_size::default_size,
+                                                            cache_hint::none,
+                                                            cache_hint::none>(local_sync_ptr, ramp, pred);
+                            }
+                        }
+                        barrier();
+                    }
+
+                    //once the local level sync is done, atomically write its counter to other remote gpus' atomic counter
+                    pred = false;
+                    pred[1] = true; //use different lane for the remote gpu sync
+
+                    if (total_threads_dispatched >= temp_world) {
+                        if (idx < temp_world) {
+                            status0 = total_threads_needed;
+                            int *sync_ptr = (int *)temp_sync_buffer[idx];
+                            sync_ptr += buffer_index_kernel * size_per_buffer_for_sync_kernel;
+                            lsc_atomic_update<atomic_op::add,
+                                              int,
+                                              SIMD_ATOMIC,
+                                              lsc_data_size::default_size,
+                                              cache_hint::none,
+                                              cache_hint::none>(sync_ptr, ramp, status0, pred);
+                        }
+                    }
+                    else if (idx == 0) //one thread in the local gpu notifies the remote gpu of its status.
+                    {
+                        status0 = total_threads_needed;
+                        for (uint32_t i = 0; i < temp_world; i++) {
+                            int *sync_ptr;
+                            sync_ptr = (int *)temp_sync_buffer
+                                [i]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                            sync_ptr += buffer_index_kernel * size_per_buffer_for_sync_kernel;
+                            lsc_atomic_update<atomic_op::add,
+                                              int,
+                                              SIMD_ATOMIC,
+                                              lsc_data_size::default_size,
+                                              cache_hint::none,
+                                              cache_hint::none>(sync_ptr, ramp, status0, pred);
+                        }
+                    }
+
+                    //once the local sync is done, retire useless threads
+                    if (idx >= total_threads_needed)
+                        return;
+
+                    //once all the local TGs are sync, do fence so that other GPU can see.
+                    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+
+                    //wait for completion of the atomic sync
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_ATOMIC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(local_sync_ptr, ramp, pred);
+                    while (status0[1] != total_threads_needed * temp_world) {
+                        status0 = lsc_atomic_update<atomic_op::load,
+                                                    int,
+                                                    SIMD_ATOMIC,
+                                                    lsc_data_size::default_size,
+                                                    cache_hint::none,
+                                                    cache_hint::none>(local_sync_ptr, ramp, pred);
+                    }
+
+                    //reset the sync counter for the next allreduce session. Each rank reset's its own buffer
+                    if (idx == 0) //one thread in the local gpu notifies the remote gpu of its status.
+                    {
+                        int buffer_index_to_reset = (buffer_index_kernel + TRIPLE_BUFFER - 1) % TRIPLE_BUFFER;
+                        status0 = 0;
+                        pred = true;
+                        local_sync_ptr = (int *)temp_sync_buffer
+                            [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        local_sync_ptr += (buffer_index_to_reset * size_per_buffer_for_sync_kernel);
+                        lsc_atomic_update<atomic_op::store,
+                                          int,
+                                          SIMD_ATOMIC,
+                                          lsc_data_size::default_size,
+                                          cache_hint::none,
+                                          cache_hint::none>(
+                            local_sync_ptr, ramp, status0, pred); //reset the first half of sync buffer
+                    }
+
+                    //at this point, all the threads are done copying data from input buffer to temp buffer.
+                    //for (uint32_t r = 0; r < temp_world; r++)  {
+                    if (offset + SIMD * UNROLL_SIZE <= send_count) {
+                        switch (temp_world) {
+                            case 2:
+                                gather_write<2, data_type>(offset,
+                                                           send_buf,
+                                                           myoffset,
+                                                           recv_buf,
+                                                           send_count,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel);
+                                break;
+                            case 4:
+                                gather_write<4, data_type>(offset,
+                                                           send_buf,
+                                                           myoffset,
+                                                           recv_buf,
+                                                           send_count,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel);
+                                break;
+                            case 6:
+                                gather_write<6, data_type>(offset,
+                                                           send_buf,
+                                                           myoffset,
+                                                           recv_buf,
+                                                           send_count,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel);
+                                break;
+                            case 8:
+                                gather_write<8, data_type>(offset,
+                                                           send_buf,
+                                                           myoffset,
+                                                           recv_buf,
+                                                           send_count,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel);
+                                break;
+                            case 10:
+                                gather_write<10, data_type>(offset,
+                                                            send_buf,
+                                                            myoffset,
+                                                            recv_buf,
+                                                            send_count,
+                                                            (void **)temp_buffer,
+                                                            temp_rank,
+                                                            size_per_buffer_kernel,
+                                                            buffer_index_kernel);
+                                break;
+                            case 12:
+                                gather_write<12, data_type>(offset,
+                                                            send_buf,
+                                                            myoffset,
+                                                            recv_buf,
+                                                            send_count,
+                                                            (void **)temp_buffer,
+                                                            temp_rank,
+                                                            size_per_buffer_kernel,
+                                                            buffer_index_kernel);
+                                break;
+                            case 14:
+                                gather_write<14, data_type>(offset,
+                                                            send_buf,
+                                                            myoffset,
+                                                            recv_buf,
+                                                            send_count,
+                                                            (void **)temp_buffer,
+                                                            temp_rank,
+                                                            size_per_buffer_kernel,
+                                                            buffer_index_kernel);
+                                break;
+                            case 16:
+                                gather_write<16, data_type>(offset,
+                                                            send_buf,
+                                                            myoffset,
+                                                            recv_buf,
+                                                            send_count,
+                                                            (void **)temp_buffer,
+                                                            temp_rank,
+                                                            size_per_buffer_kernel,
+                                                            buffer_index_kernel);
+                                break;
+                            default: break;
+                        }
+                    }
+                    else {
+                        for (uint32_t r = 0; r < temp_world; r++) {
+                            data_type *src_ptr = (data_type *)temp_buffer[r] +
+                                                 buffer_index_kernel * size_per_buffer_kernel + offset;
+                            data_type *dest_ptr = (data_type *)recv_buf + send_count * r + offset;
+                            for (size_t i = offset; i < send_count; i++) {
+                                *dest_ptr = *src_ptr;
+                                src_ptr++;
+                                dest_ptr++;
+                            }
+                        }
+                    }
+                });
+        });
+        //e.wait();
+        return e;
+    }
+
+    // Scalar (one element per work-item) small-message allgatherv kernel.
+    //
+    // Phase 1: each rank copies its send_count input elements into its own
+    // slot of a triple-buffered temp area. Phase 2: all ranks synchronize
+    // through atomic counters in sync_buffer (work-group barrier + device
+    // atomics locally, then a cross-rank atomic handshake). Phase 3: every
+    // surviving thread reads one element from the owning peer's temp buffer
+    // into recv_buf.
+    //
+    // wg_size: work-group size for the nd_range launch (asserted >= 8).
+    // Returns the SYCL event of the submitted kernel (not waited on here).
+    template <int wg_size>
+    sycl::event allgatherv_scalar(sycl::queue &queue,
+                                  const void *send_buf,
+                                  size_t send_count,
+                                  void *recv_buf,
+                                  const ccl::vector_class<size_t> &recv_counts) {
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+
+        // Copy the member pointer tables into local arrays so the kernel
+        // lambda captures plain arrays by value (avoids capturing 'this').
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+
+        // In-place operation: this rank's input lives at its own slot inside
+        // recv_buf, offset by rank * send_count elements.
+        uint32_t myoffset = 0;
+        if (send_buf == recv_buf)
+            myoffset = send_count * temp_rank;
+
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel = size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+
+        assert(wg_size >= 8);
+        // One thread per output element (send_count * world); round the
+        // dispatch size up to a whole number of work-groups.
+        uint32_t total_threads_needed = send_count * temp_world;
+        uint32_t total_threads_dispatched = (total_threads_needed + wg_size - 1) / wg_size * wg_size;
+        uint32_t total_wg_count = total_threads_dispatched / wg_size;
+        // Only the first send_count threads participate in the copy phase.
+        uint32_t total_threads_needed_copy = send_count;
+
+        // Rotate through the triple buffer so back-to-back collectives do not
+        // overwrite a slot peers may still be reading.
+        int buffer_index_kernel = buffer_index;
+        buffer_index++;
+        buffer_index %= TRIPLE_BUFFER;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<Allgatherv_small_kernel_scalar<
+                data_type,
+                wg_size>>(sycl::nd_range<1>({ total_threads_dispatched }, wg_size), [=](sycl::nd_item<1> idx2) {
+                uint32_t idx = idx2.get_global_id();
+
+                int *local_sync_ptr;
+
+                //use the temp buffer for the current rank to copy the data to.
+                data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+                local_temp_ptr += buffer_index_kernel *
+                                  size_per_buffer_kernel; //point to the correct buffer inside the triple buffer
+
+                //copy phase: only the first send_count threads move input data.
+                if (idx < total_threads_needed_copy) {
+                    local_temp_ptr[idx] = *((data_type *)send_buf + myoffset + idx);
+                }
+
+                //since each thread copies only one element to the temp buffer, all
+                //threads of this rank must sync globally via atomics before peers read.
+
+                //sync locally within local GPU first.
+                local_sync_ptr = (int *)temp_sync_buffer
+                    [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                local_sync_ptr += buffer_index_kernel * size_per_buffer_for_sync_kernel;
+
+                //if there are more than 1 threads required per rank, then do the local sync within the rank first.
+                if (total_threads_needed > 1) {
+                    //do local sync in two steps. First using TG barrier. Then global L3 atomics.
+                    uint32_t local_tid = idx2.get_local_linear_id();
+
+                    if (local_tid == 0) {
+                        //sync slot [0] counts arrived work-groups of this rank.
+                        sycl::atomic_ref<int,
+                                         sycl::memory_order::relaxed,
+                                         sycl::memory_scope::device,
+                                         sycl::access::address_space::global_space>
+                            atomic_p(local_sync_ptr[0]);
+                        atomic_p += 1;
+
+                        //wait for all the local TG to sync. Then sync the other remote GPUs
+                        uint32_t val = atomic_p.load();
+                        while (val != total_wg_count) {
+                            val = atomic_p.load();
+                        }
+                    }
+                    idx2.barrier();
+                }
+
+                //once the local level sync is done, atomically write its counter to other remote gpus' atomic counter
+                if (total_threads_dispatched >= temp_world) {
+                    //enough threads: thread idx notifies rank idx directly.
+                    if (idx < temp_world) {
+                        uint32_t status0 = total_threads_needed;
+                        int *sync_ptr = (int *)temp_sync_buffer
+                            [idx]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        sync_ptr += buffer_index_kernel * size_per_buffer_for_sync_kernel;
+                        //sync slot [1] accumulates contributions from every rank.
+                        sycl::atomic_ref<int,
+                                         sycl::memory_order::relaxed,
+                                         sycl::memory_scope::device,
+                                         sycl::access::address_space::global_space>
+                            atomic_p(sync_ptr[1]);
+                        atomic_p += status0;
+                    }
+                }
+                else if (idx == 0) //one thread in the local gpu notifies every remote gpu of its status.
+                {
+                    uint32_t status0 = total_threads_needed;
+                    for (uint32_t i = 0; i < temp_world; i++) {
+                        int *sync_ptr;
+                        sync_ptr = (int *)temp_sync_buffer
+                            [i]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        sync_ptr += buffer_index_kernel * size_per_buffer_for_sync_kernel;
+                        sycl::atomic_ref<int,
+                                         sycl::memory_order::relaxed,
+                                         sycl::memory_scope::device,
+                                         sycl::access::address_space::global_space>
+                            atomic_p(sync_ptr[1]);
+                        atomic_p += status0;
+                    }
+                }
+
+                //once the local sync is done, retire useless threads
+                if (idx >= total_threads_needed)
+                    return;
+
+                //NOTE(review): only relaxed device-scope atomics are used; no
+                //explicit cross-GPU fence is issued before peers' temp buffers
+                //are read below. Confirm the platform guarantees visibility.
+
+                //wait for completion of the atomic sync
+                sycl::atomic_ref<int,
+                                 sycl::memory_order::relaxed,
+                                 sycl::memory_scope::device,
+                                 sycl::access::address_space::global_space>
+                    atomic_p(local_sync_ptr[1]);
+                uint32_t val = atomic_p.load();
+                while (val != total_threads_needed * temp_world) {
+                    val = atomic_p.load();
+                }
+
+                //reset the sync counters of the previous triple-buffer slot for
+                //the next allgatherv call. Each rank resets its own buffer.
+                if (idx == 0) //one thread per rank performs the reset.
+                {
+                    int buffer_index_to_reset = (buffer_index_kernel + TRIPLE_BUFFER - 1) % TRIPLE_BUFFER;
+                    local_sync_ptr = (int *)temp_sync_buffer
+                        [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                    local_sync_ptr += buffer_index_to_reset * size_per_buffer_for_sync_kernel;
+                    local_sync_ptr[0] = local_sync_ptr[1] = 0;
+                }
+
+                //at this point, all the threads are done copying data from input buffer to temp buffer.
+                //gather phase: thread idx reads element (idx % send_count) of
+                //rank (idx / send_count) into recv_buf.
+                uint32_t r = idx / send_count;
+                data_type *peer_ptr = (data_type *)temp_buffer[r] + buffer_index_kernel * size_per_buffer_kernel;
+                int ii = idx % send_count;
+                *((data_type *)recv_buf + idx) = peer_ptr[ii];
+            });
+        });
+        return e;
+    }
+
+private:
+    // Per-rank data exchange buffers (triple-buffered), indexed by rank.
+    void *buffers[max_rank]{};
+    // Per-rank atomic sync counter areas, indexed by rank.
+    void *sync_buffer[max_rank]{};
+    // NOTE(review): offsets/ipc_handle appear to support IPC buffer exchange
+    // performed outside this chunk -- confirm against the init code.
+    size_t offsets[max_rank]{};
+    ze_ipc_mem_handle_t ipc_handle[max_rank]{};
+    int rank{ ccl::utils::invalid_rank }, world{ ccl::utils::invalid_err_code };
+    // Index of the triple-buffer slot used by the next collective call.
+    int buffer_index{ ccl::utils::invalid_err_code };
+    // Size in bytes of one slot inside the triple buffer.
+    int size_per_buffer{ ccl::utils::invalid_bytes_value };
+    int data_size_per_buffer{ ccl::utils::invalid_bytes_value };
+};
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_sycl.cpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_sycl.cpp
new file mode 100644
index 000000000..e95178ca3
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_sycl.cpp
@@ -0,0 +1,166 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+#include "coll/algorithms/allgatherv/sycl/allgatherv_sycl.hpp"
+// NOTE(review): the large-message kernel is declared in its own header
+// (allgatherv_large_sycl.hpp), unlike the small/medium kernels declared via
+// macros in allgatherv_sycl.hpp -- consider unifying the two schemes.
+#include "coll/algorithms/allgatherv/sycl/allgatherv_large_sycl.hpp"
+
+// Dispatches an allgatherv to the large-message SYCL kernel, wrapped in ITT
+// profiling markers when enabled. The small-message path is selected by the
+// caller (allgather_sycl) before this function is reached.
+ccl::event allgatherv_impl(const void* send_buf,
+                           size_t send_count,
+                           void* recv_buf,
+                           const ccl::vector_class<size_t>& recv_counts,
+                           ccl::datatype dtype,
+                           const ccl::communicator& comm,
+                           const ccl::stream& op_stream,
+                           const ccl::allgatherv_attr& attr,
+                           const ccl::vector_class<ccl::event>& deps) {
+#ifdef CCL_ENABLE_ITT
+    std::string itt_string = "CCL_ALLGATHERV_SYCL " + std::to_string(send_count);
+    __itt_event coll_create_itt_event = ccl::profile::itt::event_get(itt_string.c_str());
+    ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+    auto ccl_dtype = ccl::global_data::get().dtypes->get(dtype);
+    ccl::event e;
+    if (send_count * ccl_dtype.size() <= ccl::global_data::env().allgatherv_small_size_threshold) {
+        // NOTE(review): intentionally empty -- a default-constructed event is
+        // returned. The small-size case is handled by the caller, so this
+        // branch should be unreachable; confirm, or route it to the small kernel.
+    }
+    else {
+        e = allgatherv_large(send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+    }
+#ifdef CCL_ENABLE_ITT
+    ccl::profile::itt::event_end(coll_create_itt_event);
+#endif //CCL_ENABLE_ITT
+    return e;
+}
+
+namespace ccl {
+namespace v1 {
+
+struct impl_dispatch {
+    // Functor that unwraps a public oneCCL API object (communicator, stream,
+    // ...) into its underlying implementation handle via get_impl().
+    template <class ApiObject>
+    const typename ApiObject::impl_value_t& operator()(const ApiObject& api_obj) {
+        return api_obj.get_impl();
+    }
+};
+
+// Entry point for the SYCL allgatherv implementation. Picks one of the
+// specialized kernels (small / medium / large) based on the per-rank message
+// size and topology, and falls back to the generic ccl scheduler when the
+// large kernel cannot handle the element count / datatype combination.
+//
+// 'done' is set to false when this path cannot service the request at all
+// (e.g. non-uniform recv_counts, or a single tile with a large message) and
+// the caller must use another implementation.
+//
+// Fix vs. original: the "... done" LOG_DEBUG statements in the large-kernel
+// and ccl-scheduler branches were placed AFTER a return and never executed;
+// the result is now assigned to 'e' so the logs run before the final return.
+// A tautological assert after the recv_counts early-return was removed.
+ccl::event allgather_sycl(sycl::queue q,
+                          const void* send_buf,
+                          size_t send_count,
+                          void* recv_buf,
+                          const ccl::vector_class<size_t>& recv_counts,
+                          ccl::datatype dtype,
+                          const ccl::communicator& comm,
+                          const stream& op_stream,
+                          const allgatherv_attr& attr,
+                          const vector_class<event>& deps,
+                          bool& done) {
+    ccl::event e;
+    done = true;
+
+    uint32_t world = comm.size();
+    int rank = comm.rank();
+
+    auto ccl_dtype = ccl::global_data::get().dtypes->get(dtype);
+
+    ccl::impl_dispatch disp;
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    ccl_stream* global_stream = get_stream_ptr(disp(op_stream));
+    const bool is_single_tile = global_comm->get_pair_comm()->size() == 1;
+    const bool has_all_vertices_connected = global_comm->get_topo_manager().has_all_vertices_connected();
+    LOG_DEBUG("|CCL_SYCL| has_all_vertices_connected: ", has_all_vertices_connected);
+
+    // These kernels only support the uniform case where every rank
+    // contributes the same number of elements.
+    for (uint32_t i = 0; i < recv_counts.size(); i++) {
+        if (send_count != recv_counts[i]) {
+            LOG_ERROR("Allgatherv only supports the case when all recv_counts are the same");
+            done = false;
+            return e;
+        }
+    }
+
+    // Single-rank communicator: the result is just a copy of the input.
+    if (world == 1) {
+        sycl::event e_1;
+        if (send_buf != recv_buf) {
+            e_1 = q.memcpy(recv_buf, send_buf, send_count * ccl_dtype.size());
+        }
+        return ccl::event::create_from_native(e_1);
+    }
+
+    if (send_count * ccl_dtype.size() <= ccl::global_data::env().allgatherv_small_size_threshold &&
+        has_all_vertices_connected) {
+        init_allgatherv_small(dtype, q, global_comm, global_stream, rank, world);
+
+#ifdef CCL_ENABLE_ITT
+        __itt_event coll_create_itt_event = ccl::profile::itt::event_get("CCL_ALLGATHERV_SMALL");
+        ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+        LOG_DEBUG("|CCL_SYCL| allgatherv selects small kernel, count: ", send_count, " datatype: ", dtype);
+        e = run_allgatherv_small(dtype, q, send_buf, send_count, recv_buf, recv_counts, done);
+        LOG_DEBUG(
+            "|CCL_SYCL| allgatherv selects small kernel, count: ", send_count, " datatype: ", dtype, " done");
+#ifdef CCL_ENABLE_ITT
+        ccl::profile::itt::event_end(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+    }
+    else if (send_count * ccl_dtype.size() <= ccl::global_data::env().allgatherv_medium_size_threshold &&
+             !is_single_tile) {
+        init_allgatherv_medium(dtype, q, global_comm, global_stream, rank, world);
+
+#ifdef CCL_ENABLE_ITT
+        __itt_event coll_create_itt_event = ccl::profile::itt::event_get("CCL_ALLGATHERV_MEDIUM");
+        ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+        LOG_DEBUG("|CCL_SYCL| allgatherv selects medium kernel: count: ", send_count, " datatype: ", dtype);
+        e = run_allgatherv_medium(dtype, q, send_buf, send_count, recv_buf, recv_counts, done);
+        LOG_DEBUG(
+            "|CCL_SYCL| allgatherv selects medium kernel: count: ", send_count, " datatype: ", dtype, " done");
+#ifdef CCL_ENABLE_ITT
+        ccl::profile::itt::event_end(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+    }
+    else if (!is_single_tile) {
+        if (send_count % 2 == 0 || ccl_dtype.size() >= 4) {
+            LOG_DEBUG("|CCL_SYCL| invoking large allgatherv: count: ", send_count, " datatype: ", dtype);
+            e = allgatherv_impl(
+                send_buf, send_count, recv_buf, recv_counts, dtype, comm, op_stream, attr, deps);
+            LOG_DEBUG(
+                "|CCL_SYCL| allgatherv selects large kernel: count: ", send_count, " datatype: ", dtype, " done");
+        }
+        else {
+            // Odd element count with a small datatype is not supported by the
+            // large kernel; fall back to the generic ccl scheduler.
+            LOG_DEBUG(
+                "[", rank, "] allgatherv selects ccl scheduler send_count: ", send_count, ", datatype: ", dtype);
+            e = disp(comm)->allgatherv(
+                send_buf, send_count, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
+            LOG_DEBUG("[",
+                      rank,
+                      "] allgatherv selects ccl scheduler send_count: ",
+                      send_count,
+                      ", datatype: ",
+                      dtype,
+                      " done");
+        }
+    }
+    else {
+        // Single tile with a large message: no specialized kernel applies.
+        done = false;
+    }
+
+    return e;
+}
+
+} // namespace v1
+} // namespace ccl
diff --git a/src/coll/algorithms/allgatherv/sycl/allgatherv_sycl.hpp b/src/coll/algorithms/allgatherv/sycl/allgatherv_sycl.hpp
new file mode 100644
index 000000000..5b77ddeab
--- /dev/null
+++ b/src/coll/algorithms/allgatherv/sycl/allgatherv_sycl.hpp
@@ -0,0 +1,61 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#define SYCL_ALLGATHERV_FUNCTIONS(MSGSIZE) \
+    void init_allgatherv_##MSGSIZE(ccl::datatype dtype, \
+                                   sycl::queue& queue, \
+                                   ccl_comm* comm, \
+                                   ccl_stream* stream, \
+                                   uint32_t rank_in, \
+                                   uint32_t world_in); \
+    ccl::event run_allgatherv_##MSGSIZE(ccl::datatype dtype, \
+                                        sycl::queue q, \
+                                        const void* send_buf, \
+                                        size_t send_count, \
+                                        void* rev_buf, \
+                                        const ccl::vector_class<size_t>& recv_counts, \
+                                        bool& done);
+
+SYCL_ALLGATHERV_FUNCTIONS(small)
+SYCL_ALLGATHERV_FUNCTIONS(medium)
+
+ccl::event allgatherv_impl(const void* send_buf,
+                           size_t send_count,
+                           void* recv_buf,
+                           const ccl::vector_class<size_t>& recv_counts,
+                           ccl::datatype dtype,
+                           const ccl::communicator& comm,
+                           const ccl::stream& op_stream,
+                           const ccl::allgatherv_attr& attr,
+                           const ccl::vector_class<ccl::event>& deps);
+
+namespace ccl {
+namespace v1 {
+
+ccl::event allgather_sycl(sycl::queue q,
+                          const void* send_buf,
+                          size_t send_count,
+                          void* recv_buf,
+                          const ccl::vector_class<size_t>& recv_counts,
+                          ccl::datatype dtype,
+                          const ccl::communicator& comm,
+                          const stream& op_stream,
+                          const allgatherv_attr& attr,
+                          const vector_class<event>& deps,
+                          bool& done);
+} // namespace v1
+} // namespace ccl
diff --git a/src/coll/algorithms/allreduce/allreduce.cpp b/src/coll/algorithms/allreduce/allreduce.cpp
index df001037e..f11d3dff8 100644
--- a/src/coll/algorithms/allreduce/allreduce.cpp
+++ b/src/coll/algorithms/allreduce/allreduce.cpp
@@ -439,6 +439,7 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched,
                                           ccl_buffer send_buf,
                                           ccl_buffer recv_buf,
                                           size_t count,
+                                          const std::vector<ccl_buffer>& recv_device_bufs,
                                           const ccl_datatype& dtype,
                                           ccl::reduction op,
                                           ccl_comm* comm) {
@@ -465,6 +466,7 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched,
 
     sched->add_barrier();
 
+    // Prepare recv_counts for allgatherv phase
     int comm_size = comm->size();
     size_t main_block_count = count / comm_size;
     size_t last_block_count = main_block_count + count % comm_size;
@@ -473,6 +475,48 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched,
         recv_counts[comm_size - 1] = last_block_count;
     }
 
+    // Due to the allreduce and allgatherv API differences, we have to
+    // prepare device buffers for copy overlapping.
+    // Transform single buffer to the array of buffers with offsets.
+    std::vector<ccl_buffer> recv_device_allgatherv_bufs;
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    // Note: HMEM case does not require copy to device stage
+    bool enable_hmem = (ccl::global_data::env().use_hmem && atl_base_comm::attr.out.enable_hmem);
+    if (!enable_hmem && !recv_device_bufs.empty()) {
+        std::vector<size_t> recv_offset(comm_size, 0);
+        for (int rank_idx = 1; rank_idx < comm_size; rank_idx++) {
+            recv_offset[rank_idx] =
+                recv_offset[rank_idx - 1] + recv_counts[rank_idx - 1] * dtype.size();
+        }
+        // recv_device_bufs array acts only as a storage for the allreduce receive device buffer
+        ccl_buffer recv_device_buf = recv_device_bufs.front();
+        for (int b_idx = 0; b_idx < comm_size; b_idx++) {
+            recv_device_allgatherv_bufs.emplace_back(recv_device_buf + recv_offset[b_idx]);
+        }
+
+        // Express dependency between the reduce_scatter and ze_copy_entry
+        auto signaled_event = ccl::add_signal_event(sched);
+
+        size_t rank = comm->rank();
+        size_t copy_counts = recv_counts[rank];
+
+        // This case can happen if "count < comm_size" held earlier
+        if (copy_counts) {
+            ccl_buffer copy_src = recv_buf + recv_offset[rank];
+            ccl_buffer copy_dst = recv_device_allgatherv_bufs[rank];
+
+            // Submit in-place parallel H2D copy with the next allgatherv operation (in-place init)
+            entry_factory::create<ze_copy_entry>(sched,
+                                                 copy_src,
+                                                 copy_dst,
+                                                 copy_counts,
+                                                 dtype,
+                                                 copy_attr(copy_direction::h2d),
+                                                 std::vector<ze_event_handle_t>{ signaled_event });
+        }
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
     std::vector<ccl_sched*> part_scheds = { sched };
     ccl_coll_build_ring_allgatherv(nullptr,
                                    part_scheds,
@@ -480,6 +524,7 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched,
                                    recv_counts[comm->rank()],
                                    recv_buf,
                                    recv_counts.data(),
+                                   recv_device_allgatherv_bufs,
                                    dtype,
                                    comm);
 
@@ -640,8 +685,15 @@ static void ccl_allreduce_2d_add_allreduce_allgather(ccl_sched* sched,
     // TODO: skip direct algo since it may be started
     // with different order on different ranks
     sched->hint_algo.allgatherv = ccl_coll_allgatherv_ring;
-    ccl_coll_build_allgatherv(
-        sched, rbuf, ar_count, rbuf, ag_recv_counts.data(), dtype, first_dim_comm, false);
+    ccl_coll_build_allgatherv(sched,
+                              rbuf,
+                              ar_count,
+                              rbuf,
+                              ag_recv_counts.data(),
+                              std::vector<ccl_buffer>{},
+                              dtype,
+                              first_dim_comm,
+                              false);
     sched->hint_algo.allgatherv = ccl_coll_allgatherv_undefined;
 }
 
@@ -822,11 +874,13 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched,
     bool use_reduce_scatter_pipeline =
         ccl::global_data::env().reduce_scatter_monolithic_pipeline_kernel &&
         even_comm->size() > 1 && pair_comm->size() > 1 && count >= (size_t)comm_size &&
-        is_multi_card && dtype != ccl::datatype::int8;
+        is_multi_card && dtype != ccl::datatype::int8 &&
+        ccl::global_data::env().enable_ze_bidir_algo;
 
     // allgatherv pipeline uses xelink read and mdfi write
     const bool use_allgatherv_pipeline =
-        ccl::global_data::env().allgatherv_monolithic_pipeline_kernel && count >= (size_t)comm_size;
+        ccl::global_data::env().allgatherv_monolithic_pipeline_kernel &&
+        count >= (size_t)comm_size && even_comm->size() > 1;
 
     size_t base_count = count;
     size_t pair_comm_offset = 0;
@@ -1011,10 +1065,9 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched,
                   pair_comm_offset);
         if (is_single_card) {
             // workaround for hardware issue that MDFI write from kernel
-            // with 8 bit and 16 bit data types are slow. Instead perform
+            // with 8 bit data type is slow. Instead perform
             // MDFI read + reduce in kernel and MDFI write using memcpy
-            if (dtype == ccl::datatype::int8 || dtype == ccl::datatype::float16 ||
-                dtype == ccl::datatype::bfloat16) {
+            if (dtype == ccl::datatype::int8) {
                 auto entry = entry_factory::create<ze_onesided_reduce_entry>(sched,
                                                                              pair_comm_send_buf,
                                                                              pair_comm_recv_buf,
@@ -1169,6 +1222,7 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched,
     coll_param.ctype = ccl_coll_allreduce;
     coll_param.send_buf = even_comm_recv_buf;
     coll_param.recv_buf = even_comm_recv_buf;
+    coll_param.recv_scale_out_bufs = std::vector<ccl_buffer>{ even_comm_recv_buf };
     coll_param.count = block_count;
     coll_param.dtype = dtype;
     coll_param.reduction = op;
@@ -1244,19 +1298,21 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched,
     if (!is_single_card && pair_comm->size() > 1 && !use_allgatherv_pipeline) {
         LOG_DEBUG("topo/scale_up/intra: use ze_onesided_bcast");
         int peer_rank = (pair_comm->rank() + 1) % pair_comm->size();
-        auto entry = entry_factory::create<ze_copy_entry>(sched,
-                                                          recv_buf,
-                                                          ccl_buffer(),
-                                                          base_count,
-                                                          dtype,
-                                                          copy_attr(peer_rank,
-                                                                    recv_buf_idx,
-                                                                    copy_direction::t2t,
-                                                                    false, /*pt2pt_op*/
-                                                                    pair_comm,
-                                                                    pair_comm_offset,
-                                                                    pair_comm_offset),
-                                                          wait_events);
+
+        auto attrs = copy_attr(peer_rank,
+                               recv_buf_idx,
+                               copy_direction::t2t,
+                               false, /*pt2pt_op*/
+                               pair_comm,
+                               pair_comm_offset,
+                               pair_comm_offset);
+
+        if (ccl::global_data::env().allreduce_pipe_chunk_count > 1) {
+            attrs.force_queue_type = ccl::ze::queue_group_type::main;
+        }
+
+        auto entry = entry_factory::create<ze_copy_entry>(
+            sched, recv_buf, ccl_buffer(), base_count, dtype, attrs, wait_events);
         clear_and_push_back(wait_events, entry->entry_event);
         sched->add_barrier();
     }
@@ -1280,154 +1336,20 @@ ccl::status ccl_coll_build_topo_allreduce(ccl_sched* sched,
                                           const ccl_datatype& dtype,
                                           ccl::reduction op,
                                           ccl_comm* comm) {
-    // Note about cache lines and pipelining: The same cache line must contain
-    // a single chunk only.
-    //
-    // If the same cache line contains two chunks (or more), and we parallelize
-    // the instructions required for both chunks, a conflict (race condition)
-    // may appear between the copy-out for the scaleout portion and the
-    // reduce_scatter phase.
-    //
-    // The easiest way to avoid that race condition is to require that each
-    // cache line contains a single entry. If that is not the case, we must not
-    // parallelize the instructions for different chunks.
-
-    size_t chunk_count = ccl::global_data::env().allreduce_pipe_chunk_count;
-    bool is_pipe = chunk_count > 0 && ccl::global_data::env().enable_ze_single_list;
-
-    // TODO: why does oneCCL have CACHELINE_SIZE *and* CCL_KERNEL_MEM_ALIGN?
-    size_t memalign = ccl::global_data::env().kernel_mem_align;
-    size_t buf_size_bytes = count * dtype.size();
-
-    // First, determine if we need to fallback to non-pipelined algorightm.
-    // Such a fallback may happen in cases such as (1) the user requests it,
-    // (2) message fits into a cache line, or (3) the cache line size is not
-    // divisible by the data type size.
-
-    size_t number_of_cache_lines_per_chunk =
-        !is_pipe ? 1 : std::max(memalign, buf_size_bytes / chunk_count) / memalign;
-    size_t main_chunk_size_bytes = memalign * number_of_cache_lines_per_chunk;
-
-    bool is_dtype_non_divisible = main_chunk_size_bytes % dtype.size();
-    bool is_msg_smaller_than_cache_line = buf_size_bytes <= main_chunk_size_bytes;
-
-    bool is_multiworker =
-        ccl::global_data::env().ze_multi_workers && ccl::global_data::env().worker_count > 1;
-
-    if (!is_pipe || is_dtype_non_divisible || is_msg_smaller_than_cache_line || is_multiworker) {
-        // Fall back to topo algorithm without pipelining
-
-        if (!is_pipe) {
-            LOG_DEBUG("Pipelining code disabled");
-        }
-        else if (is_dtype_non_divisible) {
-            LOG_INFO("Running without pipelining because datatype size (",
-                     dtype.size(),
-                     ") is not divisible by cache line size (",
-                     memalign,
-                     ")");
-        }
-        else if (is_msg_smaller_than_cache_line) {
-            LOG_INFO("Running without pipelining because message size (",
-                     buf_size_bytes,
-                     ") is smaller than a cache line (",
-                     memalign,
-                     ") and main_chunk_size_bytes (",
-                     main_chunk_size_bytes,
-                     ")");
-        }
-        else if (is_multiworker) {
-            LOG_INFO(
-                "Running without pipelining because ze_multi_workers was requested with more than one worker");
-        }
-        else {
-            CCL_THROW("Unexpected fallback to non-pipe code");
-        }
-
-        ccl_coll_build_topo_allreduce_fill(sched, send_buf, recv_buf, count, dtype, op, comm);
-
-        entry_factory::create<ze_execute_cmdlists_on_init_entry>(sched);
-
-        return ccl::status::success;
-    }
-
-    LOG_DEBUG("build pipe allreduce");
-
-    size_t main_chunk_count = main_chunk_size_bytes / dtype.size();
-
-    // Need to re-calculate chunk_count after main_chunk_size_bytes calculation
-    // with cache alignment in mind.
-    chunk_count = count / main_chunk_count;
-    size_t last_chunk_count = main_chunk_count + (count % main_chunk_count);
-
-    sched->try_enable_ze_single_list();
-    auto sync_obj = std::make_shared<sync_object>(chunk_count);
-    bool is_parallelizable_chunks = true;
-
-    for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) {
-        size_t chunk_offset = chunk_idx * main_chunk_count * dtype.size();
-        ccl_buffer sbuf = send_buf + chunk_offset;
-        ccl_buffer rbuf = recv_buf + chunk_offset;
-        size_t this_chunk_count =
-            (chunk_idx == (chunk_count - 1)) ? last_chunk_count : main_chunk_count;
-
-        if (this_chunk_count || (count == 0 && chunk_idx == 0)) {
-            entry_factory::create<subsched_entry>(
-                sched,
-                chunk_idx,
-                [sched, sbuf, rbuf, this_chunk_count, dtype, op, comm, sync_obj](ccl_sched* s) {
-                    s->inherit_ze_managers_from(sched);
-                    s->set_init_ze_hook_sync_obj(sync_obj);
-                    s->set_ze_commands_bypass_flag(false);
-
-                    ccl_coll_build_topo_allreduce_fill(
-                        s, sbuf, rbuf, this_chunk_count, dtype, op, comm);
-                },
-                ("ALLREDUCE_PIPE" + std::to_string(chunk_idx)).c_str());
-        }
-        if (chunk_idx > 0) {
-            auto ptr = reinterpret_cast<uintptr_t>(rbuf.get_ptr());
-            auto prev_chunk_last_cache_line = (ptr - 1) / memalign;
-            auto this_chunk_first_cache_line = ptr / memalign;
-
-            if (prev_chunk_last_cache_line == this_chunk_first_cache_line) {
-                // WARNING: previous chunk has part of this chunk's first cache
-                // line. Cannot use pipelining. However, since this is a
-                // "local" decision (i.e., other ranks may decide differently),
-                // we still need to apply chunking. However, we will run one
-                // chunk at a time, without parallelizing them.
-                // Another way to have implemented this would be to link the
-                // last task of the prev chunk with the first of this chunk
-                // with an event.
-                is_parallelizable_chunks = false;
-            }
-        }
-    }
-
-    static bool is_chunk_memalign_warning_printed{};
-    if (!is_parallelizable_chunks && !is_chunk_memalign_warning_printed) {
-        is_chunk_memalign_warning_printed = true;
-        LOG_WARN(
-            "[allreduce pipelining]: For best performance, (i) chunk size should be a multiple of a cache line (",
-            memalign,
-            " bytes), and (ii) buffers in all ranks should be aligned to ",
-            memalign);
-    }
-
-    if (!is_parallelizable_chunks) {
-        ccl::global_data::get()
-            .metrics_profiler->allreduce_pipe_nonparallel_calls_per_count[count]++;
-    }
-    else {
-        ccl::global_data::get().metrics_profiler->allreduce_pipe_parallel_calls_per_count[count]++;
-    }
-
-    entry_factory::create<ze_execute_cmdlists_on_start_entry>(
+    return ccl_build_topo_uniform_buff_size_op(
         sched,
-        sync_obj,
-        is_parallelizable_chunks ? submit_ze_commands_in_subsched_entries : nullptr);
-
-    return ccl::status::success;
+        send_buf,
+        recv_buf,
+        count,
+        dtype.size(),
+        ccl::global_data::env().allreduce_pipe_chunk_count,
+        "ALLREDUCE",
+        ccl::global_data::get().metrics_profiler->allreduce_pipe,
+        [dtype, op, comm](ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, size_t count)
+            -> ccl::status {
+            return ccl_coll_build_topo_allreduce_fill(
+                sched, send_buf, recv_buf, count, dtype, op, comm);
+        });
 }
 
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl.cpp
new file mode 100644
index 000000000..8133096b8
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl.cpp
@@ -0,0 +1,79 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_large_sycl.hpp"
+
+#define MAX_RANK 16
+
+void *allreduce_large_buffer = NULL;
+void *allreduce_large_buffers[MAX_RANK];
+void *allreduce_large_sync_buffer[MAX_RANK];
+size_t allreduce_large_offsets[MAX_RANK];
+ze_ipc_mem_handle_t allreduce_large_ipc_handle[MAX_RANK];
+int allreduce_large_buffer_index = 0;
+
+#define ALLREDUCE_LARGE_API_DECL(TYPE) \
+    void init_allreduce_large_##TYPE(ccl::datatype dtype, \
+                                     sycl::queue &queue, \
+                                     ccl_comm *comm, \
+                                     ccl_stream *stream, \
+                                     uint32_t rank_in, \
+                                     uint32_t world_in); \
+    ccl::event run_allreduce_large_##TYPE( \
+        ccl::datatype dtype, sycl::queue queue, const void *in_buf, void *out_buf, size_t count)
+
+ALLREDUCE_LARGE_API_DECL(fp16);
+ALLREDUCE_LARGE_API_DECL(bf16);
+ALLREDUCE_LARGE_API_DECL(fp32);
+ALLREDUCE_LARGE_API_DECL(int32);
+
+#define SWITCH_INIT_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        init_allreduce_large_##TYPE(dtype, queue, comm, stream, rank_in, world_in); \
+        break;
+
+void init_allreduce_large(ccl::datatype dtype,
+                          sycl::queue &queue,
+                          ccl_comm *comm,
+                          ccl_stream *stream,
+                          uint32_t rank_in,
+                          uint32_t world_in) {
+    switch (dtype) {
+        SWITCH_INIT_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_INIT_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_INIT_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_INIT_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for allreduce"); assert(0);
+    }
+}
+
+#define SWITCH_RUN_TYPE(TYPE, ccl_type) \
+    case ccl_type: e = run_allreduce_large_##TYPE(dtype, queue, in_buf, out_buf, count); break;
+
+ccl::event run_allreduce_large(ccl::datatype dtype,
+                               sycl::queue queue,
+                               const void *in_buf,
+                               void *out_buf,
+                               size_t count) {
+    ccl::event e;
+    switch (dtype) {
+        SWITCH_RUN_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_RUN_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_RUN_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_RUN_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for allreduce"); assert(0);
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl.hpp b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl.hpp
new file mode 100644
index 000000000..c25208112
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl.hpp
@@ -0,0 +1,2086 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+#define MAX_RANK         16
+#define INIT_SIZE        64
+#define INIT_COUNT       1
+#define SIMD_INIT        (INIT_SIZE * INIT_COUNT)
+#define SIMD_COMPUTE_MAX 256
+#define SIMD_COMPUTE     (SIMD_COMPUTE_MAX / sizeof(data_type))
+#define SIMD_SYNC        32
+#define SYNC_BYTE        (SIMD_SYNC * sizeof(int) * 2)
+#define ALIGNMENT_BYTE   256
+#define NOCOPY_MAX_SIZE  (128 * 1024 * 1024)
+#define COPY_MAX_SIZE    (64 * 1024 * 1024)
+//#define EU_COUNT_PER_RANK 448
+#define EU_COUNT_PER_RANK   512
+#define THREAD_COUNT_PER_EU 8
+#define HW_THREAD_COUNT     (EU_COUNT_PER_RANK * THREAD_COUNT_PER_EU)
+#define RANKS_PER_GPU       2
+#define NO_KERNEL           0
+#define FIRST_KERNEL        1
+#define SECOND_KERNEL       2
+#define THIRD_KERNEL        4
+#define FOURTH_KERNEL       8
+#define FIFTH_KERNEL        16
+
+#define NOCOPY_KERNEL_NUM   3
+#define NOCOPY_LAST_KERNEL  THIRD_KERNEL
+#define NOCOPY_BUFFER_COUNT NOCOPY_KERNEL_NUM
+
+#define COPY_KERNEL_NUM   5
+#define COPY_LAST_KERNEL  FIFTH_KERNEL
+#define COPY_BUFFER_COUNT COPY_KERNEL_NUM
+
+extern void *allreduce_large_buffer;
+extern void *allreduce_large_buffers[MAX_RANK];
+extern void *allreduce_large_sync_buffer[MAX_RANK];
+extern size_t allreduce_large_offsets[MAX_RANK];
+extern ze_ipc_mem_handle_t allreduce_large_ipc_handle[MAX_RANK];
+extern int allreduce_large_buffer_index;
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void load_input_to_temp_buffer(int idx,
+                               const void *in_buffer,
+                               uint32_t size,
+                               int threads_already_processed,
+                               void *temp_buffer[],
+                               uint32_t temp_rank,
+                               int size_per_buffer_kernel,
+                               int buffer_index_kernel2) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    uint32_t read_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD;
+    simd<data_type, SIMD_COMPUTE *TEMP_WORLD> buffer = 0;
+
+    if (read_offset + SIMD_COMPUTE * TEMP_WORLD > size) {
+        int count = (size - read_offset + SIMD_COMPUTE - 1) / SIMD_COMPUTE;
+        for (int i = 0; i < count; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)in_buffer + read_offset +
+                                                     i * SIMD_COMPUTE);
+        }
+    }
+    else {
+#pragma unroll
+        for (unsigned int i = 0; i < TEMP_WORLD; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)in_buffer + read_offset +
+                                                     i * SIMD_COMPUTE);
+        }
+    }
+
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    ptr += size_per_buffer_kernel * buffer_index_kernel2;
+    ptr += idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD; i++) {
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::write_back>(
+            ptr + i * SIMD_COMPUTE, buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+    }
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void local_sum_and_distribute_to_remote_ranks(int *even_ranks,
+                                              int myrank,
+                                              int idx,
+                                              const void *in_buffer,
+                                              uint32_t size,
+                                              int threads_already_processed,
+                                              void *temp_buffer[],
+                                              uint32_t temp_rank,
+                                              int size_per_buffer_kernel,
+                                              int buffer_index_kernel2) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int is_odd = (even_ranks[0] == 1);
+    //read the input data
+    data_type *ptr_even =
+        (data_type *)temp_buffer[temp_rank & 0xfffffffe] + is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    data_type *ptr_odd =
+        (data_type *)temp_buffer[temp_rank | 1] + is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    ptr_even +=
+        idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 + size_per_buffer_kernel * buffer_index_kernel2;
+    ptr_odd +=
+        idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 + size_per_buffer_kernel * buffer_index_kernel2;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD> buffer;
+    uint32_t i;
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr_even + i * SIMD_COMPUTE);
+    }
+#pragma unroll
+    for (i = TEMP_WORLD / 2; i < TEMP_WORLD; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr_odd +
+                                               (i - TEMP_WORLD / 2) * SIMD_COMPUTE);
+    }
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> sum;
+    sum = buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(0) +
+          buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(SIMD_COMPUTE * TEMP_WORLD / 2);
+
+    //store the result in at (SIMD_COMPUTE * TEMP_WORLD) offset in remote ranks' temp buffers.
+    //distribute to other ranks. But even(odd) rank goes to other even(odd) rank.
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        data_type *ptr = (data_type *)temp_buffer[even_ranks[i]];
+        ptr += idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 +
+               size_per_buffer_kernel * buffer_index_kernel2 + TEMP_WORLD * SIMD_COMPUTE;
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::write_back>(
+            ptr + (temp_rank / 2) * SIMD_COMPUTE,
+            sum.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+    }
+}
+
+// Reduce step (copy path): each of the TEMP_WORLD/2 same-parity remote ranks has
+// deposited one SIMD_COMPUTE chunk into the second half of this rank's temp slot;
+// sum those chunks into a single SIMD_COMPUTE vector and store it back in place.
+// NOTE(review): in_buffer, size and threads_already_processed are unused here —
+// presumably kept so all pipeline-stage helpers share a uniform signature; confirm.
+template <uint32_t TEMP_WORLD, typename data_type>
+void all_sum(int idx,
+             const void *in_buffer,
+             uint32_t size,
+             int threads_already_processed,
+             void *temp_buffer[],
+             uint32_t temp_rank,
+             int size_per_buffer_kernel,
+             int buffer_index_kernel2) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    int read_offset =
+        idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 +
+        SIMD_COMPUTE *
+            TEMP_WORLD; //points to second half of the temp slot since that's where the data is from other ranks.
+    ptr += read_offset + size_per_buffer_kernel * buffer_index_kernel2;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr + i * SIMD_COMPUTE);
+    }
+    //linear accumulation of the TEMP_WORLD/2 partial chunks into one vector
+    simd<data_type, SIMD_COMPUTE> sum = 0;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        sum = sum + buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i);
+    }
+    //store the result back to same location, only one SIMD_COMPUTE
+    lsc_block_store<data_type,
+                    SIMD_COMPUTE,
+                    lsc_data_size::default_size,
+                    cache_hint::uncached,
+                    cache_hint::write_back> //save the all sum in the second half of the temp slot.
+        (ptr, sum);
+}
+
+// Allgather step (copy path): collect the per-rank sums from the second half of
+// the temp slots of the TEMP_WORLD/2 same-parity ranks, mirror them into the
+// paired rank's (temp_rank ^ 1, same GPU over MDFI) first half-slot, and write
+// this rank's share of the final result to out_buffer, honoring the tail of
+// `size` (full vector stores, then scalar stores for the remainder).
+template <uint32_t TEMP_WORLD, typename data_type>
+void gather_from_remote_and_dist_to_rank_pair(int *even_ranks,
+                                              int idx,
+                                              void *out_buffer,
+                                              uint32_t size,
+                                              int threads_already_processed,
+                                              void *temp_buffer[],
+                                              uint32_t temp_rank,
+                                              int size_per_buffer_kernel,
+                                              int buffer_index_kernel2) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        //read the values
+        data_type *read_ptr = (data_type *)temp_buffer[even_ranks[i]];
+        read_ptr += size_per_buffer_kernel * buffer_index_kernel2;
+        read_ptr += idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 +
+                    SIMD_COMPUTE * TEMP_WORLD; //get the sum from the second half of temp slot
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(read_ptr);
+    }
+
+    //write the data to the pair of ranks within the same gpu
+    //gather in the first half of the slot
+    data_type *mdfi_ptr = (data_type *)temp_buffer[temp_rank ^ 1];
+    mdfi_ptr += size_per_buffer_kernel * buffer_index_kernel2;
+    mdfi_ptr += idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::write_back>(
+            mdfi_ptr + i * SIMD_COMPUTE,
+            buffer.template select<SIMD_COMPUTE, 1>(
+                SIMD_COMPUTE * i)); //save the results in the first half of temp slot
+    }
+
+    //even_ranks[0] == 1 means this rank belongs to the odd-parity group, which
+    //owns the second half of each SIMD_COMPUTE * TEMP_WORLD output chunk.
+    int is_odd = (even_ranks[0] == 1);
+    data_type *out_ptr = (data_type *)out_buffer;
+    uint32_t write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD +
+                            is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    if (write_offset + SIMD_COMPUTE * TEMP_WORLD / 2 <= size) {
+#pragma unroll
+        for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                out_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+    }
+    else if (write_offset < size) {
+        //tail: store whole vectors while they fit, then element-by-element
+        int vec_count = (size - write_offset) / SIMD_COMPUTE;
+        for (int i = 0; i < vec_count; i++) {
+            lsc_block_store<data_type,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                out_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+        int count = size - write_offset - vec_count * SIMD_COMPUTE;
+        for (int i = 0; i < count; i++) {
+            out_ptr[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+        }
+    }
+}
+
+// Final copy-out step (copy path): read the results the paired rank mirrored
+// into the first half of this rank's temp slot, and write them to out_buffer at
+// the half-chunk owned by the *opposite* parity group (1 - is_odd), completing
+// the full SIMD_COMPUTE * TEMP_WORLD output chunk. Tail handling mirrors
+// gather_from_remote_and_dist_to_rank_pair.
+template <uint32_t TEMP_WORLD, typename data_type>
+void write_output(int *even_ranks,
+                  int idx,
+                  void *out_buffer,
+                  uint32_t size,
+                  int threads_already_processed,
+                  void *temp_buffer[],
+                  uint32_t temp_rank,
+                  int size_per_buffer_kernel,
+                  int buffer_index_kernel2) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+    data_type *read_ptr = (data_type *)temp_buffer[temp_rank];
+    read_ptr +=
+        idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 + size_per_buffer_kernel * buffer_index_kernel2;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        //read the values
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(read_ptr + i * SIMD_COMPUTE);
+    }
+
+    int is_odd = (even_ranks[0] == 1);
+    //write out the results
+    data_type *write_ptr = (data_type *)out_buffer;
+    //(1 - is_odd): this rank writes the half-chunk of the pair partner's parity
+    uint32_t write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD +
+                            (1 - is_odd) * SIMD_COMPUTE * TEMP_WORLD / 2;
+    if (write_offset + SIMD_COMPUTE * TEMP_WORLD / 2 <= size) {
+#pragma unroll
+        for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                write_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+    }
+    else if (write_offset < size) {
+        //tail: store whole vectors while they fit, then element-by-element
+        int vec_count = (size - write_offset) / SIMD_COMPUTE;
+        for (int i = 0; i < vec_count; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                write_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+        int count = size - write_offset - vec_count * SIMD_COMPUTE;
+        for (int i = 0; i < count; i++) {
+            write_ptr[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+        }
+    }
+}
+
+// Reduce-scatter step (nocopy path): read this pair's half-chunks directly from
+// the two user input buffers on the same GPU (even rank: temp_rank & ~1, odd
+// rank: temp_rank | 1), add them, and scatter one SIMD_COMPUTE chunk of the sum
+// to each same-parity remote rank's temp buffer. Note the nocopy temp-slot
+// stride is SIMD_COMPUTE * TEMP_WORLD per idx (the copy path uses * 3 / 2).
+// NOTE(review): myrank and size are unused here — presumably kept for a uniform
+// stage-helper signature; confirm.
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_sum_and_distribute_to_remote_ranks(int *even_ranks,
+                                               int myrank,
+                                               int idx,
+                                               void **in_buffers,
+                                               uint32_t size,
+                                               int threads_already_processed,
+                                               void *temp_buffer[],
+                                               uint32_t temp_rank,
+                                               int size_per_buffer_kernel,
+                                               int buffer_index_kernel2) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    uint32_t read_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD;
+    int is_odd = (even_ranks[0] == 1);
+    //read the input data
+    data_type *ptr_even =
+        (data_type *)in_buffers[temp_rank & 0xfffffffe] + is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    data_type *ptr_odd =
+        (data_type *)in_buffers[temp_rank | 1] + is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> sum;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD> buffer;
+    uint32_t i;
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr_even + read_offset +
+                                               i * SIMD_COMPUTE);
+    }
+#pragma unroll
+    for (i = TEMP_WORLD / 2; i < TEMP_WORLD; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr_odd + read_offset +
+                                               (i - TEMP_WORLD / 2) * SIMD_COMPUTE);
+    }
+    //pairwise add: even-rank half + odd-rank half
+    sum = buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(0) +
+          buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(SIMD_COMPUTE * TEMP_WORLD / 2);
+
+    //store the result in at (SIMD_COMPUTE * TEMP_WORLD) offset in remote ranks' temp buffers.
+    //distribute to other ranks. But even(odd) rank goes to other even(odd) rank.
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        data_type *ptr = (data_type *)temp_buffer[even_ranks[i]];
+        ptr += idx * SIMD_COMPUTE * TEMP_WORLD + size_per_buffer_kernel * buffer_index_kernel2;
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::write_back>(
+            ptr + (temp_rank / 2) * SIMD_COMPUTE,
+            sum.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+    }
+}
+
+// Reduce step (nocopy path): sum the TEMP_WORLD/2 partial chunks that remote
+// ranks deposited at the start of this rank's temp slot and store the single
+// SIMD_COMPUTE result back in place. Nocopy counterpart of all_sum.
+// NOTE(review): in_buffer, size and threads_already_processed are unused here —
+// presumably kept for a uniform stage-helper signature; confirm.
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_all_sum(int idx,
+                    const void *in_buffer,
+                    uint32_t size,
+                    int threads_already_processed,
+                    void *temp_buffer[],
+                    uint32_t temp_rank,
+                    int size_per_buffer_kernel,
+                    int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    int read_offset = idx * SIMD_COMPUTE * TEMP_WORLD;
+    ptr +=
+        read_offset +
+        size_per_buffer_kernel *
+            buffer_index_kernel; //nocopy layout: remote partials sit at the start of the temp slot (no second-half offset as in the copy path).
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr + i * SIMD_COMPUTE);
+    }
+    //linear accumulation of the TEMP_WORLD/2 partial chunks into one vector
+    simd<data_type, SIMD_COMPUTE> sum = 0;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        sum = sum + buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i);
+    }
+    //store the result
+    lsc_block_store<data_type,
+                    SIMD_COMPUTE,
+                    lsc_data_size::default_size,
+                    cache_hint::uncached,
+                    cache_hint::write_back> //save the reduced vector back at the start of the temp slot (in place).
+        (ptr, sum);
+}
+
+// Allgather step (nocopy path): collect the per-rank sums from the temp slots
+// of the TEMP_WORLD/2 same-parity ranks and write them directly to both output
+// buffers of the GPU pair (this rank and temp_rank ^ 1), with tail handling for
+// the last partial chunk of `size`.
+// NOTE(review): the names are misleading — ptr_even is out_buffers[temp_rank ^ 1]
+// (the pair partner) and ptr_odd is out_buffers[temp_rank] (self), regardless of
+// actual parity; both receive identical data, so behavior is unaffected.
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_gather_from_remote_and_dist_to_rank_pair(int *even_ranks,
+                                                     int idx,
+                                                     void **out_buffers,
+                                                     uint32_t size,
+                                                     int threads_already_processed,
+                                                     void *temp_buffer[],
+                                                     uint32_t temp_rank,
+                                                     int size_per_buffer_kernel,
+                                                     int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int is_odd = (even_ranks[0] == 1);
+    //read the input data
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        //read the values
+        data_type *read_ptr_int = (data_type *)temp_buffer[even_ranks[i]];
+        read_ptr_int += size_per_buffer_kernel * buffer_index_kernel;
+        read_ptr_int += idx * SIMD_COMPUTE * TEMP_WORLD;
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(read_ptr_int);
+    }
+
+    //write the data to the pair of ranks within the same gpu
+    data_type *ptr_even = (data_type *)out_buffers[temp_rank ^ 1];
+    data_type *ptr_odd = (data_type *)out_buffers[temp_rank];
+    uint32_t write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD +
+                            is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    uint32_t i;
+    if (write_offset + SIMD_COMPUTE * TEMP_WORLD / 2 <= size) {
+#pragma unroll
+        for (i = 0; i < TEMP_WORLD / 2; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_even + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_odd + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+    }
+    else if (write_offset < size) {
+        //tail: store whole vectors to both outputs while they fit, then scalars
+        uint32_t vec_count = (size - write_offset) / SIMD_COMPUTE;
+        for (i = 0; i < vec_count; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_even + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_odd + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+        uint32_t count = size - write_offset - vec_count * SIMD_COMPUTE;
+        for (i = 0; i < count; i++) {
+            ptr_even[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+            ptr_odd[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+        }
+    }
+}
+
+// Software-pipeline stage 1 dispatcher: when FIRST_KERNEL is set for pipeline
+// slot ii, runs nocopy_sum_and_distribute_to_remote_ranks for each inner-loop
+// thread index, dispatching on the runtime temp_world (even values 2..16 only;
+// anything else is a silent no-op via `default`). Expands in a scope that must
+// provide idx, ii, sw_pipeline_kernel_state, innerloop_iter_count,
+// HW_THREAD_COUNT, total_threads_needed, temp_world and the kernel arguments.
+// No comments inside the macro body: `//` would swallow the `\` continuations.
+#define RUN_FIRST_KERNEL \
+    if (sw_pipeline_kernel_state[ii] & FIRST_KERNEL) { \
+        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) { \
+            int index = idx + inner_iter * HW_THREAD_COUNT; \
+            if ((uint32_t)index >= total_threads_needed) \
+                break; \
+\
+            switch (temp_world) { \
+                case 2: \
+                    nocopy_sum_and_distribute_to_remote_ranks<2, data_type>( \
+                        (int *)even_ranks, \
+                        myrank, \
+                        index, \
+                        (void **)in_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 4: \
+                    nocopy_sum_and_distribute_to_remote_ranks<4, data_type>( \
+                        (int *)even_ranks, \
+                        myrank, \
+                        index, \
+                        (void **)in_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 6: \
+                    nocopy_sum_and_distribute_to_remote_ranks<6, data_type>( \
+                        (int *)even_ranks, \
+                        myrank, \
+                        index, \
+                        (void **)in_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 8: \
+                    nocopy_sum_and_distribute_to_remote_ranks<8, data_type>( \
+                        (int *)even_ranks, \
+                        myrank, \
+                        index, \
+                        (void **)in_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 10: \
+                    nocopy_sum_and_distribute_to_remote_ranks<10, data_type>( \
+                        (int *)even_ranks, \
+                        myrank, \
+                        index, \
+                        (void **)in_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 12: \
+                    nocopy_sum_and_distribute_to_remote_ranks<12, data_type>( \
+                        (int *)even_ranks, \
+                        myrank, \
+                        index, \
+                        (void **)in_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 14: \
+                    nocopy_sum_and_distribute_to_remote_ranks<14, data_type>( \
+                        (int *)even_ranks, \
+                        myrank, \
+                        index, \
+                        (void **)in_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 16: \
+                    nocopy_sum_and_distribute_to_remote_ranks<16, data_type>( \
+                        (int *)even_ranks, \
+                        myrank, \
+                        index, \
+                        (void **)in_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                default: break; \
+            } \
+        } \
+    }
+
+// Software-pipeline stage 2 dispatcher: when SECOND_KERNEL is set for pipeline
+// slot ii, runs nocopy_all_sum for each inner-loop thread index, dispatching on
+// the runtime temp_world (even values 2..16 only; others are a silent no-op).
+// Same expansion-scope requirements as RUN_FIRST_KERNEL; no comments inside the
+// macro body because `//` would swallow the `\` continuations.
+#define RUN_SECOND_KERNEL \
+    if (sw_pipeline_kernel_state[ii] & SECOND_KERNEL) { \
+        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) { \
+            int index = idx + inner_iter * HW_THREAD_COUNT; \
+            if ((uint32_t)index >= total_threads_needed) \
+                break; \
+\
+            switch (temp_world) { \
+                case 2: \
+                    nocopy_all_sum<2, data_type>(index, \
+                                                 in_buffer, \
+                                                 size, \
+                                                 threads_already_processed[ii], \
+                                                 (void **)temp_buffer, \
+                                                 temp_rank, \
+                                                 size_per_buffer_kernel, \
+                                                 ii); \
+                    break; \
+                case 4: \
+                    nocopy_all_sum<4, data_type>(index, \
+                                                 in_buffer, \
+                                                 size, \
+                                                 threads_already_processed[ii], \
+                                                 (void **)temp_buffer, \
+                                                 temp_rank, \
+                                                 size_per_buffer_kernel, \
+                                                 ii); \
+                    break; \
+                case 6: \
+                    nocopy_all_sum<6, data_type>(index, \
+                                                 in_buffer, \
+                                                 size, \
+                                                 threads_already_processed[ii], \
+                                                 (void **)temp_buffer, \
+                                                 temp_rank, \
+                                                 size_per_buffer_kernel, \
+                                                 ii); \
+                    break; \
+                case 8: \
+                    nocopy_all_sum<8, data_type>(index, \
+                                                 in_buffer, \
+                                                 size, \
+                                                 threads_already_processed[ii], \
+                                                 (void **)temp_buffer, \
+                                                 temp_rank, \
+                                                 size_per_buffer_kernel, \
+                                                 ii); \
+                    break; \
+                case 10: \
+                    nocopy_all_sum<10, data_type>(index, \
+                                                  in_buffer, \
+                                                  size, \
+                                                  threads_already_processed[ii], \
+                                                  (void **)temp_buffer, \
+                                                  temp_rank, \
+                                                  size_per_buffer_kernel, \
+                                                  ii); \
+                    break; \
+                case 12: \
+                    nocopy_all_sum<12, data_type>(index, \
+                                                  in_buffer, \
+                                                  size, \
+                                                  threads_already_processed[ii], \
+                                                  (void **)temp_buffer, \
+                                                  temp_rank, \
+                                                  size_per_buffer_kernel, \
+                                                  ii); \
+                    break; \
+                case 14: \
+                    nocopy_all_sum<14, data_type>(index, \
+                                                  in_buffer, \
+                                                  size, \
+                                                  threads_already_processed[ii], \
+                                                  (void **)temp_buffer, \
+                                                  temp_rank, \
+                                                  size_per_buffer_kernel, \
+                                                  ii); \
+                    break; \
+                case 16: \
+                    nocopy_all_sum<16, data_type>(index, \
+                                                  in_buffer, \
+                                                  size, \
+                                                  threads_already_processed[ii], \
+                                                  (void **)temp_buffer, \
+                                                  temp_rank, \
+                                                  size_per_buffer_kernel, \
+                                                  ii); \
+                    break; \
+                default: break; \
+            } \
+        } \
+    }
+
+// Software-pipeline stage 3 dispatcher: when THIRD_KERNEL is set for pipeline
+// slot ii, runs nocopy_gather_from_remote_and_dist_to_rank_pair for each
+// inner-loop thread index, dispatching on the runtime temp_world (even values
+// 2..16 only; others are a silent no-op). Same expansion-scope requirements as
+// RUN_FIRST_KERNEL; no comments inside the macro body because `//` would
+// swallow the `\` continuations.
+#define RUN_THIRD_KERNEL \
+    if (sw_pipeline_kernel_state[ii] & THIRD_KERNEL) { \
+        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) { \
+            int index = idx + inner_iter * HW_THREAD_COUNT; \
+            if ((uint32_t)index >= total_threads_needed) \
+                break; \
+\
+            switch (temp_world) { \
+                case 2: \
+                    nocopy_gather_from_remote_and_dist_to_rank_pair<2, data_type>( \
+                        (int *)even_ranks, \
+                        index, \
+                        (void **)out_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 4: \
+                    nocopy_gather_from_remote_and_dist_to_rank_pair<4, data_type>( \
+                        (int *)even_ranks, \
+                        index, \
+                        (void **)out_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 6: \
+                    nocopy_gather_from_remote_and_dist_to_rank_pair<6, data_type>( \
+                        (int *)even_ranks, \
+                        index, \
+                        (void **)out_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 8: \
+                    nocopy_gather_from_remote_and_dist_to_rank_pair<8, data_type>( \
+                        (int *)even_ranks, \
+                        index, \
+                        (void **)out_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 10: \
+                    nocopy_gather_from_remote_and_dist_to_rank_pair<10, data_type>( \
+                        (int *)even_ranks, \
+                        index, \
+                        (void **)out_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 12: \
+                    nocopy_gather_from_remote_and_dist_to_rank_pair<12, data_type>( \
+                        (int *)even_ranks, \
+                        index, \
+                        (void **)out_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 14: \
+                    nocopy_gather_from_remote_and_dist_to_rank_pair<14, data_type>( \
+                        (int *)even_ranks, \
+                        index, \
+                        (void **)out_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                case 16: \
+                    nocopy_gather_from_remote_and_dist_to_rank_pair<16, data_type>( \
+                        (int *)even_ranks, \
+                        index, \
+                        (void **)out_buffers, \
+                        size, \
+                        threads_already_processed[ii], \
+                        (void **)temp_buffer, \
+                        temp_rank, \
+                        size_per_buffer_kernel, \
+                        ii); \
+                    break; \
+                default: break; \
+            } \
+        } \
+    }
+
+// Forward declarations of kernel-name tag types used to name the SYCL
+// parallel_for submissions: copy-path compute, nocopy-path compute, and the
+// global/local synchronization kernels of the large allreduce.
+template <typename dtype>
+class Kernel_compute;
+//template<typename dtype> class Kernel_rankSync;
+
+template <typename dtype>
+class NoCopyKernel_compute;
+//template<typename dtype> class NoCopyKernel_GlobalSync;
+
+template <typename dtype>
+class AllreduceLargeKernel_GlobalSync;
+template <typename dtype>
+class AllreduceLargeKernel_LocalSync;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK>
+class sycl_allreduce_large : public sycl_coll_base<data_type> {
+public:
+    // Default-construct the collective. The per-rank buffer size stays 0
+    // until init() selects the copy/nocopy buffer layout.
+    sycl_allreduce_large() : sycl_coll_base<data_type>() {
+        this->size_per_buffer = 0;
+    }
+
+    // One-time per-rank setup: computes element capacities and byte sizes for
+    // both the copy (staged through a temp buffer) and nocopy (zero-copy) paths,
+    // allocates a single device scratch buffer sized for whichever path needs
+    // more, zero-fills it, and exchanges its IPC handle with peer ranks so each
+    // rank can address the others' buffers directly.
+    //
+    // queue    - SYCL queue used for device allocation and the initial memset.
+    // comm     - CCL communicator; also the source of the "even" sub-communicator.
+    // stream   - CCL stream passed through to the IPC exchange.
+    // rank_in  - this process's rank; world_in - total number of ranks.
+    void init(sycl::queue &queue,
+              ccl_comm *comm,
+              ccl_stream *stream,
+              uint32_t rank_in,
+              uint32_t world_in) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+        rank = rank_in;
+        world = world_in;
+        // Element capacity of the copy-path staging buffer (bytes / element size).
+        COPY_MAX_COUNT = COPY_MAX_SIZE / sizeof(data_type);
+
+        // Each copy-path buffer carries SYNC_BYTE extra bytes for the sync flags.
+        int size_per_buffer_copy = COPY_MAX_SIZE + SYNC_BYTE;
+        int alloc_size_copy = size_per_buffer_copy * COPY_BUFFER_COUNT;
+
+        NOCOPY_MAX_COUNT = NOCOPY_MAX_SIZE / sizeof(data_type);
+
+        int size_per_buffer_nocopy = NOCOPY_MAX_SIZE + SYNC_BYTE;
+        int alloc_size_nocopy = size_per_buffer_nocopy * NOCOPY_BUFFER_COUNT;
+
+        // The env knob decides which path allreduce() will take; record the
+        // matching capacity/stride so both sides agree on buffer layout.
+        if (ccl::global_data::env().allreduce_use_tmp_buf) {
+            data_size_per_buffer = COPY_MAX_COUNT;
+            size_per_buffer = size_per_buffer_copy;
+        }
+        else {
+            data_size_per_buffer = NOCOPY_MAX_COUNT;
+            size_per_buffer = size_per_buffer_nocopy;
+        }
+
+        // Allocate the shared scratch buffer only once per process; it is sized
+        // for the larger of the two paths so either can be used later.
+        if (!allreduce_large_buffer) {
+            int alloc_size =
+                alloc_size_copy > alloc_size_nocopy ? alloc_size_copy : alloc_size_nocopy;
+            allreduce_large_buffer = sycl::malloc_device(alloc_size, queue);
+            // Zero the buffer (including sync bytes) and wait before publishing
+            // it to peers, so no rank observes uninitialized sync flags.
+            auto e = queue.memset(allreduce_large_buffer, 0, alloc_size);
+            e.wait();
+
+            // Exchange IPC handles: fills allreduce_large_buffers /
+            // allreduce_large_sync_buffer with peer-visible pointers.
+            this->exchange_peer_ipc_mem(queue,
+                                        comm,
+                                        stream,
+                                        allreduce_large_buffer,
+                                        NULL,
+                                        rank,
+                                        world,
+                                        data_size_per_buffer * sizeof(data_type),
+                                        (void **)allreduce_large_buffers,
+                                        (void **)allreduce_large_sync_buffer,
+                                        allreduce_large_offsets,
+                                        allreduce_large_ipc_handle,
+                                        NULL,
+                                        NULL /* mmap_buffers */,
+                                        false /* to_cache */);
+        }
+        this->initialized = true;
+
+        global_stream = stream;
+        global_comm = comm;
+        even_comm = global_comm->get_even_comm().get();
+    }
+
+    // Public entry point: dispatch to the tmp-buffer (copy) or the zero-copy
+    // implementation based on the same env knob consulted in init().
+    ccl::event allreduce(sycl::queue &queue,
+                         const void *in_buffer,
+                         void *out_buffer,
+                         uint32_t size) {
+        const bool use_tmp_buf = ccl::global_data::env().allreduce_use_tmp_buf;
+        return use_tmp_buf ? allreduce_copy(queue, in_buffer, out_buffer, size)
+                           : allreduce_nocopy(queue, in_buffer, out_buffer, size);
+    }
+
+private:
+    ccl::event allreduce_copy(sycl::queue &queue,
+                              const void *in_buffer,
+                              void *out_buffer,
+                              uint32_t size) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = allreduce_large_buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = allreduce_large_sync_buffer[i];
+        }
+
+        int even_ranks[max_rank];
+        int myrank;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank) {
+                myrank = i;
+            }
+        }
+
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        int buffer_index_kernel_for_sync = allreduce_large_buffer_index;
+        int outer_iter;
+        //todo:
+        //1. shuffle the kernel# executions so that resource utilization can be smoothed out. DONE
+        //2. increase the simd size so there are fewer innerloop iterations. This might be useful in reducing the load stalls since the number of load-consume pairs is smaller. DONE
+        //3. reduce gpu-cpu sync?? DONE
+        //5. prefetch in persistent threads? DONE
+        //uint32_t total_threads_needed_sync = 1;
+        int wg_size = 1;
+        int start, end;
+
+        int outerloop_iter_count;
+        int sync_reset_counter = 0;
+        int max_threads_per_MAX_COUNT =
+            (COPY_MAX_COUNT * 2 / 3) /
+            (SIMD_COMPUTE *
+             temp_world); // each thread uses (SIMD_COMPUTE * temp_world) * 3 / 2 space
+        int max_elements_per_MAX_COUNT = max_threads_per_MAX_COUNT * (SIMD_COMPUTE * temp_world);
+
+        outerloop_iter_count =
+            size /
+            max_elements_per_MAX_COUNT; //this is the outerloop count that requires full hw thread count. This doesn't include the outerloop iteration that only needs a partial thread count
+
+        //init the sw pipeline
+        int sw_pipeline_insert_index = 0;
+        int sw_pipeline_insert_counter = 0;
+        int sw_pipeline_kernel_state[COPY_KERNEL_NUM];
+        int threads_already_processed[COPY_KERNEL_NUM];
+        for (int i = 0; i < COPY_KERNEL_NUM; i++) {
+            threads_already_processed[i] = 0;
+            sw_pipeline_kernel_state[i] = NO_KERNEL;
+        }
+        //----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        // pipeline design
+        // ---------------
+        // During each outerloop iteration, one iteration (job) will be inserted to the SW pipeline.
+        // Since there are 5 kernels in the workload, up to 5 jobs can be in flight as shown in the picture below. Hence only 5 states are needed in the implementation.
+        // For each job in the SW pipeline, kernels 1 to 5 will be executed in order across 5 iterations. While one job is being executed, more jobs can be added to the SW pipeline.
+        // This means that at a particular time, all five kernels may be executed together by a single rank, so the HW resource utilization is more balanced, hence the improvement.
+        // Additionally, by using the SW pipelining, the required number of syncs is reduced. The syncs in the same column in the picture below can be done by one sync execution.
+        //
+        //                          time0   time1   time2   time3   time4   time5   time6   time7   time8   time9   time10  time11  time12  time13  time14  time15  time16  time17  time18
+        //                          ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------
+        // outerloop iteration0:    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration1:                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration2:                                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration3:                                                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration4:                                                                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration5:                                                                                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // ...
+        //----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+        //The following outer-outer loop is handling the case where there are multiple outerloop iterations and the last iteration requires partial usage of the temp buffer with size (MAX_COUNT / 2).
+        //As more space is used inside the temp buffer, higher innerloop count is required. Separating the processing into two iterations (one for full usage and another for partial temp buffer usage), the number of innerloop iteration within each iteration is uniform.
+        for (int iter = 0; iter < 2; iter++) {
+            uint32_t total_threads_needed;
+            if (iter == 1) //if second iteration, then handle the partial usage of the temp buffer
+            {
+                //if there is little more left to compute, then finish them
+                if (outerloop_iter_count * max_elements_per_MAX_COUNT < (int)size) {
+                    start = outerloop_iter_count;
+                    end = start + 1;
+                    total_threads_needed = (size - start * max_elements_per_MAX_COUNT +
+                                            SIMD_COMPUTE * temp_world - 1) /
+                                           (SIMD_COMPUTE * temp_world);
+                }
+                else {
+                    break;
+                }
+            }
+            else {
+                start = 0;
+                end = outerloop_iter_count;
+                total_threads_needed = max_threads_per_MAX_COUNT;
+
+                if (end == 0) {
+                    //there is nothing to do when end is 0 so check the next iter.
+                    continue;
+                }
+            }
+            int innerloop_iter_count =
+                (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+
+            //There are total of two SW pipeline sessions, for iter={0, 1}
+            //SW pipeline is applied on outerloop processing.
+            //Since the SW pipeline is implemented, there will be a tail at the end of the execution. The size of the tail is (KERNEL_NUM - 1) and the tail is completed in the following loop.
+            for (outer_iter = start; outer_iter < end + COPY_KERNEL_NUM - 1; outer_iter++) {
+                //if more outer_iter remaining since there is more new processing to do, then insert them to the SW pipeline.
+                //During the sw pipeline tail, there is nothing to dispatch.
+                if (outer_iter < end) {
+                    sw_pipeline_kernel_state[sw_pipeline_insert_index] = FIRST_KERNEL;
+                    threads_already_processed[sw_pipeline_insert_index] =
+                        sw_pipeline_insert_counter;
+                    sw_pipeline_insert_index++;
+                    if (sw_pipeline_insert_index >= COPY_KERNEL_NUM) {
+                        //By the time the index wraps around, the kernel that was in this slot previously has already completed.
+                        sw_pipeline_insert_index = 0;
+                    }
+                    sw_pipeline_insert_counter += total_threads_needed;
+                }
+
+                //The first kernel does the actual computation while the second kernel does the sync across ranks.
+                e = queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class Kernel_compute<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        //check if there is any kernel in the SW pipelines. If yes, execute them.
+                        //to optimize, the order of loop i=0,1,2,.. can be shuffled so that different ranks can do different kernels at particular time. The purpose is to better balance the HW resource usage in the PVC node.
+                        for (int ii = 0; ii < COPY_KERNEL_NUM; ii++) {
+                            if (sw_pipeline_kernel_state[ii] & FIRST_KERNEL) {
+                                for (int inner_iter = 0; inner_iter < innerloop_iter_count;
+                                     inner_iter++) {
+                                    int index = idx + inner_iter * HW_THREAD_COUNT;
+                                    if ((uint32_t)index >= total_threads_needed)
+                                        break;
+
+                                    switch (temp_world) {
+                                        case 2:
+                                            load_input_to_temp_buffer<2, data_type>(
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 4:
+                                            load_input_to_temp_buffer<4, data_type>(
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 6:
+                                            load_input_to_temp_buffer<6, data_type>(
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 8:
+                                            load_input_to_temp_buffer<8, data_type>(
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 10:
+                                            load_input_to_temp_buffer<10, data_type>(
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 12:
+                                            load_input_to_temp_buffer<12, data_type>(
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 14:
+                                            load_input_to_temp_buffer<14, data_type>(
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 16:
+                                            load_input_to_temp_buffer<16, data_type>(
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        default: break;
+                                    }
+                                }
+                            }
+                            if (sw_pipeline_kernel_state[ii] & SECOND_KERNEL) {
+                                for (int inner_iter = 0; inner_iter < innerloop_iter_count;
+                                     inner_iter++) {
+                                    int index = idx + inner_iter * HW_THREAD_COUNT;
+                                    if ((uint32_t)index >= total_threads_needed)
+                                        break;
+
+                                    switch (temp_world) {
+                                        case 2:
+                                            local_sum_and_distribute_to_remote_ranks<2, data_type>(
+                                                (int *)even_ranks,
+                                                myrank,
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 4:
+                                            local_sum_and_distribute_to_remote_ranks<4, data_type>(
+                                                (int *)even_ranks,
+                                                myrank,
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 6:
+                                            local_sum_and_distribute_to_remote_ranks<6, data_type>(
+                                                (int *)even_ranks,
+                                                myrank,
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 8:
+                                            local_sum_and_distribute_to_remote_ranks<8, data_type>(
+                                                (int *)even_ranks,
+                                                myrank,
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 10:
+                                            local_sum_and_distribute_to_remote_ranks<10, data_type>(
+                                                (int *)even_ranks,
+                                                myrank,
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 12:
+                                            local_sum_and_distribute_to_remote_ranks<12, data_type>(
+                                                (int *)even_ranks,
+                                                myrank,
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 14:
+                                            local_sum_and_distribute_to_remote_ranks<14, data_type>(
+                                                (int *)even_ranks,
+                                                myrank,
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 16:
+                                            local_sum_and_distribute_to_remote_ranks<16, data_type>(
+                                                (int *)even_ranks,
+                                                myrank,
+                                                index,
+                                                in_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        default: break;
+                                    }
+                                }
+                            }
+                            if (sw_pipeline_kernel_state[ii] & THIRD_KERNEL) {
+                                for (int inner_iter = 0; inner_iter < innerloop_iter_count;
+                                     inner_iter++) {
+                                    int index = idx + inner_iter * HW_THREAD_COUNT;
+                                    if ((uint32_t)index >= total_threads_needed)
+                                        break;
+
+                                    switch (temp_world) {
+                                        case 2:
+                                            all_sum<2, data_type>(index,
+                                                                  in_buffer,
+                                                                  size,
+                                                                  threads_already_processed[ii],
+                                                                  (void **)temp_buffer,
+                                                                  temp_rank,
+                                                                  size_per_buffer_kernel,
+                                                                  ii);
+                                            break;
+                                        case 4:
+                                            all_sum<4, data_type>(index,
+                                                                  in_buffer,
+                                                                  size,
+                                                                  threads_already_processed[ii],
+                                                                  (void **)temp_buffer,
+                                                                  temp_rank,
+                                                                  size_per_buffer_kernel,
+                                                                  ii);
+                                            break;
+                                        case 6:
+                                            all_sum<6, data_type>(index,
+                                                                  in_buffer,
+                                                                  size,
+                                                                  threads_already_processed[ii],
+                                                                  (void **)temp_buffer,
+                                                                  temp_rank,
+                                                                  size_per_buffer_kernel,
+                                                                  ii);
+                                            break;
+                                        case 8:
+                                            all_sum<8, data_type>(index,
+                                                                  in_buffer,
+                                                                  size,
+                                                                  threads_already_processed[ii],
+                                                                  (void **)temp_buffer,
+                                                                  temp_rank,
+                                                                  size_per_buffer_kernel,
+                                                                  ii);
+                                            break;
+                                        case 10:
+                                            all_sum<10, data_type>(index,
+                                                                   in_buffer,
+                                                                   size,
+                                                                   threads_already_processed[ii],
+                                                                   (void **)temp_buffer,
+                                                                   temp_rank,
+                                                                   size_per_buffer_kernel,
+                                                                   ii);
+                                            break;
+                                        case 12:
+                                            all_sum<12, data_type>(index,
+                                                                   in_buffer,
+                                                                   size,
+                                                                   threads_already_processed[ii],
+                                                                   (void **)temp_buffer,
+                                                                   temp_rank,
+                                                                   size_per_buffer_kernel,
+                                                                   ii);
+                                            break;
+                                        case 14:
+                                            all_sum<14, data_type>(index,
+                                                                   in_buffer,
+                                                                   size,
+                                                                   threads_already_processed[ii],
+                                                                   (void **)temp_buffer,
+                                                                   temp_rank,
+                                                                   size_per_buffer_kernel,
+                                                                   ii);
+                                            break;
+                                        case 16:
+                                            all_sum<16, data_type>(index,
+                                                                   in_buffer,
+                                                                   size,
+                                                                   threads_already_processed[ii],
+                                                                   (void **)temp_buffer,
+                                                                   temp_rank,
+                                                                   size_per_buffer_kernel,
+                                                                   ii);
+                                            break;
+                                        default: break;
+                                    }
+                                }
+                            }
+                            if (sw_pipeline_kernel_state[ii] & FOURTH_KERNEL) {
+                                for (int inner_iter = 0; inner_iter < innerloop_iter_count;
+                                     inner_iter++) {
+                                    int index = idx + inner_iter * HW_THREAD_COUNT;
+                                    if ((uint32_t)index >= total_threads_needed)
+                                        break;
+
+                                    switch (temp_world) {
+                                        case 2:
+                                            gather_from_remote_and_dist_to_rank_pair<2, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 4:
+                                            gather_from_remote_and_dist_to_rank_pair<4, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 6:
+                                            gather_from_remote_and_dist_to_rank_pair<6, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 8:
+                                            gather_from_remote_and_dist_to_rank_pair<8, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 10:
+                                            gather_from_remote_and_dist_to_rank_pair<10, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 12:
+                                            gather_from_remote_and_dist_to_rank_pair<12, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 14:
+                                            gather_from_remote_and_dist_to_rank_pair<14, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 16:
+                                            gather_from_remote_and_dist_to_rank_pair<16, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        default: break;
+                                    }
+                                }
+                            }
+                            if (sw_pipeline_kernel_state[ii] & FIFTH_KERNEL) {
+                                for (int inner_iter = 0; inner_iter < innerloop_iter_count;
+                                     inner_iter++) {
+                                    int index = idx + inner_iter * HW_THREAD_COUNT;
+                                    if ((uint32_t)index >= total_threads_needed)
+                                        break;
+
+                                    switch (temp_world) {
+                                        case 2:
+                                            write_output<2, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 4:
+                                            write_output<4, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 6:
+                                            write_output<6, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 8:
+                                            write_output<8, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 10:
+                                            write_output<10, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 12:
+                                            write_output<12, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 14:
+                                            write_output<14, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        case 16:
+                                            write_output<16, data_type>(
+                                                (int *)even_ranks,
+                                                index,
+                                                out_buffer,
+                                                size,
+                                                threads_already_processed[ii],
+                                                (void **)temp_buffer,
+                                                temp_rank,
+                                                size_per_buffer_kernel,
+                                                ii);
+                                            break;
+                                        default: break;
+                                    }
+                                }
+                            }
+                        }
+
+                        });//parallel_for
+                }); //submit()
+
+                sync_reset_counter++;
+
+                //sync all the ranks within the single GPU.
+                e = global_sync(queue,
+                                temp_rank,
+                                temp_world,
+                                size_per_buffer_for_sync_kernel * buffer_index_kernel_for_sync,
+                                4,
+                                1);
+
+                //update the sw pipeline process state so that next kernel will be processed in next round
+                for (int i = 0; i < COPY_KERNEL_NUM; i++) {
+                    if (sw_pipeline_kernel_state[i] & COPY_LAST_KERNEL)
+                        sw_pipeline_kernel_state[i] =
+                            0; //remove the kernel from the sw pipeline if it is fifth kernel. Everything is already executed.
+                    else
+                        sw_pipeline_kernel_state[i] <<= 1;
+                }
+
+                buffer_index_kernel_for_sync++;
+                buffer_index_kernel_for_sync %= COPY_KERNEL_NUM;
+            } //for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++)
+        } //for (int iter = 0; iter < 2; iter++)
+
+        allreduce_large_buffer_index += sync_reset_counter;
+        allreduce_large_buffer_index %= COPY_KERNEL_NUM;
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // Allreduce over peer buffers without staging copies ("nocopy" path):
+    // IPC handles for the caller's in/out buffers are re-exchanged on every
+    // call, and a software-pipelined sequence of ESIMD kernels
+    // (RUN_FIRST/SECOND/THIRD_KERNEL macros) reduces directly from the
+    // peers' buffers.
+    //
+    // queue      - SYCL queue all kernels and sync kernels are submitted to
+    // in_buffer  - local input buffer holding `size` elements of data_type
+    // out_buffer - local output buffer that receives the reduced result
+    // size       - element count (data_type elements, not bytes)
+    // Returns a ccl::event wrapping the last submitted sycl::event.
+    ccl::event allreduce_nocopy(sycl::queue &queue,
+                                const void *in_buffer,
+                                void *out_buffer,
+                                uint32_t size) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        // Copy rank/world into locals so device lambdas capture plain values
+        // rather than `this`.
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = allreduce_large_buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = allreduce_large_sync_buffer[i];
+        }
+
+        // Map this rank to its position within even_comm.
+        // NOTE(review): temp_buffer, temp_sync_buffer, even_ranks and myrank
+        // are not referenced directly in this function body; presumably they
+        // are consumed by the RUN_*_KERNEL macros expanded below — confirm.
+        // NOTE(review): myrank is assigned only when temp_rank appears in
+        // even_ranks; otherwise it stays uninitialized — verify callers
+        // guarantee membership.
+        int even_ranks[max_rank];
+        int myrank;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank)
+                myrank = i;
+        }
+
+        // Re-exchange IPC handles for the user buffers on every call so that
+        // peers can access them directly.
+        void *in_buffers[max_rank];
+        void *out_buffers[max_rank];
+        this->exchange_peer_ipc_mem(queue,
+                                    global_comm,
+                                    global_stream,
+                                    (void **)in_buffer,
+                                    out_buffer,
+                                    rank,
+                                    world,
+                                    0,
+                                    (void **)in_buffers,
+                                    NULL,
+                                    NULL,
+                                    NULL,
+                                    (void **)out_buffers);
+
+        // Buffer sizes converted from bytes to element / int counts for use
+        // as kernel-side offsets.
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        int buffer_index_kernel_for_sync = allreduce_large_buffer_index;
+        int outer_iter;
+        //todo:
+        //1. shuffle the kernel# executions so that resource utilization can be smoothed out. DONE
+        //2. increase the simd size so there are fewer innerloop iterations. This might be useful in reducing the load stalls since the number of load-consume pairs is smaller. DONE
+        //3. reduce gpu-cpu sync?? DONE
+        //5. prefetch in persistent threads? DONE
+        int wg_size = 1;
+        int start, end;
+
+        int outerloop_iter_count;
+        int sync_reset_counter = 0;
+        // Each HW thread processes SIMD_COMPUTE * temp_world elements; a full
+        // chunk covers NOCOPY_MAX_COUNT elements.
+        int max_threads_per_MAX_COUNT = (NOCOPY_MAX_COUNT) / (SIMD_COMPUTE * temp_world);
+        int max_elements_per_MAX_COUNT = max_threads_per_MAX_COUNT * (SIMD_COMPUTE * temp_world);
+
+        outerloop_iter_count =
+            size /
+            max_elements_per_MAX_COUNT; //this is the outerloop count that requires the full hw thread count. This doesn't include the outerloop iteration that only needs a partial thread count
+
+        //init the sw pipeline
+        int sw_pipeline_insert_index = 0;
+        int sw_pipeline_insert_counter = 0;
+        int sw_pipeline_kernel_state[NOCOPY_KERNEL_NUM];
+        int threads_already_processed[NOCOPY_KERNEL_NUM];
+        for (int i = 0; i < NOCOPY_KERNEL_NUM; i++) {
+            threads_already_processed[i] = 0;
+            sw_pipeline_kernel_state[i] = NO_KERNEL;
+        }
+        //----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+        // pipeline design
+        // ---------------
+        // During each outerloop iteration, one iteration (job) will be inserted to the SW pipeline.
+        // Since there are 5 kernels in the workload, up to 5 jobs can be in flight as shown in the picture below. Hence only 5 states are needed in the implementation.
+        // For each job in the SW pipeline, kernels from 1 to 5 will be executed in 5 iterations in the order. While it is being executed in 5 iterations, more jobs can be added to the SW pipeline.
+        // Which means that at particular time, all five kernels will be executed together by a single rank. That means the HW resource utilization might be more balanced hence the improvements.
+        // Additionally, by using the SW pipelining, the required number of syncs are reduced. The syncs in the same column in the picture below can be done by one sync execution.
+        // NOTE(review): the diagram describes the 5-kernel variant; this
+        // nocopy path dispatches NOCOPY_KERNEL_NUM stages via the
+        // RUN_FIRST/SECOND/THIRD_KERNEL macros — confirm the counts agree.
+        //
+        //                          time0   time1   time2   time3   time4   time5   time6   time7   time8   time9   time10  time11  time12  time13  time14  time15  time16  time17  time18
+        //                          ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------
+        // outerloop iteration0:    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration1:                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration2:                                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration3:                                                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration4:                                                                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // outerloop iteration5:                                                                                    Kernel1 sync    kernel2 sync    kernel3 sync    kernel4 sync    kernel5
+        // ...
+        //----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+        // we need to sync between two tiles of the same GPU to make sure all data are ready
+        e = local_sync(queue,
+                       temp_rank,
+                       temp_world,
+                       size_per_buffer_for_sync_kernel * buffer_index_kernel_for_sync,
+                       2,
+                       0);
+
+        //The following outer-outer loop is handling the case where there are multiple outerloop iterations and the last iteration requires partial usage of the temp buffer with size (MAX_COUNT / 2).
+        //As more space is used inside the temp buffer, higher innerloop count is required. Separating the processing into two iterations (one for full usage and another for partial temp buffer usage), the number of innerloop iterations within each iteration is uniform.
+        for (int iter = 0; iter < 2; iter++) {
+            uint32_t total_threads_needed;
+            if (iter == 1) //if second iteration, then handle the partial usage of the temp buffer
+            {
+                //if there is a little more left to compute, then finish it
+                if (outerloop_iter_count * max_elements_per_MAX_COUNT < (int)size) {
+                    start = outerloop_iter_count;
+                    end = start + 1;
+                    // Round the leftover element count up to whole threads.
+                    total_threads_needed = (size - start * max_elements_per_MAX_COUNT +
+                                            SIMD_COMPUTE * temp_world - 1) /
+                                           (SIMD_COMPUTE * temp_world);
+                }
+                else {
+                    break;
+                }
+            }
+            else {
+                start = 0;
+                end = outerloop_iter_count;
+                total_threads_needed = max_threads_per_MAX_COUNT;
+
+                if (end == 0) {
+                    //there is nothing to do when end is 0 so check the next iter.
+                    continue;
+                }
+            }
+            // Persistent-thread launch: at most HW_THREAD_COUNT threads, each
+            // covering innerloop_iter_count logical indices.
+            int innerloop_iter_count =
+                (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+
+            //There are a total of two SW pipeline sessions, for iter={0, 1}
+            //SW pipeline is applied on outerloop processing.
+            //Since sw pipeline is implemented, there will be a tail at the end of the execution. The size of the tail is (KERNEL_NUM - 1) and the tail is completed in the following loop.
+            for (outer_iter = start; outer_iter < end + NOCOPY_KERNEL_NUM - 1; outer_iter++) {
+                //if more outer_iter remaining since there is more new processing to do, then insert them to the SW pipeline.
+                //During the sw pipeline tail, there is nothing to dispatch.
+                if (outer_iter < end) {
+                    sw_pipeline_kernel_state[sw_pipeline_insert_index] = FIRST_KERNEL;
+                    threads_already_processed[sw_pipeline_insert_index] =
+                        sw_pipeline_insert_counter;
+                    sw_pipeline_insert_index++;
+                    if (sw_pipeline_insert_index >= NOCOPY_KERNEL_NUM) {
+                        sw_pipeline_insert_index =
+                            0; //By the time the index wraps around, the kernel that was in this slot previously has already completed.
+                    }
+                    sw_pipeline_insert_counter += total_threads_needed;
+                }
+
+                //The first kernel does the actual computation while the second kernel does the sync across ranks.
+                e = queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class NoCopyKernel_compute<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        //check if there is any kernel in the SW pipelines. If yes, execute them.
+                        //to optimize, the order of loop i=0,1,2,.. can be shuffled so that different ranks can do different kernels at particular time. The purpose is to better balance the HW resource usage in the PVC node.
+                        for (int ii = 0; ii < NOCOPY_KERNEL_NUM; ii++) {
+                            RUN_FIRST_KERNEL
+                            RUN_SECOND_KERNEL
+                            RUN_THIRD_KERNEL
+                        } // end of for (int ii
+
+                        });//parallel_for
+                }); //submit()
+
+                //sync all the ranks within the single GPU.
+                e = global_sync(queue,
+                                temp_rank,
+                                temp_world,
+                                size_per_buffer_for_sync_kernel * buffer_index_kernel_for_sync,
+                                5,
+                                1);
+
+                sync_reset_counter++;
+
+                //update the sw pipeline process state so that next kernel will be processed in next round
+                for (int i = 0; i < NOCOPY_KERNEL_NUM; i++) {
+                    if (sw_pipeline_kernel_state[i] & NOCOPY_LAST_KERNEL)
+                        sw_pipeline_kernel_state[i] =
+                            0; //remove the kernel from the sw pipeline if it is last kernel. Everything is already executed.
+                    else
+                        sw_pipeline_kernel_state[i] <<= 1;
+                }
+
+                buffer_index_kernel_for_sync++;
+                buffer_index_kernel_for_sync %= NOCOPY_BUFFER_COUNT;
+            } //for (outer_iter = start; outer_iter < end + NOCOPY_KERNEL_NUM - 1; outer_iter++)
+        } //for (int iter = 0; iter < 2; iter++)
+
+        // Advance the persistent buffer index by the number of sync rounds
+        // consumed so the next call starts on a fresh sync slot.
+        allreduce_large_buffer_index += sync_reset_counter;
+        allreduce_large_buffer_index %= NOCOPY_BUFFER_COUNT;
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // Global barrier across all ranks, implemented with atomic counters that
+    // live in each rank's sync buffer. A single-thread ESIMD kernel
+    // increments counter slot `index` in every rank's buffer, then spins on
+    // its own buffer's slot until all `temp_world` increments have landed.
+    //
+    // queue      - SYCL queue the one-thread sync kernel is submitted to
+    // temp_rank  - this rank's id (indexes into the sync buffers)
+    // temp_world - number of ranks participating in the barrier
+    // offset     - int offset into each sync buffer where the counters live
+    // index      - which lane of the SIMD_SYNC counter group to use
+    // reset      - nonzero: zero this rank's counters after the barrier so
+    //              the slot can be reused by a later call
+    // Returns the sycl::event of the submitted kernel.
+    sycl::event global_sync(sycl::queue queue,
+                            int temp_rank,
+                            uint32_t temp_world,
+                            int offset,
+                            int index,
+                            int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        // Snapshot peer sync-buffer pointers so the device lambda captures
+        // plain pointers rather than `this`.
+        void *temp_sync_buffer[max_rank];
+        for (uint32_t i = 0; i < temp_world; i++) {
+            temp_sync_buffer[i] = allreduce_large_sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1; // barrier needs one thread only
+        int wg_size = 1;
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class AllreduceLargeKernel_GlobalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                //ESIMD kernel
+                // Byte offsets of the SIMD_SYNC int counters within the slot.
+                simd<ushort, SIMD_SYNC> ramp;
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //since other ranks might still be doing local_sum, we need to sync ranks here.
+                //After the sync is done, the second half of the temp buffer will be replaced with new sum val.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true; // only touch the requested counter lane
+
+                //sync: increment this barrier's counter in every rank's sync buffer.
+                for (uint32_t i = 0; i < temp_world; i++) {
+                    int *sync_ptr = (int *)temp_sync_buffer[i] + offset;
+                    ////never true. Used to force dependency with prev kernel
+                    //if (total_threads_needed_sync == 0x7fffffff)
+                    //    sync_ptr = temp_buffer[0];
+                    lsc_atomic_update<atomic_op::inc,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(sync_ptr, ramp, pred);
+                }
+
+                //wait for all the local TG to sync. Then sync the other remote GPUs
+                // Spin on our own counter until every rank's increment arrived.
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                while (status0[index] != temp_world) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    // note: stores 0 to all SIMD_SYNC lanes of this rank's
+                    // slot (pred widened to all-true), not just `index`.
+                    status0 = 0;
+                    pred = true;
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    // sync tiles in a GPU
+    sycl::event local_sync(sycl::queue queue,
+                           int temp_rank,
+                           uint32_t temp_world,
+                           int offset,
+                           int index,
+                           int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = allreduce_large_sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1;
+        int wg_size = 1;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class AllreduceLargeKernel_LocalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                //ESIMD kernel
+                simd<ushort, SIMD_SYNC> ramp;
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //sync only the rank pair within the same gpu.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true;
+
+                //signal arrival: bump the atomic counter in the pair rank's sync slot, then in our own.
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank ^ 1] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+                sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+
+                //spin until both tiles of this GPU have signaled (local pair sync only; no remote GPUs involved)
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                while (status0[index] < RANKS_PER_GPU) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true;
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    void release(sycl::queue &queue) {
+        // Clean up, close/put ipc handles, free memory, etc.
+        auto l0_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(queue.get_context());
+        for (int i = 0; i < world; i++) {
+            if (i != rank) {
+                ZE_CALL(zeMemCloseIpcHandle,
+                        (l0_ctx, (char *)allreduce_large_buffers[i] - allreduce_large_offsets[i]));
+            }
+        }
+
+        sycl::free(allreduce_large_buffers[rank], queue);
+        this->initialized = false;
+    }
+
+private:
+    int rank{ ccl::utils::invalid_rank }, world{ ccl::utils::invalid_err_code };
+    int COPY_MAX_COUNT{ ccl::utils::initial_count_value };
+    int NOCOPY_MAX_COUNT{ ccl::utils::initial_count_value };
+    int size_per_buffer{ ccl::utils::invalid_bytes_value };
+    int data_size_per_buffer{ ccl::utils::invalid_bytes_value };
+    ccl_stream *global_stream{};
+    ccl_comm *global_comm{};
+    ccl_comm *even_comm{};
+};
+
+#define ALLREDUCE_LARGE_API(TYPE) \
+    void init_allreduce_large_##TYPE(ccl::datatype dtype, \
+                                     sycl::queue &queue, \
+                                     ccl_comm *comm, \
+                                     ccl_stream *stream, \
+                                     uint32_t rank_in, \
+                                     uint32_t world_in) { \
+        if (!ar_large_##TYPE.inited()) { \
+            LOG_INFO("invoking large allreduce first time for datatype: ", dtype); \
+            ar_large_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+    } \
+\
+    ccl::event run_allreduce_large_##TYPE( \
+        ccl::datatype dtype, sycl::queue queue, const void *in_buf, void *out_buf, size_t count) { \
+        return ar_large_##TYPE.allreduce(queue, in_buf, out_buf, count); \
+    }
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_bf16.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_bf16.cpp
new file mode 100644
index 000000000..0bdb9497a
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_bf16.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_large_sycl.hpp"
+
+sycl_allreduce_large<sycl::_V1::ext::oneapi::bfloat16> ar_large_bf16;
+
+ALLREDUCE_LARGE_API(bf16);
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_fp16.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_fp16.cpp
new file mode 100644
index 000000000..6b25721c9
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_fp16.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_large_sycl.hpp"
+
+sycl_allreduce_large<sycl::half> ar_large_fp16;
+
+ALLREDUCE_LARGE_API(fp16);
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_fp32.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_fp32.cpp
new file mode 100644
index 000000000..d3b26f15d
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_fp32.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_large_sycl.hpp"
+
+sycl_allreduce_large<float> ar_large_fp32;
+
+ALLREDUCE_LARGE_API(fp32);
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_int32.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_int32.cpp
new file mode 100644
index 000000000..c05ab172d
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_large_sycl_int32.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_large_sycl.hpp"
+
+sycl_allreduce_large<int> ar_large_int32;
+
+ALLREDUCE_LARGE_API(int32);
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl.cpp
new file mode 100644
index 000000000..97977568e
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl.cpp
@@ -0,0 +1,79 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_medium_sycl.hpp"
+
+#define MAX_RANK 16
+
+void *allreduce_medium_buffer = NULL;
+void *allreduce_medium_buffers[MAX_RANK];
+void *allreduce_medium_sync_buffer[MAX_RANK];
+size_t allreduce_medium_offsets[MAX_RANK];
+ze_ipc_mem_handle_t allreduce_medium_ipc_handle[MAX_RANK];
+int allreduce_medium_buffer_index = 0;
+
+#define ALLREDUCE_MEDIUM_API_DECL(TYPE) \
+    void init_allreduce_medium_##TYPE(ccl::datatype dtype, \
+                                      sycl::queue &queue, \
+                                      ccl_comm *comm, \
+                                      ccl_stream *stream, \
+                                      uint32_t rank_in, \
+                                      uint32_t world_in); \
+    ccl::event run_allreduce_medium_##TYPE( \
+        ccl::datatype dtype, sycl::queue queue, const void *in_buf, void *out_buf, size_t count)
+
+ALLREDUCE_MEDIUM_API_DECL(fp16);
+ALLREDUCE_MEDIUM_API_DECL(bf16);
+ALLREDUCE_MEDIUM_API_DECL(fp32);
+ALLREDUCE_MEDIUM_API_DECL(int32);
+
+#define SWITCH_INIT_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        init_allreduce_medium_##TYPE(dtype, queue, comm, stream, rank_in, world_in); \
+        break;
+
+void init_allreduce_medium(ccl::datatype dtype,
+                           sycl::queue &queue,
+                           ccl_comm *comm,
+                           ccl_stream *stream,
+                           uint32_t rank_in,
+                           uint32_t world_in) {
+    switch (dtype) {
+        SWITCH_INIT_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_INIT_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_INIT_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_INIT_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for allreduce"); assert(0);
+    }
+}
+
+#define SWITCH_RUN_TYPE(TYPE, ccl_type) \
+    case ccl_type: e = run_allreduce_medium_##TYPE(dtype, queue, in_buf, out_buf, count); break;
+
+ccl::event run_allreduce_medium(ccl::datatype dtype,
+                                sycl::queue queue,
+                                const void *in_buf,
+                                void *out_buf,
+                                size_t count) {
+    ccl::event e;
+    switch (dtype) {
+        SWITCH_RUN_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_RUN_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_RUN_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_RUN_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for allreduce"); assert(0);
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl.hpp b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl.hpp
new file mode 100644
index 000000000..10c048d56
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl.hpp
@@ -0,0 +1,2260 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+#define MAX_RANK         16
+#define INIT_SIZE        64
+#define INIT_COUNT       1
+#define SIMD_INIT        (INIT_SIZE * INIT_COUNT)
+#define SIMD_COMPUTE_MAX 256
+#define SIMD_COMPUTE     (SIMD_COMPUTE_MAX / sizeof(data_type))
+#define SIMD_SYNC        32
+#define BUFFER_COUNT     2
+#define SYNC_BYTE        (SIMD_SYNC * sizeof(int) * 2)
+#define ALIGNMENT_BYTE   256
+#define MAX_SIZE         (128 * 1024 * 1024)
+//#define EU_COUNT_PER_RANK 448
+#define EU_COUNT_PER_RANK   512
+#define THREAD_COUNT_PER_EU 8
+#define HW_THREAD_COUNT     (EU_COUNT_PER_RANK * THREAD_COUNT_PER_EU)
+#define KERNEL_NUM          11
+#define RANKS_PER_GPU       2
+
+extern void *allreduce_medium_buffer;
+extern void *allreduce_medium_buffers[MAX_RANK];
+extern void *allreduce_medium_sync_buffer[MAX_RANK];
+extern size_t allreduce_medium_offsets[MAX_RANK];
+extern ze_ipc_mem_handle_t allreduce_medium_ipc_handle[MAX_RANK];
+extern int allreduce_medium_buffer_index;
+
+// kernels for use_tmp_buf == 1
+template <uint32_t TEMP_WORLD, typename data_type>
+void load_input_to_temp_buffer(int idx,
+                               const void *in_buffer,
+                               uint32_t size,
+                               int threads_already_processed,
+                               void *temp_buffer[],
+                               uint32_t temp_rank,
+                               int outer_iter,
+                               int size_per_buffer_kernel,
+                               int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    uint32_t read_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD;
+    simd<data_type, SIMD_COMPUTE *TEMP_WORLD> buffer = 0;
+
+    if (read_offset + SIMD_COMPUTE * TEMP_WORLD > size) {
+        int count = (size - read_offset + SIMD_COMPUTE - 1) / SIMD_COMPUTE;
+        for (int i = 0; i < count; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)in_buffer + read_offset +
+                                                     i * SIMD_COMPUTE);
+        }
+    }
+    else {
+#pragma unroll
+        for (uint32_t i = 0; i < TEMP_WORLD; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)in_buffer + read_offset +
+                                                     i * SIMD_COMPUTE);
+        }
+    }
+
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    ptr += size_per_buffer_kernel * buffer_index_kernel;
+    ptr += idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD; i++) {
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::write_back>(
+            ptr + i * SIMD_COMPUTE, buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+    }
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void local_sum_and_distribute_to_remote_ranks(int *even_ranks,
+                                              int myrank,
+                                              int idx,
+                                              const void *in_buffer,
+                                              uint32_t size,
+                                              int threads_already_processed,
+                                              void *temp_buffer[],
+                                              uint32_t temp_rank,
+                                              int size_per_buffer_kernel,
+                                              int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int is_odd = (even_ranks[0] == 1);
+    //read the input data
+    data_type *ptr_even =
+        (data_type *)temp_buffer[temp_rank & 0xfffffffe] + is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    data_type *ptr_odd =
+        (data_type *)temp_buffer[temp_rank | 1] + is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    ptr_even +=
+        idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 + size_per_buffer_kernel * buffer_index_kernel;
+    ptr_odd +=
+        idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 + size_per_buffer_kernel * buffer_index_kernel;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD> buffer;
+    uint32_t i;
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr_even + i * SIMD_COMPUTE);
+    }
+#pragma unroll
+    for (i = TEMP_WORLD / 2; i < TEMP_WORLD; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr_odd +
+                                               (i - TEMP_WORLD / 2) * SIMD_COMPUTE);
+    }
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> sum;
+    sum = buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(0) +
+          buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(SIMD_COMPUTE * TEMP_WORLD / 2);
+
+    //store the result in at (SIMD_COMPUTE * TEMP_WORLD) offset in remote ranks' temp buffers.
+    //distribute to other ranks. But even(odd) rank goes to other even(odd) rank.
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        data_type *ptr = (data_type *)temp_buffer[even_ranks[i]];
+        ptr += idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 +
+               size_per_buffer_kernel * buffer_index_kernel + TEMP_WORLD * SIMD_COMPUTE;
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::write_back>(
+            ptr + (temp_rank / 2) * SIMD_COMPUTE,
+            sum.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+    }
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void all_sum(int idx,
+             const void *in_buffer,
+             uint32_t size,
+             int threads_already_processed,
+             void *temp_buffer[],
+             uint32_t temp_rank,
+             int size_per_buffer_kernel,
+             int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    int read_offset =
+        idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 +
+        SIMD_COMPUTE *
+            TEMP_WORLD; //points to second half of the temp slot since that's where the data is from other ranks.
+    ptr += read_offset + size_per_buffer_kernel * buffer_index_kernel;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr + i * SIMD_COMPUTE);
+    }
+    simd<data_type, SIMD_COMPUTE> sum = 0;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        sum = sum + buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i);
+    }
+    //store the result
+    lsc_block_store<data_type,
+                    SIMD_COMPUTE,
+                    lsc_data_size::default_size,
+                    cache_hint::uncached,
+                    cache_hint::write_back> //save the all sum in the second half of the temp slot.
+        (ptr, sum);
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void gather_from_remote_and_dist_to_rank_pair(int *even_ranks,
+                                              int idx,
+                                              void *out_buffer,
+                                              uint32_t size,
+                                              int threads_already_processed,
+                                              void *temp_buffer[],
+                                              uint32_t temp_rank,
+                                              int outer_iter,
+                                              int size_per_buffer_kernel,
+                                              int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        //read the values
+        data_type *read_ptr = (data_type *)temp_buffer[even_ranks[i]];
+        read_ptr += size_per_buffer_kernel * buffer_index_kernel;
+        read_ptr += idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 +
+                    SIMD_COMPUTE * TEMP_WORLD; //get the sum from the second half of temp slot
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(read_ptr);
+    }
+
+    //write the data to the pair of ranks within the same gpu
+    //gather in the first half of the slot
+    data_type *mdfi_ptr = (data_type *)temp_buffer[temp_rank ^ 1];
+    mdfi_ptr += size_per_buffer_kernel * buffer_index_kernel;
+    mdfi_ptr += idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::write_back>(
+            mdfi_ptr + i * SIMD_COMPUTE,
+            buffer.template select<SIMD_COMPUTE, 1>(
+                SIMD_COMPUTE * i)); //save the results in the first half of temp slot
+    }
+
+    int is_odd = (even_ranks[0] == 1);
+    data_type *out_ptr = (data_type *)out_buffer;
+    uint32_t write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD +
+                            is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    if (write_offset + SIMD_COMPUTE * TEMP_WORLD / 2 <= size) {
+#pragma unroll
+        for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                out_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+    }
+    else if (write_offset < size) {
+        int vec_count = (size - write_offset) / SIMD_COMPUTE;
+        for (int i = 0; i < vec_count; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                out_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+        int count = size - write_offset - vec_count * SIMD_COMPUTE;
+        for (int i = 0; i < count; i++) {
+            out_ptr[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+        }
+    }
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void write_output(int *even_ranks,
+                  int idx,
+                  void *out_buffer,
+                  uint32_t size,
+                  int threads_already_processed,
+                  void *temp_buffer[],
+                  uint32_t temp_rank,
+                  int outer_iter,
+                  int size_per_buffer_kernel,
+                  int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+    data_type *read_ptr = (data_type *)temp_buffer[temp_rank];
+    read_ptr +=
+        idx * SIMD_COMPUTE * TEMP_WORLD * 3 / 2 + size_per_buffer_kernel * buffer_index_kernel;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        //read the values
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(read_ptr + i * SIMD_COMPUTE);
+    }
+
+    int is_odd = (even_ranks[0] == 1);
+    //write out the results
+    data_type *write_ptr = (data_type *)out_buffer;
+    uint32_t write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD +
+                            (1 - is_odd) * SIMD_COMPUTE * TEMP_WORLD / 2;
+    if (write_offset + SIMD_COMPUTE * TEMP_WORLD / 2 <= size) {
+#pragma unroll
+        for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                write_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+    }
+    else if (write_offset < size) {
+        int vec_count = (size - write_offset) / SIMD_COMPUTE;
+        for (int i = 0; i < vec_count; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                write_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+        int count = size - write_offset - vec_count * SIMD_COMPUTE;
+        for (int i = 0; i < count; i++) {
+            write_ptr[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+        }
+    }
+}
+
+// kernels for use_tmp_buf == 0
+// tmp buffer is used for size: SIMD_COMPUTE * TEMP_WORLD / 2
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_sum_and_distribute_to_remote_ranks(int *even_ranks,
+                                               int myrank,
+                                               int idx,
+                                               void **in_buffers,
+                                               uint32_t size,
+                                               int threads_already_processed,
+                                               void *temp_buffer[],
+                                               uint32_t temp_rank,
+                                               int size_per_buffer_kernel,
+                                               int buffer_index_kernel2) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    //even rank and odd rank each read half
+    uint32_t read_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD;
+    int is_odd = (even_ranks[0] == 1);
+    data_type *ptr_even =
+        (data_type *)in_buffers[temp_rank & 0xfffffffe] + is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    data_type *ptr_odd =
+        (data_type *)in_buffers[temp_rank | 1] + is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> sum;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD> buffer;
+    uint32_t i;
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr_even + read_offset +
+                                               i * SIMD_COMPUTE);
+    }
+#pragma unroll
+    for (i = TEMP_WORLD / 2; i < TEMP_WORLD; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr_odd + read_offset +
+                                               (i - TEMP_WORLD / 2) * SIMD_COMPUTE);
+    }
+    sum = buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(0) +
+          buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(SIMD_COMPUTE * TEMP_WORLD / 2);
+
+    //store the result in at (SIMD_COMPUTE * TEMP_WORLD) offset in remote ranks' temp buffers.
+    //distribute to other ranks. But even(odd) rank goes to other even(odd) rank.
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        data_type *ptr = (data_type *)temp_buffer[even_ranks[i]];
+        ptr += idx * SIMD_COMPUTE * TEMP_WORLD + size_per_buffer_kernel * buffer_index_kernel2;
+        lsc_block_store<data_type,
+                        SIMD_COMPUTE,
+                        lsc_data_size::default_size,
+                        cache_hint::uncached,
+                        cache_hint::write_back>(
+            ptr + (temp_rank / 2) * SIMD_COMPUTE,
+            sum.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+    }
+}
+
+// All-sum stage of the no-copy path: sums the TEMP_WORLD/2 partial vectors
+// that peer ranks deposited into this rank's temp slot and stores the single
+// reduced SIMD_COMPUTE-wide vector back in place at the start of that slot.
+// NOTE(review): in_buffer, size and threads_already_processed are not read by
+// this stage; they appear to be kept for signature symmetry with the other
+// pipeline stages -- confirm before removing.
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_all_sum(int idx,
+                    const void *in_buffer,
+                    uint32_t size,
+                    int threads_already_processed,
+                    void *temp_buffer[],
+                    uint32_t temp_rank,
+                    int size_per_buffer_kernel,
+                    int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    //read the input data
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    int read_offset = idx * SIMD_COMPUTE * TEMP_WORLD;
+    ptr +=
+        read_offset +
+        size_per_buffer_kernel *
+            buffer_index_kernel; //points to second half of the temp slot since that's where the data is from other ranks.
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+    // gather the TEMP_WORLD/2 partial vectors written by the peer ranks
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>((data_type *)ptr + i * SIMD_COMPUTE);
+    }
+    // accumulate the partials into one SIMD_COMPUTE-wide vector
+    simd<data_type, SIMD_COMPUTE> sum = 0;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        sum = sum + buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i);
+    }
+    //store the result
+    lsc_block_store<data_type,
+                    SIMD_COMPUTE,
+                    lsc_data_size::default_size,
+                    cache_hint::uncached,
+                    cache_hint::write_back> //save the all sum in the second half of the temp slot.
+        (ptr, sum);
+}
+
+// Gather stage of the no-copy path: reads the reduced vectors left in the
+// temp slots of this rank's even(odd)-group peers (even_ranks[] maps group
+// index -> global rank; is_odd tells which half of each output row this
+// group owns) and writes them into the output buffers of BOTH ranks of the
+// local pair (out_buffers[temp_rank] and out_buffers[temp_rank ^ 1]).
+// Tail handling: when the write would run past `size`, full SIMD vectors
+// are stored first, then the remaining scalars are copied one by one.
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_gather_from_remote_and_dist_to_rank_pair(int *even_ranks,
+                                                     int idx,
+                                                     void **out_buffers,
+                                                     uint32_t size,
+                                                     int threads_already_processed,
+                                                     void *temp_buffer[],
+                                                     uint32_t temp_rank,
+                                                     int size_per_buffer_kernel,
+                                                     int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int is_odd = (even_ranks[0] == 1);
+    //read the input data
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> buffer;
+
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD / 2; i++) {
+        //read the values
+        data_type *read_ptr_int = (data_type *)temp_buffer[even_ranks[i]];
+        read_ptr_int += size_per_buffer_kernel * buffer_index_kernel;
+        read_ptr_int += idx * SIMD_COMPUTE * TEMP_WORLD;
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(read_ptr_int);
+    }
+
+    //write the data to the pair of ranks within the same gpu
+    data_type *ptr_even = (data_type *)out_buffers[temp_rank ^ 1];
+    data_type *ptr_odd = (data_type *)out_buffers[temp_rank];
+    uint32_t write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD +
+                            is_odd * SIMD_COMPUTE * TEMP_WORLD / 2;
+    uint32_t i;
+    // fast path: the whole half-row fits inside the buffer
+    if (write_offset + SIMD_COMPUTE * TEMP_WORLD / 2 <= size) {
+#pragma unroll
+        for (i = 0; i < TEMP_WORLD / 2; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_even + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(
+                    SIMD_COMPUTE * i)); //save the results in the first half of temp slot
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_odd + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(
+                    SIMD_COMPUTE * i)); //save the results in the first half of temp slot
+        }
+    }
+    // tail: fewer than SIMD_COMPUTE * TEMP_WORLD / 2 elements remain
+    else if (write_offset < size) {
+        uint32_t vec_count = (size - write_offset) / SIMD_COMPUTE;
+        for (i = 0; i < vec_count; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_even + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_odd + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+        // scalar remainder after the last full SIMD vector
+        uint32_t count = size - write_offset - vec_count * SIMD_COMPUTE;
+        for (i = 0; i < count; i++) {
+            ptr_even[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+            ptr_odd[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+        }
+    }
+}
+
+// Single-kernel no-copy allreduce for the 2-rank case: each rank reads its
+// half of the row directly from both ranks' input buffers, sums the two
+// halves, and writes its half of the result into BOTH ranks' output buffers
+// (out_buffers[temp_rank] and out_buffers[temp_rank ^ 1]). Scalar tail
+// handling covers the end of the buffer when a full half-row does not fit.
+template <typename data_type>
+void nocopy_2rank(int idx,
+                  void **in_buffers,
+                  uint32_t size,
+                  void **out_buffers,
+                  uint32_t temp_rank) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    const uint32_t TEMP_WORLD = 2;
+    //read the input data
+    //even rank and odd rank each read half
+    uint32_t read_offset = idx * SIMD_COMPUTE * TEMP_WORLD;
+    data_type *ptr_even = (data_type *)in_buffers[0] + temp_rank * SIMD_COMPUTE * TEMP_WORLD / 2;
+    data_type *ptr_odd = (data_type *)in_buffers[1] + temp_rank * SIMD_COMPUTE * TEMP_WORLD / 2;
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD> buffer;
+    uint32_t i;
+    // first half of `buffer` comes from rank 0's input
+#pragma unroll
+    for (i = 0; i < TEMP_WORLD / 2; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(ptr_even + read_offset + i * SIMD_COMPUTE);
+    }
+    // second half comes from rank 1's input
+#pragma unroll
+    for (i = TEMP_WORLD / 2; i < TEMP_WORLD; i++) {
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(ptr_odd + read_offset +
+                                               (i - TEMP_WORLD / 2) * SIMD_COMPUTE);
+    }
+    // elementwise sum of the two ranks' contributions
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD / 2> sum;
+    sum = buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(0) +
+          buffer.template select<SIMD_COMPUTE * TEMP_WORLD / 2, 1>(SIMD_COMPUTE * TEMP_WORLD / 2);
+
+    //write the data to the pair of ranks within the same gpu
+    ptr_even = (data_type *)out_buffers[temp_rank ^ 1];
+    ptr_odd = (data_type *)out_buffers[temp_rank];
+    uint32_t write_offset =
+        idx * SIMD_COMPUTE * TEMP_WORLD + temp_rank * SIMD_COMPUTE * TEMP_WORLD / 2;
+    // fast path: the whole half-row fits inside the buffer
+    if (write_offset + SIMD_COMPUTE * TEMP_WORLD / 2 <= size) {
+#pragma unroll
+        for (i = 0; i < TEMP_WORLD / 2; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_even + write_offset + i * SIMD_COMPUTE,
+                sum.template select<SIMD_COMPUTE, 1>(
+                    SIMD_COMPUTE * i)); //save the results in the first half of temp slot
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_odd + write_offset + i * SIMD_COMPUTE,
+                sum.template select<SIMD_COMPUTE, 1>(
+                    SIMD_COMPUTE * i)); //save the results in the first half of temp slot
+        }
+    }
+    // tail: store full SIMD vectors first, then the scalar remainder
+    else if (write_offset < size) {
+        uint32_t vec_count = (size - write_offset) / SIMD_COMPUTE;
+        for (i = 0; i < vec_count; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_even + write_offset + i * SIMD_COMPUTE,
+                sum.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                ptr_odd + write_offset + i * SIMD_COMPUTE,
+                sum.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+        uint32_t count = size - write_offset - vec_count * SIMD_COMPUTE;
+        for (i = 0; i < count; i++) {
+            ptr_even[write_offset + vec_count * SIMD_COMPUTE + i] =
+                sum[vec_count * SIMD_COMPUTE + i];
+            ptr_odd[write_offset + vec_count * SIMD_COMPUTE + i] =
+                sum[vec_count * SIMD_COMPUTE + i];
+        }
+    }
+}
+
+// Final stage of the 2-rank copy-based path: copies the reduced data from
+// this rank's temp slot (offset by the current double-buffer index) back to
+// the user's output buffer. Tail handling mirrors the other stages: full
+// SIMD vectors first, then a scalar remainder loop.
+// NOTE(review): outer_iter is not used in this function -- confirm whether
+// it can be dropped from the signature.
+template <typename data_type>
+void write_output_2rank(int idx,
+                        void *out_buffer,
+                        uint32_t size,
+                        int threads_already_processed,
+                        void *temp_buffer[],
+                        uint32_t temp_rank,
+                        int outer_iter,
+                        int size_per_buffer_kernel,
+                        int buffer_index_kernel) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    const uint32_t TEMP_WORLD = 2;
+    //read the input data
+    simd<data_type, SIMD_COMPUTE * TEMP_WORLD> buffer;
+    data_type *read_ptr = (data_type *)temp_buffer[temp_rank];
+    read_ptr += idx * SIMD_COMPUTE * TEMP_WORLD + size_per_buffer_kernel * buffer_index_kernel;
+#pragma unroll
+    for (uint32_t i = 0; i < TEMP_WORLD; i++) {
+        //read the values
+        buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i) =
+            lsc_block_load<data_type,
+                           SIMD_COMPUTE,
+                           lsc_data_size::default_size,
+                           cache_hint::uncached,
+                           cache_hint::cached>(read_ptr + i * SIMD_COMPUTE);
+    }
+
+    //write out the results
+    data_type *write_ptr = (data_type *)out_buffer;
+    uint32_t write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * TEMP_WORLD;
+    // fast path: a full row fits inside the output buffer
+    if (write_offset + SIMD_COMPUTE * TEMP_WORLD <= size) {
+#pragma unroll
+        for (uint32_t i = 0; i < TEMP_WORLD; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                write_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+    }
+    // tail: store full SIMD vectors first, then the scalar remainder
+    else if (write_offset < size) {
+        int vec_count = (size - write_offset) / SIMD_COMPUTE;
+        for (int i = 0; i < vec_count; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::uncached>(
+                write_ptr + write_offset + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+        int count = size - write_offset - vec_count * SIMD_COMPUTE;
+        for (int i = 0; i < count; i++) {
+            write_ptr[write_offset + vec_count * SIMD_COMPUTE + i] =
+                buffer[vec_count * SIMD_COMPUTE + i];
+        }
+    }
+}
+
+// Forward declarations of types used purely as SYCL kernel name tags in the
+// parallel_for submissions below.
+// Stages of the copy-based (tmp-buffer) allreduce pipeline:
+template <typename dtype>
+class Kernel_load_input_to_temp_buffer;
+template <typename dtype>
+class Kernel_local_sum_and_distribute_to_remote_ranks;
+template <typename dtype>
+class Kernel_all_sum;
+template <typename dtype>
+class Kernel_gather_from_remote_and_dist_to_rank_pair;
+template <typename dtype>
+class Kernel_write_output;
+
+// 2-rank variants of the medium allreduce:
+template <typename dtype>
+class AllreduceMediumKernel_2rank;
+template <typename dtype>
+class AllreduceMediumKernel_write_output_2rank;
+
+// Stages of the no-copy allreduce pipeline:
+template <typename dtype>
+class NoCopyKernel_local_sum_and_distribute_to_remote_ranks;
+template <typename dtype>
+class NoCopyKernel_all_sum;
+template <typename dtype>
+class NoCopyKernel_gather_from_remote_and_dist_to_rank_pair;
+template <typename dtype>
+class NoCopyKernel_2rank;
+
+// Cross-rank synchronization kernels:
+template <typename dtype>
+class AllreduceMediumKernel_GlobalSync;
+template <typename dtype>
+class AllreduceMediumKernel_LocalSync;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK, uint32_t max_buffer = 1024 /*KB*/>
+class sycl_allreduce_medium : public sycl_coll_base<data_type> {
+public:
+    // Constructs the object in an uninitialized state; init() sizes the
+    // buffers (size_per_buffer is assigned there) and must run before use.
+    sycl_allreduce_medium() : sycl_coll_base<data_type>() {
+        size_per_buffer = 0;
+    }
+
+    // One-time setup: records rank/world, allocates and zero-fills the
+    // device-side temp buffer (allocation is guarded so it happens only on
+    // the first call), exchanges IPC handles so every rank can address its
+    // peers' temp/sync buffers, and caches the stream/comm plus the "even"
+    // sub-communicator used later to build the even-rank map.
+    void init(sycl::queue &queue,
+              ccl_comm *comm,
+              ccl_stream *stream,
+              uint32_t rank_in,
+              uint32_t world_in) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+        rank = rank_in;
+        world = world_in;
+
+        data_size_per_buffer = MAX_SIZE / sizeof(data_type);
+
+        // each buffer carries MAX_SIZE of payload plus SYNC_BYTE of sync flags
+        size_per_buffer = MAX_SIZE + SYNC_BYTE;
+
+        // allocate once; reused across subsequent init() calls
+        if (allreduce_medium_buffer == NULL) {
+            allreduce_medium_buffer = sycl::malloc_device(size_per_buffer * BUFFER_COUNT, queue);
+
+            // zero the whole allocation (payload + sync area) before sharing it
+            auto e = queue.memset(allreduce_medium_buffer, 0, size_per_buffer * BUFFER_COUNT);
+            e.wait();
+
+            // XXX: gain access to remote pointers
+            this->exchange_peer_ipc_mem(queue,
+                                        comm,
+                                        stream,
+                                        allreduce_medium_buffer,
+                                        NULL,
+                                        rank,
+                                        world,
+                                        data_size_per_buffer * sizeof(data_type),
+                                        (void **)allreduce_medium_buffers,
+                                        (void **)allreduce_medium_sync_buffer,
+                                        allreduce_medium_offsets,
+                                        allreduce_medium_ipc_handle,
+                                        NULL,
+                                        NULL /* mmap_buffers */,
+                                        false /* to_cache */);
+        }
+
+        this->initialized = true;
+
+        global_stream = stream;
+        global_comm = comm;
+        even_comm = global_comm->get_even_comm().get();
+    }
+
+    // Entry point for the medium allreduce: dispatches to the copy-based
+    // (tmp-buffer) path when the environment requests it, otherwise to one
+    // of the zero-copy paths (a dedicated variant exists for world == 2).
+    ccl::event allreduce(sycl::queue &queue,
+                         const void *in_buffer,
+                         void *out_buffer,
+                         uint32_t size) {
+        const bool use_tmp_buf = ccl::global_data::env().allreduce_use_tmp_buf;
+        if (use_tmp_buf) {
+            return allreduce_copy(queue, in_buffer, out_buffer, size);
+        }
+        return (world == 2) ? allreduce_nocopy_2rank(queue, in_buffer, out_buffer, size)
+                            : allreduce_nocopy(queue, in_buffer, out_buffer, size);
+    }
+
+private:
+    ccl::event allreduce_copy(sycl::queue &queue,
+                              const void *in_buffer,
+                              void *out_buffer,
+                              uint32_t size) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = allreduce_medium_buffers[i];
+        }
+        /*
+        void* temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) 
+        {
+            temp_sync_buffer[i] = allreduce_medium_sync_buffer[i];
+        }
+        */
+
+        int even_ranks[max_rank];
+        int myrank;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank)
+                myrank = i;
+            //printf("even rank %d: %d neighbor: %d\n", i, even_ranks[i], even_ranks[i] ^ 1);
+        }
+
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        int buffer_index_kernel = allreduce_medium_buffer_index;
+        int outerloop_iter_count; //Since 16 elements in temp buffer is used to process 8 element output, the outer loop count must be doubled roughly.
+        int outer_iter;
+        //todo:
+        //5. prefetch in persistent threads?
+        int max_threads_per_MAX_COUNT =
+            (data_size_per_buffer * 2 / 3) /
+            (SIMD_COMPUTE *
+             temp_world); // each thread uses (data_size_per_buffer * temp_world) * 3 / 2 space
+        int max_elements_per_MAX_COUNT = max_threads_per_MAX_COUNT * (SIMD_COMPUTE * temp_world);
+
+        int threads_already_processed = 0;
+        outerloop_iter_count =
+            (size + max_elements_per_MAX_COUNT - 1) /
+            max_elements_per_MAX_COUNT; //this is the outerloop count that requires full hw thread count. This doesnt include the outloop iteration that only needs partial thread count
+        //uint32_t total_threads_needed_sync = 1;
+        for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++) {
+            uint32_t total_threads_needed;
+            if ((outer_iter + 1) * max_elements_per_MAX_COUNT < (int)size) {
+                total_threads_needed = max_threads_per_MAX_COUNT;
+            }
+            else {
+                total_threads_needed = (size - outer_iter * max_elements_per_MAX_COUNT +
+                                        SIMD_COMPUTE * temp_world - 1) /
+                                       (SIMD_COMPUTE * temp_world);
+            }
+            int wg_size = 1;
+
+            int innerloop_iter_count =
+                (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+
+#define KERNEL_EXEC_MAP (1 + 2 + 4 + 8 + 16 + 32 + 64 + 128 + 256)
+
+#if KERNEL_EXEC_MAP & 1
+            //Data is sent to other tile within the same gpu via MDFI
+            queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for<class Kernel_load_input_to_temp_buffer<data_type>>(
+                    sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                    {
+                    //ESIMD kernel
+                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                        if ((uint32_t)index >= total_threads_needed)
+                            break;
+
+                        switch (temp_world) {
+                            case 2:
+                                load_input_to_temp_buffer<2, data_type>(index,
+                                                                        in_buffer,
+                                                                        size,
+                                                                        threads_already_processed,
+                                                                        (void **)temp_buffer,
+                                                                        temp_rank,
+                                                                        outer_iter,
+                                                                        size_per_buffer_kernel,
+                                                                        buffer_index_kernel);
+                                break;
+                            case 4:
+                                load_input_to_temp_buffer<4, data_type>(index,
+                                                                        in_buffer,
+                                                                        size,
+                                                                        threads_already_processed,
+                                                                        (void **)temp_buffer,
+                                                                        temp_rank,
+                                                                        outer_iter,
+                                                                        size_per_buffer_kernel,
+                                                                        buffer_index_kernel);
+                                break;
+                            case 6:
+                                load_input_to_temp_buffer<6, data_type>(index,
+                                                                        in_buffer,
+                                                                        size,
+                                                                        threads_already_processed,
+                                                                        (void **)temp_buffer,
+                                                                        temp_rank,
+                                                                        outer_iter,
+                                                                        size_per_buffer_kernel,
+                                                                        buffer_index_kernel);
+                                break;
+                            case 8:
+                                load_input_to_temp_buffer<8, data_type>(index,
+                                                                        in_buffer,
+                                                                        size,
+                                                                        threads_already_processed,
+                                                                        (void **)temp_buffer,
+                                                                        temp_rank,
+                                                                        outer_iter,
+                                                                        size_per_buffer_kernel,
+                                                                        buffer_index_kernel);
+                                break;
+                            case 10:
+                                load_input_to_temp_buffer<10, data_type>(index,
+                                                                         in_buffer,
+                                                                         size,
+                                                                         threads_already_processed,
+                                                                         (void **)temp_buffer,
+                                                                         temp_rank,
+                                                                         outer_iter,
+                                                                         size_per_buffer_kernel,
+                                                                         buffer_index_kernel);
+                                break;
+                            case 12:
+                                load_input_to_temp_buffer<12, data_type>(index,
+                                                                         in_buffer,
+                                                                         size,
+                                                                         threads_already_processed,
+                                                                         (void **)temp_buffer,
+                                                                         temp_rank,
+                                                                         outer_iter,
+                                                                         size_per_buffer_kernel,
+                                                                         buffer_index_kernel);
+                                break;
+                            case 14:
+                                load_input_to_temp_buffer<14, data_type>(index,
+                                                                         in_buffer,
+                                                                         size,
+                                                                         threads_already_processed,
+                                                                         (void **)temp_buffer,
+                                                                         temp_rank,
+                                                                         outer_iter,
+                                                                         size_per_buffer_kernel,
+                                                                         buffer_index_kernel);
+                                break;
+                            case 16:
+                                load_input_to_temp_buffer<16, data_type>(index,
+                                                                         in_buffer,
+                                                                         size,
+                                                                         threads_already_processed,
+                                                                         (void **)temp_buffer,
+                                                                         temp_rank,
+                                                                         outer_iter,
+                                                                         size_per_buffer_kernel,
+                                                                         buffer_index_kernel);
+                                break;
+                            default: break;
+                        }
+                    }
+
+                    });//parallel_for
+            }); //submit()
+#endif
+#if KERNEL_EXEC_MAP & 2
+                //sync all the ranks within the single GPU.
+            e = local_sync(queue,
+                           temp_rank,
+                           temp_world,
+                           size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                           0,
+                           0);
+            //printf("kernel1\n");
+#endif
+#if KERNEL_EXEC_MAP & 4
+            //local reduction kernel
+            queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class Kernel_local_sum_and_distribute_to_remote_ranks<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= total_threads_needed)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    local_sum_and_distribute_to_remote_ranks<2, data_type>(
+                                        (int *)even_ranks,
+                                        myrank,
+                                        index,
+                                        in_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 4:
+                                    local_sum_and_distribute_to_remote_ranks<4, data_type>(
+                                        (int *)even_ranks,
+                                        myrank,
+                                        index,
+                                        in_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 6:
+                                    local_sum_and_distribute_to_remote_ranks<6, data_type>(
+                                        (int *)even_ranks,
+                                        myrank,
+                                        index,
+                                        in_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 8:
+                                    local_sum_and_distribute_to_remote_ranks<8, data_type>(
+                                        (int *)even_ranks,
+                                        myrank,
+                                        index,
+                                        in_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 10:
+                                    local_sum_and_distribute_to_remote_ranks<10, data_type>(
+                                        (int *)even_ranks,
+                                        myrank,
+                                        index,
+                                        in_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 12:
+                                    local_sum_and_distribute_to_remote_ranks<12, data_type>(
+                                        (int *)even_ranks,
+                                        myrank,
+                                        index,
+                                        in_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 14:
+                                    local_sum_and_distribute_to_remote_ranks<14, data_type>(
+                                        (int *)even_ranks,
+                                        myrank,
+                                        index,
+                                        in_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 16:
+                                    local_sum_and_distribute_to_remote_ranks<16, data_type>(
+                                        (int *)even_ranks,
+                                        myrank,
+                                        index,
+                                        in_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                default: break;
+                            }
+                        }
+
+                        });//parallel_for
+            }); //submit()
+#endif
+#if KERNEL_EXEC_MAP & 8
+            //sync all the ranks here before consuming the results.
+            e = global_sync(queue,
+                            temp_rank,
+                            temp_world,
+                            size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                            1,
+                            0);
+#endif
+#if KERNEL_EXEC_MAP & 16
+            //local reduction kernel
+            queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class Kernel_all_sum<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= total_threads_needed)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    all_sum<2, data_type>(index,
+                                                          in_buffer,
+                                                          size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel);
+                                    break;
+                                case 4:
+                                    all_sum<4, data_type>(index,
+                                                          in_buffer,
+                                                          size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel);
+                                    break;
+                                case 6:
+                                    all_sum<6, data_type>(index,
+                                                          in_buffer,
+                                                          size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel);
+                                    break;
+                                case 8:
+                                    all_sum<8, data_type>(index,
+                                                          in_buffer,
+                                                          size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel);
+                                    break;
+                                case 10:
+                                    all_sum<10, data_type>(index,
+                                                           in_buffer,
+                                                           size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel);
+                                    break;
+                                case 12:
+                                    all_sum<12, data_type>(index,
+                                                           in_buffer,
+                                                           size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel);
+                                    break;
+                                case 14:
+                                    all_sum<14, data_type>(index,
+                                                           in_buffer,
+                                                           size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel);
+                                    break;
+                                case 16:
+                                    all_sum<16, data_type>(index,
+                                                           in_buffer,
+                                                           size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel);
+                                    break;
+                                default: break;
+                            }
+                        }
+
+                        });//parallel_for
+            }); //submit()
+#endif
+#if KERNEL_EXEC_MAP & 32
+            //sync all the ranks here before consuming the results.
+            e = global_sync(queue,
+                            temp_rank,
+                            temp_world,
+                            size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                            3,
+                            0);
+#endif
+#if KERNEL_EXEC_MAP & 64
+            //copy the results to all the ranks.
+            queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class Kernel_gather_from_remote_and_dist_to_rank_pair<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= total_threads_needed)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    gather_from_remote_and_dist_to_rank_pair<2, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        out_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        outer_iter,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 4:
+                                    gather_from_remote_and_dist_to_rank_pair<4, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        out_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        outer_iter,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 6:
+                                    gather_from_remote_and_dist_to_rank_pair<6, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        out_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        outer_iter,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 8:
+                                    gather_from_remote_and_dist_to_rank_pair<8, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        out_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        outer_iter,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 10:
+                                    gather_from_remote_and_dist_to_rank_pair<10, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        out_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        outer_iter,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 12:
+                                    gather_from_remote_and_dist_to_rank_pair<12, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        out_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        outer_iter,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 14:
+                                    gather_from_remote_and_dist_to_rank_pair<14, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        out_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        outer_iter,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 16:
+                                    gather_from_remote_and_dist_to_rank_pair<16, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        out_buffer,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        outer_iter,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                default: break;
+                            }
+                        }
+                        });//parallel_for
+            }); //submit()
+#endif
+#if KERNEL_EXEC_MAP & 128
+            //sync all the ranks within the same GPU.
+            e = local_sync(queue,
+                           temp_rank,
+                           temp_world,
+                           size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                           4,
+                           1);
+#endif
+#if KERNEL_EXEC_MAP & 256
+            //copy the results to all the ranks.
+            e = queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class Kernel_write_output<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= total_threads_needed)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    write_output<2, data_type>((int *)even_ranks,
+                                                               index,
+                                                               out_buffer,
+                                                               size,
+                                                               threads_already_processed,
+                                                               (void **)temp_buffer,
+                                                               temp_rank,
+                                                               outer_iter,
+                                                               size_per_buffer_kernel,
+                                                               buffer_index_kernel);
+                                    break;
+                                case 4:
+                                    write_output<4, data_type>((int *)even_ranks,
+                                                               index,
+                                                               out_buffer,
+                                                               size,
+                                                               threads_already_processed,
+                                                               (void **)temp_buffer,
+                                                               temp_rank,
+                                                               outer_iter,
+                                                               size_per_buffer_kernel,
+                                                               buffer_index_kernel);
+                                    break;
+                                case 6:
+                                    write_output<6, data_type>((int *)even_ranks,
+                                                               index,
+                                                               out_buffer,
+                                                               size,
+                                                               threads_already_processed,
+                                                               (void **)temp_buffer,
+                                                               temp_rank,
+                                                               outer_iter,
+                                                               size_per_buffer_kernel,
+                                                               buffer_index_kernel);
+                                    break;
+                                case 8:
+                                    write_output<8, data_type>((int *)even_ranks,
+                                                               index,
+                                                               out_buffer,
+                                                               size,
+                                                               threads_already_processed,
+                                                               (void **)temp_buffer,
+                                                               temp_rank,
+                                                               outer_iter,
+                                                               size_per_buffer_kernel,
+                                                               buffer_index_kernel);
+                                    break;
+                                case 10:
+                                    write_output<10, data_type>((int *)even_ranks,
+                                                                index,
+                                                                out_buffer,
+                                                                size,
+                                                                threads_already_processed,
+                                                                (void **)temp_buffer,
+                                                                temp_rank,
+                                                                outer_iter,
+                                                                size_per_buffer_kernel,
+                                                                buffer_index_kernel);
+                                    break;
+                                case 12:
+                                    write_output<12, data_type>((int *)even_ranks,
+                                                                index,
+                                                                out_buffer,
+                                                                size,
+                                                                threads_already_processed,
+                                                                (void **)temp_buffer,
+                                                                temp_rank,
+                                                                outer_iter,
+                                                                size_per_buffer_kernel,
+                                                                buffer_index_kernel);
+                                    break;
+                                case 14:
+                                    write_output<14, data_type>((int *)even_ranks,
+                                                                index,
+                                                                out_buffer,
+                                                                size,
+                                                                threads_already_processed,
+                                                                (void **)temp_buffer,
+                                                                temp_rank,
+                                                                outer_iter,
+                                                                size_per_buffer_kernel,
+                                                                buffer_index_kernel);
+                                    break;
+                                case 16:
+                                    write_output<16, data_type>((int *)even_ranks,
+                                                                index,
+                                                                out_buffer,
+                                                                size,
+                                                                threads_already_processed,
+                                                                (void **)temp_buffer,
+                                                                temp_rank,
+                                                                outer_iter,
+                                                                size_per_buffer_kernel,
+                                                                buffer_index_kernel);
+                                    break;
+                                default: break;
+                            }
+                        }
+                        });//parallel_for
+            }); //submit()
+#endif
+
+            buffer_index_kernel++;
+            buffer_index_kernel &= 1;
+            threads_already_processed += total_threads_needed;
+        } //for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++)
+
+        allreduce_medium_buffer_index += outerloop_iter_count;
+        allreduce_medium_buffer_index &= 1;
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // perform IPC exchange every time
+    ccl::event allreduce_nocopy(sycl::queue &queue,
+                                const void *in_buffer,
+                                void *out_buffer,
+                                uint32_t size) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = allreduce_medium_buffers[i];
+        }
+
+        int even_ranks[max_rank];
+        int myrank;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank) {
+                myrank = i;
+            }
+        }
+
+        void *in_buffers[max_rank];
+        void *out_buffers[max_rank];
+        this->exchange_peer_ipc_mem(queue,
+                                    global_comm,
+                                    global_stream,
+                                    (void **)in_buffer,
+                                    out_buffer,
+                                    rank,
+                                    world,
+                                    0,
+                                    (void **)in_buffers,
+                                    NULL,
+                                    NULL,
+                                    NULL,
+                                    (void **)out_buffers);
+
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        int buffer_index_kernel = allreduce_medium_buffer_index;
+        //Since 16 elements in temp buffer is used to process 8 element output, the outer loop count must be doubled roughly.
+        int outerloop_iter_count;
+        int outer_iter;
+        //todo:
+        //5. prefetch in persistent threads?
+        int max_threads_per_MAX_COUNT = data_size_per_buffer / (SIMD_COMPUTE * temp_world);
+        int max_elements_per_MAX_COUNT = max_threads_per_MAX_COUNT * (SIMD_COMPUTE * temp_world);
+
+        int threads_already_processed = 0;
+        //this is the outerloop count that requires full hw thread count. This doesnt include the outloop iteration that only needs partial thread count
+        outerloop_iter_count = (size + max_elements_per_MAX_COUNT - 1) / max_elements_per_MAX_COUNT;
+        //uint32_t total_threads_needed_sync = 1;
+
+        // sync two tiles of a same GPU before entering the call
+        e = local_sync(queue,
+                       temp_rank,
+                       temp_world,
+                       size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                       0,
+                       0);
+
+        for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++) {
+            uint32_t total_threads_needed;
+            if ((outer_iter + 1) * max_elements_per_MAX_COUNT < (int)size) {
+                total_threads_needed = max_threads_per_MAX_COUNT;
+            }
+            else {
+                total_threads_needed = (size - outer_iter * max_elements_per_MAX_COUNT +
+                                        SIMD_COMPUTE * temp_world - 1) /
+                                       (SIMD_COMPUTE * temp_world);
+            }
+            int wg_size = 1;
+
+            int innerloop_iter_count =
+                (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+
+            //local reduction kernel
+            e = queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for<class NoCopyKernel_local_sum_and_distribute_to_remote_ranks<data_type>>(
+                    sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                    {
+                    //ESIMD kernel
+                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                        if ((uint32_t)index >= total_threads_needed)
+                            break;
+
+                        switch (temp_world) {
+                            case 2:
+                                nocopy_sum_and_distribute_to_remote_ranks<2, data_type>(
+                                    (int *)even_ranks,
+                                    myrank,
+                                    index,
+                                    (void **)in_buffers,
+                                    size,
+                                    threads_already_processed,
+                                    (void **)temp_buffer,
+                                    temp_rank,
+                                    size_per_buffer_kernel,
+                                    buffer_index_kernel);
+                                break;
+                            case 4:
+                                nocopy_sum_and_distribute_to_remote_ranks<4, data_type>(
+                                    (int *)even_ranks,
+                                    myrank,
+                                    index,
+                                    (void **)in_buffers,
+                                    size,
+                                    threads_already_processed,
+                                    (void **)temp_buffer,
+                                    temp_rank,
+                                    size_per_buffer_kernel,
+                                    buffer_index_kernel);
+                                break;
+                            case 6:
+                                nocopy_sum_and_distribute_to_remote_ranks<6, data_type>(
+                                    (int *)even_ranks,
+                                    myrank,
+                                    index,
+                                    (void **)in_buffers,
+                                    size,
+                                    threads_already_processed,
+                                    (void **)temp_buffer,
+                                    temp_rank,
+                                    size_per_buffer_kernel,
+                                    buffer_index_kernel);
+                                break;
+                            case 8:
+                                nocopy_sum_and_distribute_to_remote_ranks<8, data_type>(
+                                    (int *)even_ranks,
+                                    myrank,
+                                    index,
+                                    (void **)in_buffers,
+                                    size,
+                                    threads_already_processed,
+                                    (void **)temp_buffer,
+                                    temp_rank,
+                                    size_per_buffer_kernel,
+                                    buffer_index_kernel);
+                                break;
+                            case 10:
+                                nocopy_sum_and_distribute_to_remote_ranks<10, data_type>(
+                                    (int *)even_ranks,
+                                    myrank,
+                                    index,
+                                    (void **)in_buffers,
+                                    size,
+                                    threads_already_processed,
+                                    (void **)temp_buffer,
+                                    temp_rank,
+                                    size_per_buffer_kernel,
+                                    buffer_index_kernel);
+                                break;
+                            case 12:
+                                nocopy_sum_and_distribute_to_remote_ranks<12, data_type>(
+                                    (int *)even_ranks,
+                                    myrank,
+                                    index,
+                                    (void **)in_buffers,
+                                    size,
+                                    threads_already_processed,
+                                    (void **)temp_buffer,
+                                    temp_rank,
+                                    size_per_buffer_kernel,
+                                    buffer_index_kernel);
+                                break;
+                            case 14:
+                                nocopy_sum_and_distribute_to_remote_ranks<14, data_type>(
+                                    (int *)even_ranks,
+                                    myrank,
+                                    index,
+                                    (void **)in_buffers,
+                                    size,
+                                    threads_already_processed,
+                                    (void **)temp_buffer,
+                                    temp_rank,
+                                    size_per_buffer_kernel,
+                                    buffer_index_kernel);
+                                break;
+                            case 16:
+                                nocopy_sum_and_distribute_to_remote_ranks<16, data_type>(
+                                    (int *)even_ranks,
+                                    myrank,
+                                    index,
+                                    (void **)in_buffers,
+                                    size,
+                                    threads_already_processed,
+                                    (void **)temp_buffer,
+                                    temp_rank,
+                                    size_per_buffer_kernel,
+                                    buffer_index_kernel);
+                                break;
+                            default: break;
+                        }
+                    }
+
+                    });//parallel_for
+            }); //submit()
+            //e.wait();
+
+            //sync all the ranks here before consuming the results.
+            e = global_sync(queue,
+                            temp_rank,
+                            temp_world,
+                            size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                            2,
+                            0);
+
+            //local reduction kernel
+            e = queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for<class NoCopyKernel_all_sum<data_type>>(
+                    sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                    {
+                    //ESIMD kernel
+                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                        if ((uint32_t)index >= total_threads_needed)
+                            break;
+
+                        switch (temp_world) {
+                            case 2:
+                                nocopy_all_sum<2, data_type>(index,
+                                                             in_buffer,
+                                                             size,
+                                                             threads_already_processed,
+                                                             (void **)temp_buffer,
+                                                             temp_rank,
+                                                             size_per_buffer_kernel,
+                                                             buffer_index_kernel);
+                                break;
+                            case 4:
+                                nocopy_all_sum<4, data_type>(index,
+                                                             in_buffer,
+                                                             size,
+                                                             threads_already_processed,
+                                                             (void **)temp_buffer,
+                                                             temp_rank,
+                                                             size_per_buffer_kernel,
+                                                             buffer_index_kernel);
+                                break;
+                            case 6:
+                                nocopy_all_sum<6, data_type>(index,
+                                                             in_buffer,
+                                                             size,
+                                                             threads_already_processed,
+                                                             (void **)temp_buffer,
+                                                             temp_rank,
+                                                             size_per_buffer_kernel,
+                                                             buffer_index_kernel);
+                                break;
+                            case 8:
+                                nocopy_all_sum<8, data_type>(index,
+                                                             in_buffer,
+                                                             size,
+                                                             threads_already_processed,
+                                                             (void **)temp_buffer,
+                                                             temp_rank,
+                                                             size_per_buffer_kernel,
+                                                             buffer_index_kernel);
+                                break;
+                            case 10:
+                                nocopy_all_sum<10, data_type>(index,
+                                                              in_buffer,
+                                                              size,
+                                                              threads_already_processed,
+                                                              (void **)temp_buffer,
+                                                              temp_rank,
+                                                              size_per_buffer_kernel,
+                                                              buffer_index_kernel);
+                                break;
+                            case 12:
+                                nocopy_all_sum<12, data_type>(index,
+                                                              in_buffer,
+                                                              size,
+                                                              threads_already_processed,
+                                                              (void **)temp_buffer,
+                                                              temp_rank,
+                                                              size_per_buffer_kernel,
+                                                              buffer_index_kernel);
+                                break;
+                            case 14:
+                                nocopy_all_sum<14, data_type>(index,
+                                                              in_buffer,
+                                                              size,
+                                                              threads_already_processed,
+                                                              (void **)temp_buffer,
+                                                              temp_rank,
+                                                              size_per_buffer_kernel,
+                                                              buffer_index_kernel);
+                                break;
+                            case 16:
+                                nocopy_all_sum<16, data_type>(index,
+                                                              in_buffer,
+                                                              size,
+                                                              threads_already_processed,
+                                                              (void **)temp_buffer,
+                                                              temp_rank,
+                                                              size_per_buffer_kernel,
+                                                              buffer_index_kernel);
+                                break;
+                            default: break;
+                        }
+                    }
+
+                    });//parallel_for
+            }); //submit()
+            //e.wait();
+
+            //sync all the ranks here before consuming the results.
+            int reset = (outer_iter == outerloop_iter_count - 1) ? 0 : 1;
+            e = global_sync(queue,
+                            temp_rank,
+                            temp_world,
+                            size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                            3,
+                            reset);
+            //e.wait();
+
+            //copy the results to all the ranks.
+            e = queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class NoCopyKernel_gather_from_remote_and_dist_to_rank_pair<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= total_threads_needed)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    nocopy_gather_from_remote_and_dist_to_rank_pair<2, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        (void **)out_buffers,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 4:
+                                    nocopy_gather_from_remote_and_dist_to_rank_pair<4, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        (void **)out_buffers,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 6:
+                                    nocopy_gather_from_remote_and_dist_to_rank_pair<6, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        (void **)out_buffers,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 8:
+                                    nocopy_gather_from_remote_and_dist_to_rank_pair<8, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        (void **)out_buffers,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 10:
+                                    nocopy_gather_from_remote_and_dist_to_rank_pair<10, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        (void **)out_buffers,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 12:
+                                    nocopy_gather_from_remote_and_dist_to_rank_pair<12, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        (void **)out_buffers,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 14:
+                                    nocopy_gather_from_remote_and_dist_to_rank_pair<14, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        (void **)out_buffers,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                case 16:
+                                    nocopy_gather_from_remote_and_dist_to_rank_pair<16, data_type>(
+                                        (int *)even_ranks,
+                                        index,
+                                        (void **)out_buffers,
+                                        size,
+                                        threads_already_processed,
+                                        (void **)temp_buffer,
+                                        temp_rank,
+                                        size_per_buffer_kernel,
+                                        buffer_index_kernel);
+                                    break;
+                                default: break;
+                            }
+                        }
+                        });//parallel_for
+            }); //submit()
+            //e.wait();
+
+            if (outer_iter == outerloop_iter_count - 1) {
+                // sync two tiles of a same GPU before exiting the call
+                e = global_sync(queue,
+                                temp_rank,
+                                temp_world,
+                                size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                                4,
+                                1);
+            }
+
+            buffer_index_kernel++;
+            buffer_index_kernel &= 1;
+            threads_already_processed += total_threads_needed;
+        } //for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++)
+
+        allreduce_medium_buffer_index += outerloop_iter_count;
+        allreduce_medium_buffer_index &= 1;
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // perform IPC exchange every time (2 rank version)
+    // Medium-size allreduce, nocopy path, specialized for world == 2 (the two
+    // tiles of one GPU). Peer IPC buffer exchange is redone on every call;
+    // one fused kernel reduces and writes the result, then the tile pair is
+    // synchronized before returning.
+    //   queue      : SYCL queue all kernels are submitted to
+    //   in_buffer  : device send buffer of `size` elements of data_type
+    //   out_buffer : device receive buffer of `size` elements of data_type
+    // Returns a ccl::event wrapping the last submitted native SYCL event.
+    ccl::event allreduce_nocopy_2rank(sycl::queue &queue,
+                                      const void *in_buffer,
+                                      void *out_buffer,
+                                      uint32_t size) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+
+        //queue.wait();
+
+        // Exchange IPC handles so each rank can address the peer's user
+        // buffers directly (no staging through intermediate temp buffers).
+        void *in_buffers[2];
+        void *out_buffers[2];
+        this->exchange_peer_ipc_mem(queue,
+                                    global_comm,
+                                    global_stream,
+                                    (void **)in_buffer,
+                                    out_buffer,
+                                    rank,
+                                    world,
+                                    data_size_per_buffer,
+                                    (void **)in_buffers,
+                                    NULL,
+                                    NULL,
+                                    NULL,
+                                    (void **)out_buffers);
+
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        // Sync pointers are int*, so rescale the per-buffer stride to int units.
+        int size_per_buffer_for_sync_kernel =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        int buffer_index_kernel = allreduce_medium_buffer_index;
+
+        // a GPU barrier to make sure all ranks are ready
+        e = global_sync(queue,
+                        temp_rank,
+                        temp_world,
+                        size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                        0,
+                        0);
+
+        // One logical thread covers SIMD_COMPUTE * temp_world elements.
+        uint32_t total_threads_needed;
+        total_threads_needed = (size + SIMD_COMPUTE * temp_world - 1) / (SIMD_COMPUTE * temp_world);
+        int wg_size = 1;
+
+        // Work is strided across at most HW_THREAD_COUNT persistent threads;
+        // each persistent thread loops innerloop_iter_count times.
+        int innerloop_iter_count = (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+
+        uint32_t persist_threads_needed = total_threads_needed;
+        if (persist_threads_needed > HW_THREAD_COUNT)
+            persist_threads_needed = HW_THREAD_COUNT;
+
+        //local reduction kernel
+        // Fused 2-rank reduce: each index processes one chunk via nocopy_2rank,
+        // reading both ranks' inputs and writing both ranks' outputs directly.
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class NoCopyKernel_2rank<data_type>>(
+                sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                {
+                uint32_t idx = idx2.get_global_id();
+                for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                    int index = idx + inner_iter * HW_THREAD_COUNT;
+                    if ((uint32_t)index >= total_threads_needed)
+                        break;
+
+                    nocopy_2rank<data_type>(
+                        index, (void **)in_buffers, size, (void **)out_buffers, temp_rank);
+                }
+
+                });//parallel_for
+        }); //submit()
+        //e.wait();
+
+        // sync two tiles of a same GPU before exiting the call
+        // (index 4, reset=1 so the counter slot is zeroed for the next run)
+        e = local_sync(queue,
+                       temp_rank,
+                       temp_world,
+                       size_per_buffer_for_sync_kernel * buffer_index_kernel,
+                       4,
+                       1);
+
+        allreduce_medium_buffer_index++; // account for the local sync
+        allreduce_medium_buffer_index &= 1;
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // Release resources held by this allreduce instance: close every peer's
+    // IPC-mapped medium buffer, free this rank's own buffer, and mark the
+    // instance uninitialized so init() must run again before reuse.
+    void release(sycl::queue &queue) {
+        // Clean up, close/put ipc handles, free memory, etc.
+        auto l0_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(queue.get_context());
+        for (int i = 0; i < world; i++) {
+            if (i != rank) {
+                // The IPC mapping covers the peer allocation base; subtract the
+                // recorded per-rank offset to recover the pointer to close.
+                ZE_CALL(
+                    zeMemCloseIpcHandle,
+                    (l0_ctx, (char *)allreduce_medium_buffers[i] - allreduce_medium_offsets[i]));
+            }
+        }
+
+        sycl::free(allreduce_medium_buffers[rank], queue);
+        this->initialized = false;
+    }
+
+    //sync all the ranks here before consuming the results.
+    // Cross-rank GPU barrier built on atomic counters stored in each rank's
+    // sync buffer. Every rank increments counter[index] on ALL ranks, then
+    // spins on its own copy until it reaches temp_world.
+    //   offset : int-offset into the sync buffer (selects the double-buffer slot)
+    //   index  : which of the SIMD_SYNC counter lanes to use, so consecutive
+    //            barriers within one iteration use distinct counters
+    //   reset  : nonzero -> zero the counters afterwards so the slot is reusable
+    sycl::event global_sync(sycl::queue queue,
+                            int temp_rank,
+                            uint32_t temp_world,
+                            int offset,
+                            int index,
+                            int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        // Snapshot the sync-buffer pointers so the kernel lambda captures a
+        // plain array by value rather than a member of `this`.
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = allreduce_medium_sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1; // single thread performs the barrier
+        int wg_size = 1;
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class AllreduceMediumKernel_GlobalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                //ESIMD kernel
+                // ramp: byte offsets of the SIMD_SYNC int counters.
+                simd<ushort, SIMD_SYNC> ramp;
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //since other ranks might still be doing local_sum, we need to sync ranks here.
+                //After the sync is done, the second half of the temp buffer will be replaced with new sum val.
+                // pred masks the atomic to the single counter lane `index`.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true;
+
+                //sync: bump this barrier's counter on every rank.
+                for (uint32_t i = 0; i < temp_world; i++) {
+                    int *sync_ptr = (int *)temp_sync_buffer[i] + offset;
+                    ////never true. Used to force dependency with prev kernel
+                    //if (total_threads_needed_sync == 0x7fffffff)
+                    //    sync_ptr = temp_buffer[0];
+                    lsc_atomic_update<atomic_op::inc,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(sync_ptr, ramp, pred);
+                }
+
+                //wait for all the local TG to sync. Then sync the other remote GPUs
+                // Spin on this rank's own counter until every rank has arrived.
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                while (status0[index] != temp_world) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true; // clear ALL counter lanes, not just `index`
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    // sync tiles in a GPU
+    // Barrier between the two tiles of one GPU only: this rank and its pair
+    // (temp_rank ^ 1). Each side increments counter[index] on both sync
+    // buffers, then spins on its own copy until it reaches RANKS_PER_GPU.
+    //   offset : int-offset into the sync buffer (double-buffer slot)
+    //   index  : which of the SIMD_SYNC counter lanes to use
+    //   reset  : nonzero -> zero the counters afterwards for reuse
+    sycl::event local_sync(sycl::queue queue,
+                           int temp_rank,
+                           uint32_t temp_world,
+                           int offset,
+                           int index,
+                           int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        // Snapshot the sync-buffer pointers for by-value capture in the kernel.
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = allreduce_medium_sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1; // single thread performs the barrier
+        int wg_size = 1;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class AllreduceMediumKernel_LocalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                //ESIMD kernel
+                // ramp: byte offsets of the SIMD_SYNC int counters.
+                simd<ushort, SIMD_SYNC> ramp;
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //sync only the rank pair within the same gpu.
+                // pred masks the atomic to the single counter lane `index`.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true;
+
+                //sync: bump the counter on the sibling tile (temp_rank ^ 1) and on self.
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank ^ 1] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+                sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+
+                //wait for all the local TG to sync. Then sync the other remote GPUs
+                // Spin on our own counter until both tiles have arrived.
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                while (status0[index] != RANKS_PER_GPU) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true; // clear ALL counter lanes, not just `index`
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+private:
+    // Communicator coordinates; sentinel-initialized, presumably populated by
+    // init() before any collective runs — TODO confirm against init().
+    int rank{ ccl::utils::invalid_rank }, world{ ccl::utils::invalid_err_code };
+    // Temp-buffer slot size in bytes (invalid until initialized).
+    int size_per_buffer{ ccl::utils::invalid_bytes_value };
+    // Payload bytes exchanged per buffer; passed to exchange_peer_ipc_mem().
+    int data_size_per_buffer{ ccl::utils::invalid_bytes_value };
+    // Stream/communicator handles used for the IPC exchange; owned elsewhere.
+    ccl_stream *global_stream{};
+    ccl_comm *global_comm{};
+    ccl_comm *even_comm{};
+};
+
+// Generates the per-datatype entry points for the medium allreduce:
+//   init_allreduce_medium_<TYPE>() — lazily initializes the global
+//       sycl_allreduce_medium instance ar_medium_<TYPE> on first use
+//   run_allreduce_medium_<TYPE>()  — forwards to that instance's allreduce()
+// Instantiated once per supported datatype (see the *_sycl_<type>.cpp files
+// that define ar_medium_<TYPE> and invoke this macro).
+#define ALLREDUCE_MEDIUM_API(TYPE) \
+    void init_allreduce_medium_##TYPE(ccl::datatype dtype, \
+                                      sycl::queue &queue, \
+                                      ccl_comm *comm, \
+                                      ccl_stream *stream, \
+                                      uint32_t rank_in, \
+                                      uint32_t world_in) { \
+        if (!ar_medium_##TYPE.inited()) { \
+            LOG_INFO("invoking medium allreduce first time for datatype: ", dtype); \
+            ar_medium_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+    } \
+\
+    ccl::event run_allreduce_medium_##TYPE( \
+        ccl::datatype dtype, sycl::queue queue, const void *in_buf, void *out_buf, size_t count) { \
+        return ar_medium_##TYPE.allreduce(queue, in_buf, out_buf, count); \
+    }
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_bf16.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_bf16.cpp
new file mode 100644
index 000000000..f6854ff53
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_bf16.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_medium_sycl.hpp"
+
+// Global medium-allreduce instance for bfloat16; the public sycl::ext::oneapi
+// spelling is used instead of the implementation-detail sycl::_V1 namespace.
+sycl_allreduce_medium<sycl::ext::oneapi::bfloat16> ar_medium_bf16;
+
+ALLREDUCE_MEDIUM_API(bf16);
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_fp16.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_fp16.cpp
new file mode 100644
index 000000000..8ed20aacb
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_fp16.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_medium_sycl.hpp"
+
+// Global medium-allreduce instance for fp16; ALLREDUCE_MEDIUM_API emits its init/run entry points.
+sycl_allreduce_medium<sycl::half> ar_medium_fp16;
+
+ALLREDUCE_MEDIUM_API(fp16);
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_fp32.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_fp32.cpp
new file mode 100644
index 000000000..3bf091d27
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_fp32.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_medium_sycl.hpp"
+
+// Global medium-allreduce instance for fp32; ALLREDUCE_MEDIUM_API emits its init/run entry points.
+sycl_allreduce_medium<float> ar_medium_fp32;
+
+ALLREDUCE_MEDIUM_API(fp32);
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_int32.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_int32.cpp
new file mode 100644
index 000000000..c0e752a89
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_medium_sycl_int32.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_medium_sycl.hpp"
+
+// Global medium-allreduce instance for int32; ALLREDUCE_MEDIUM_API emits its init/run entry points.
+sycl_allreduce_medium<int> ar_medium_int32;
+
+ALLREDUCE_MEDIUM_API(int32);
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl.cpp
new file mode 100644
index 000000000..56188d121
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl.cpp
@@ -0,0 +1,83 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/allreduce/sycl/allreduce_small_sycl.hpp"
+
+#define MAX_RANK 16
+
+// Process-wide buffers shared by all datatype-specialized small-allreduce instances.
+void *allreduce_small_buffer = NULL;
+void *allreduce_small_buffers[MAX_RANK];
+void *allreduce_small_sync_buffer[MAX_RANK];
+size_t allreduce_small_offsets[MAX_RANK];
+ze_ipc_mem_handle_t allreduce_small_ipc_handle[MAX_RANK];
+int allreduce_small_buffer_index = 0;
+
+// Lazily initialize the allreducer instance for one datatype case label.
+#define SWITCH_INIT_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        if (!ar_small_##TYPE.inited()) { \
+            LOG_INFO("invoking small allreduce first time for datatype: ", ccl_type); \
+            ar_small_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+        break;
+
+#define SWITCH_RUN_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        e = ar_small_##TYPE.allreduce(queue, in_buf, out_buf, dtype, count); \
+        break;
+
+// Report an unsupported datatype; #TYPE keeps the message in sync with the case label.
+#define SWITCH_TYPE_UNSUPPORTED(TYPE, ccl_type) \
+    case ccl_type: \
+        fprintf(stderr, "allreduce with " #TYPE " not supported!\n"); \
+        CCL_THROW("allreduce with " #TYPE " not supported"); \
+        break;
+
+sycl_allreducer_small<sycl::half> ar_small_fp16;
+sycl_allreducer_small<sycl::ext::oneapi::bfloat16> ar_small_bf16;
+sycl_allreducer_small<float> ar_small_fp32;
+sycl_allreducer_small<int> ar_small_int32;
+
+void init_allreduce_small(ccl::datatype dtype,
+                          sycl::queue &queue,
+                          ccl_comm *comm,
+                          ccl_stream *stream,
+                          uint32_t rank_in,
+                          uint32_t world_in) {
+    switch (dtype) {
+        SWITCH_INIT_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_INIT_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_INIT_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_INIT_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for allreduce"); assert(0);
+    }
+}
+
+ccl::event run_allreduce_small(ccl::datatype dtype,
+                               sycl::queue queue,
+                               const void *in_buf,
+                               void *out_buf,
+                               size_t count) {
+    ccl::event e;
+    switch (dtype) {
+        SWITCH_RUN_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_RUN_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_RUN_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_RUN_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for allreduce"); assert(0);
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl.hpp b/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl.hpp
new file mode 100644
index 000000000..ff7d7633a
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl.hpp
@@ -0,0 +1,1213 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+#define MAX_REPETITION                 16
+#define SIMD_MAX                       256
+#define SIMD                           (SIMD_MAX / sizeof(data_type))
+#define SIMD_ATOMIC                    16
+#define MAX_RANK                       16
+#define UNROLL_SIZE                    1
+#define TRIPLE_BUFFER                  3
+#define SYNC_BYTE                      (SIMD_ATOMIC * sizeof(int) * 2)
+#define ALIGNMENT_BYTE                 256
+#define EU_COUNT                       512
+#define THREADS_PER_EU                 8
+#define MAX_THREAD                     (EU_COUNT * THREADS_PER_EU)
+#define MAX_COUNT                      (SIMD * UNROLL_SIZE * kernel_inner_loop * MAX_THREAD)
+#define LOOP_COUNT_LIMIT               (1000000)
+#define DEBUG_DATA_SIZE                16
+#define DEBUG_THREAD_COUNT             2
+#define DEBUG_DUMP_TO_DEDICATED_OFFSET 1
+#define DEBUG                          0
+#define TEST_REP                       50
+#define INIT_SIZE                      64
+#define INIT_COUNT                     1
+#define SIMD_INIT                      (INIT_SIZE * INIT_COUNT)
+#define SMALLEST_NORM_FP16             0.00006103515625
+
+extern void *allreduce_small_buffer;
+extern void *allreduce_small_buffers[MAX_RANK];
+extern void *allreduce_small_sync_buffer[MAX_RANK];
+extern size_t allreduce_small_offsets[MAX_RANK];
+extern ze_ipc_mem_handle_t allreduce_small_ipc_handle[MAX_RANK];
+extern int allreduce_small_buffer_index;
+
+const int kernel_inner_loop = 1;
+const int kernel_inner_loop_scalar = 4;
+
+template <typename data_type, uint32_t N>
+ESIMD_INLINE void reduce_kernel(void **temp_buffer,
+                                int buf_offset,
+                                int offset,
+                                data_type result[]) {
+    data_type staged[N][kernel_inner_loop_scalar];
+#pragma unroll
+    for (uint32_t rank_idx = 0; rank_idx < N; rank_idx++) {
+        data_type *src = (data_type *)(temp_buffer[rank_idx]) + buf_offset + offset;
+        gpu_kernel_copy(
+            (char *)staged[rank_idx], (const char *)src, kernel_inner_loop_scalar * sizeof(data_type));
+    }
+    gpu_kernel_copy(
+        (char *)result, (const char *)staged[0], kernel_inner_loop_scalar * sizeof(data_type));
+#pragma unroll
+    for (uint32_t rank_idx = 1; rank_idx < N; rank_idx++) {
+        for (int elem = 0; elem < kernel_inner_loop_scalar; elem++)
+            result[elem] += staged[rank_idx][elem];
+    }
+}
+
+template <typename dtype>
+class Allreduce_small_kernel;
+template <typename dtype>
+class Allreduce_small_kernel_scalar;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK, uint32_t max_buffer = 1024 /*KB*/>
+class sycl_allreducer_small : public sycl_coll_base<data_type> {
+public:
+    sycl_allreducer_small() : sycl_coll_base<data_type>() {
+        size_per_buffer = 0;
+        data_size_per_buffer = 0;
+    }
+
+    void init(sycl::queue &queue,
+              ccl_comm *comm,
+              ccl_stream *stream,
+              uint32_t rank_in,
+              uint32_t world_in) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+        rank = rank_in;
+        world = world_in;
+        // temporary buffer used internally by allreduce only.
+        data_size_per_buffer = ((MAX_COUNT + SIMD * UNROLL_SIZE * kernel_inner_loop - 1) /
+                                (SIMD * UNROLL_SIZE * kernel_inner_loop)) *
+                               SIMD * UNROLL_SIZE * kernel_inner_loop;
+        data_size_per_buffer =
+            ((data_size_per_buffer * sizeof(data_type) + ALIGNMENT_BYTE - 1) / ALIGNMENT_BYTE) *
+            ALIGNMENT_BYTE / sizeof(data_type); //aligned size
+        data_size_per_buffer *= 2;
+        size_per_buffer = data_size_per_buffer * sizeof(data_type) + SYNC_BYTE;
+
+        if (allreduce_small_buffer == NULL) {
+            allreduce_small_buffer_index = 0;
+            allreduce_small_buffer = sycl::malloc_device(size_per_buffer * TRIPLE_BUFFER, queue);
+
+            auto e = queue.memset(allreduce_small_buffer, 0, size_per_buffer * TRIPLE_BUFFER);
+            e.wait();
+            this->exchange_peer_ipc_mem(queue,
+                                        comm,
+                                        stream,
+                                        allreduce_small_buffer,
+                                        NULL,
+                                        rank,
+                                        world,
+                                        data_size_per_buffer * sizeof(data_type),
+                                        (void **)allreduce_small_buffers,
+                                        (void **)allreduce_small_sync_buffer,
+                                        allreduce_small_offsets,
+                                        allreduce_small_ipc_handle,
+                                        NULL,
+                                        NULL /* mmap_buffers */,
+                                        false /* to_cache */);
+
+            int wg_size = 1;
+            //dummy kernel to avoid hang. The hang happens when there is no dummy kernel and allreduce() is called right after init().
+            e = queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for(sycl::nd_range<1>({ 1 }, wg_size),
+                                 [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL {
+
+                                 });
+            });
+            e.wait();
+        }
+
+        this->initialized = true;
+
+        global_comm = comm;
+        even_comm = global_comm->get_even_comm().get();
+    }
+
+    ccl::event allreduce(sycl::queue &queue,
+                         const void *in_buffer,
+                         void *out_buffer,
+                         ccl::datatype dtype,
+                         uint32_t size) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+
+        //if (size * sizeof(data_type) <= 4096) {
+        if (size * sizeof(data_type) <= 65536) {
+            e = allreduce_scalar(queue, in_buffer, out_buffer, size);
+            return ccl::event::create_from_native(e);
+        }
+
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = allreduce_small_buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = allreduce_small_sync_buffer[i];
+        }
+        uint32_t total_threads_needed = (size + SIMD * UNROLL_SIZE * kernel_inner_loop - 1) /
+                                        (SIMD * UNROLL_SIZE * kernel_inner_loop); //ceiling
+        const uint32_t wg_size = 16;
+        int size_per_buffer_kernel = size_per_buffer;
+        uint32_t total_threads_dispatched =
+            (total_threads_needed + wg_size - 1) / wg_size * wg_size;
+        uint32_t __attribute__((unused)) total_wg_count = total_threads_dispatched / wg_size;
+
+        int buffer_index_kernel = allreduce_small_buffer_index;
+        allreduce_small_buffer_index++;
+        allreduce_small_buffer_index %= TRIPLE_BUFFER;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class Allreduce_small_kernel<data_type>>(
+                sycl::nd_range<1>({ total_threads_dispatched }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL{
+                uint32_t idx = idx2.get_global_id();
+
+                //ESIMD kernel
+                uint offset = idx * SIMD * UNROLL_SIZE * kernel_inner_loop;
+                simd<data_type, max_rank * SIMD * UNROLL_SIZE> buffer; //64 registers
+                simd<data_type, SIMD * UNROLL_SIZE> buffer_small;
+                simd<ushort, SIMD_ATOMIC> ramp;
+                simd_mask<SIMD_ATOMIC> pred;
+                simd<int, SIMD_ATOMIC> status0;
+                int *local_sync_ptr;
+
+                //to do:
+                //O3 compiler optimization: not much difference after the change.
+                //tune the fence: good perf improvements
+                //tune the cacheability for each IO message: no noticeable improvement
+                //tune the thread size: not much improvements
+                //tune the polling freq
+
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_ATOMIC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //process the input only if the thread is useful
+                if (idx < total_threads_needed) {
+                    //do copy from input buffer to temp buffer.
+                    for (int i = 0; i < kernel_inner_loop; i++) {
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            buffer_small.template select<SIMD, 1>(unroll_i * SIMD) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::cached,
+                                               cache_hint::cached>((data_type *)in_buffer + offset +
+                                                                   unroll_i * SIMD +
+                                                                   i * SIMD * UNROLL_SIZE);
+                        }
+
+                        //use the temp buffer for the current rank to copy the data to.
+                        data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+                        local_temp_ptr +=
+                            (buffer_index_kernel * size_per_buffer_kernel /
+                             sizeof(
+                                 data_type)); //point to the correct buffer inside the triple buffer
+
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            lsc_block_store<data_type,
+                                            SIMD,
+                                            lsc_data_size::default_size,
+                                            cache_hint::uncached,
+                                            cache_hint::uncached>(
+                                (data_type *)local_temp_ptr + offset + unroll_i * SIMD +
+                                    i * SIMD * UNROLL_SIZE,
+                                buffer_small.template select<SIMD, 1>(unroll_i * SIMD));
+                        }
+                    }
+                    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+                }
+
+                //since each thread copies only a small chunk of data to the temp buffer, all threads need to sync globally using atomics within this rank
+
+                //sync locally within local GPU first.
+                local_sync_ptr = (int *)temp_sync_buffer
+                    [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                local_sync_ptr += (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+
+                //if more than one thread is required per rank, then do the local sync within the rank first.
+                if (total_threads_needed > 1) {
+                    //do local sync in two steps. First using TG barrier. Then global L3 atomics.
+                    uint32_t local_tid = idx2.get_local_linear_id();
+
+                    pred = false;
+                    pred[0] = true;
+                    if (local_tid == 0) {
+                        status0 = lsc_atomic_update<atomic_op::inc,
+                                                    int,
+                                                    SIMD_ATOMIC,
+                                                    lsc_data_size::default_size,
+                                                    cache_hint::none,
+                                                    cache_hint::none>(local_sync_ptr, ramp, pred);
+                        //wait for all the local TG to sync. Then sync the other remote GPUs
+                        while (status0[0] != total_wg_count) {
+                            status0 =
+                                lsc_atomic_update<atomic_op::load,
+                                                  int,
+                                                  SIMD_ATOMIC,
+                                                  lsc_data_size::default_size,
+                                                  cache_hint::none,
+                                                  cache_hint::none>(local_sync_ptr, ramp, pred);
+                        }
+                    }
+                    idx2.barrier();
+                }
+
+                //once the local level sync is done, atomically write its counter to other remote gpus' atomic counter
+                pred = false;
+                pred[1] = true; //use different lane for the remote gpu sync
+                if (total_threads_dispatched >= temp_world) {
+                    if (idx < temp_world) {
+                        int *sync_ptr;
+
+                        // DEBUG: rank3 seems to have some problem where its cpu time is always ~100us
+                        // to debug this, change the way inter-gpu are synced.
+
+                        status0 = total_threads_needed;
+                        sync_ptr = (int *)temp_sync_buffer
+                            [idx]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        sync_ptr += (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        lsc_atomic_update<atomic_op::add,
+                                          int,
+                                          SIMD_ATOMIC,
+                                          lsc_data_size::default_size,
+                                          cache_hint::none,
+                                          cache_hint::none>(sync_ptr, ramp, status0, pred);
+                    }
+                }
+                else if (idx ==
+                         0) //one thread in the local gpu notifies the remote gpu of its status.
+                {
+                    status0 = total_threads_needed;
+                    for (uint32_t i = 0; i < temp_world; i++) {
+                        int *sync_ptr = (int *)temp_sync_buffer
+                            [i]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        sync_ptr += (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        lsc_atomic_update<atomic_op::add,
+                                          int,
+                                          SIMD_ATOMIC,
+                                          lsc_data_size::default_size,
+                                          cache_hint::none,
+                                          cache_hint::none>(sync_ptr, ramp, status0, pred);
+                    }
+                }
+
+                //once the local sync is done, retire useless threads
+                if (idx >= total_threads_needed) {
+                    return;
+                }
+
+                //wait for completion of the atomic sync
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_ATOMIC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(local_sync_ptr, ramp, pred);
+                while (status0[1] != total_threads_needed * temp_world) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_ATOMIC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(local_sync_ptr, ramp, pred);
+                }
+
+                //reset the sync counter for the next allreduce session. Each rank resets its own buffer
+                if (idx == 0) //one thread in the local gpu notifies the remote gpu of its status.
+                {
+                    int buffer_index_to_reset =
+                        (buffer_index_kernel + TRIPLE_BUFFER - 1) % TRIPLE_BUFFER;
+                    status0 = 0;
+                    pred = true;
+                    local_sync_ptr = (int *)temp_sync_buffer
+                        [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                    local_sync_ptr +=
+                        (buffer_index_to_reset * size_per_buffer_kernel / sizeof(int));
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_ATOMIC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        local_sync_ptr, ramp, status0, pred); //reset the first half of sync buffer
+                }
+
+                //at this point, all the threads are done copying data from input buffer to temp buffer.
+                //do All reduce
+                simd<data_type, SIMD * UNROLL_SIZE> result;
+                for (int i = 0; i < kernel_inner_loop; i++) {
+                    if (temp_world == 2) {
+                        simd<data_type, 2 * SIMD * UNROLL_SIZE> buffer2; //64 registers
+                        int *peer_ptr0 =
+                            ((int *)temp_buffer[0]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr1 =
+                            ((int *)temp_buffer[1]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            buffer2.template select<SIMD, 1>(unroll_i * SIMD +
+                                                             0 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr0 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer2.template select<SIMD, 1>(unroll_i * SIMD +
+                                                             1 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr1 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                        }
+                        //do the actual reduction
+                        result = buffer2.template select<SIMD * UNROLL_SIZE, 1>(0);
+                        result = result +
+                                 buffer2.template select<SIMD * UNROLL_SIZE, 1>(SIMD * UNROLL_SIZE);
+                    }
+                    else if (temp_world == 4) {
+                        int *peer_ptr0 =
+                            ((int *)temp_buffer[0]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr1 =
+                            ((int *)temp_buffer[1]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr2 =
+                            ((int *)temp_buffer[2]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr3 =
+                            ((int *)temp_buffer[3]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            0 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr0 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            1 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr1 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            2 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr2 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            3 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr3 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                        }
+                        //do the actual reduction
+                        result = 0;
+#pragma unroll
+                        for (int r = 0; r < 4; r++) {
+                            //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            result = result + buffer.template select<SIMD * UNROLL_SIZE, 1>(
+                                                  r * SIMD * UNROLL_SIZE);
+                        }
+                    }
+                    else if (temp_world == 8) {
+                        int *peer_ptr0 =
+                            ((int *)temp_buffer[0]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr1 =
+                            ((int *)temp_buffer[1]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr2 =
+                            ((int *)temp_buffer[2]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr3 =
+                            ((int *)temp_buffer[3]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr4 =
+                            ((int *)temp_buffer[4]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr5 =
+                            ((int *)temp_buffer[5]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr6 =
+                            ((int *)temp_buffer[6]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr7 =
+                            ((int *)temp_buffer[7]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            0 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr0 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            1 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr1 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            2 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr2 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            3 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr3 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            4 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr4 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            5 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr5 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            6 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr6 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            7 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr7 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                        }
+                        //do the actual reduction
+                        result = 0;
+#pragma unroll
+                        for (int r = 0; r < 8; r++) {
+                            //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            result = result + buffer.template select<SIMD * UNROLL_SIZE, 1>(
+                                                  r * SIMD * UNROLL_SIZE);
+                        }
+                    }
+                    else if (temp_world == 12) {
+                        int *peer_ptr0 =
+                            ((int *)temp_buffer[0]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr1 =
+                            ((int *)temp_buffer[1]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr2 =
+                            ((int *)temp_buffer[2]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr3 =
+                            ((int *)temp_buffer[3]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr4 =
+                            ((int *)temp_buffer[4]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr5 =
+                            ((int *)temp_buffer[5]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr6 =
+                            ((int *)temp_buffer[6]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr7 =
+                            ((int *)temp_buffer[7]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr8 =
+                            ((int *)temp_buffer[8]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr9 =
+                            ((int *)temp_buffer[9]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr10 =
+                            ((int *)temp_buffer[10]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr11 =
+                            ((int *)temp_buffer[11]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            0 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr0 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            1 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr1 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            2 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr2 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            3 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr3 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            4 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr4 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            5 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr5 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            6 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr6 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            7 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr7 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            8 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr8 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            9 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr9 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            10 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr10 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            11 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr11 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                        }
+                        //do the actual reduction
+                        result = 0;
+#pragma unroll
+                        for (int r = 0; r < 12; r++) {
+                            //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            result = result + buffer.template select<SIMD * UNROLL_SIZE, 1>(
+                                                  r * SIMD * UNROLL_SIZE);
+                        }
+                    }
+                    else if (temp_world == 16) {
+                        //first 8 ranks processing
+                        int *peer_ptr0 =
+                            ((int *)temp_buffer[0]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr1 =
+                            ((int *)temp_buffer[1]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr2 =
+                            ((int *)temp_buffer[2]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr3 =
+                            ((int *)temp_buffer[3]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr4 =
+                            ((int *)temp_buffer[4]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr5 =
+                            ((int *)temp_buffer[5]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr6 =
+                            ((int *)temp_buffer[6]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr7 =
+                            ((int *)temp_buffer[7]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        //second 8 ranks processing
+                        int *peer_ptr8 =
+                            ((int *)temp_buffer[8]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr9 =
+                            ((int *)temp_buffer[9]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr10 =
+                            ((int *)temp_buffer[10]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr11 =
+                            ((int *)temp_buffer[11]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr12 =
+                            ((int *)temp_buffer[12]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr13 =
+                            ((int *)temp_buffer[13]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr14 =
+                            ((int *)temp_buffer[14]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                        int *peer_ptr15 =
+                            ((int *)temp_buffer[15]) +
+                            (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+
+#pragma unroll
+                        for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            0 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr0 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            1 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr1 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            2 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr2 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            3 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr3 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            4 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr4 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            5 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr5 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            6 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr6 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            7 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr7 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            8 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr8 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            9 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr9 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            10 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr10 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            11 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr11 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            12 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr12 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            13 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr13 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            14 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr14 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                            buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                            15 * SIMD * UNROLL_SIZE) =
+                                lsc_block_load<data_type,
+                                               SIMD,
+                                               lsc_data_size::default_size,
+                                               cache_hint::uncached,
+                                               cache_hint::uncached>((data_type *)peer_ptr15 +
+                                                                     offset + unroll_i * SIMD +
+                                                                     i * SIMD * UNROLL_SIZE);
+                        }
+                        //do the actual reduction
+                        result = 0;
+#pragma unroll
+                        for (int r = 0; r < 16; r++) {
+                            //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            result = result + buffer.template select<SIMD * UNROLL_SIZE, 1>(
+                                                  r * SIMD * UNROLL_SIZE);
+                        }
+                    }
+                    else //this is for 2,4,6 ranks. So there is no problem of overflowing the buffer.
+                    {
+                        for (uint32_t r = 0; r < temp_world; r++) {
+                            int *peer_ptr =
+                                ((int *)temp_buffer[r]) +
+                                (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+#pragma unroll
+                            for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD +
+                                                                r * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>((data_type *)peer_ptr +
+                                                                         offset + unroll_i * SIMD +
+                                                                         i * SIMD * UNROLL_SIZE);
+                            }
+                        }
+                        //do the actual reduction
+                        result = 0;
+                        for (uint32_t r = 0; r < temp_world; r++) {
+                            //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            result = result + buffer.template select<SIMD * UNROLL_SIZE, 1>(
+                                                  r * SIMD * UNROLL_SIZE);
+                        }
+                    }
+
+                    //write out the results
+#pragma unroll
+                    for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                        lsc_block_store<data_type,
+                                        SIMD,
+                                        lsc_data_size::default_size,
+                                        cache_hint::write_back,
+                                        cache_hint::write_back>(
+                            (data_type *)out_buffer + offset + unroll_i * SIMD +
+                                i * SIMD * UNROLL_SIZE,
+                            result.template select<SIMD, 1>(unroll_i * SIMD));
+                    }
+                }
+
+                });
+        });
+        //e.wait();
+
+        return ccl::event::create_from_native(e);
+    }
+
+private:
+    sycl::event allreduce_scalar(sycl::queue &queue,
+                                 const void *in_buffer,
+                                 void *out_buffer,
+                                 uint32_t size) {
+        sycl::event e;
+
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = allreduce_small_buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = allreduce_small_sync_buffer[i];
+        }
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+
+        uint32_t max_wg_size __attribute__((unused)) =
+            queue.get_device().get_info<cl::sycl::info::device::max_work_group_size>(); // 1024
+        const uint32_t wg_size = 16;
+        assert(wg_size <= max_wg_size);
+
+        uint32_t total_threads_needed =
+            (size + kernel_inner_loop_scalar - 1) / kernel_inner_loop_scalar;
+        uint32_t total_threads_dispatched =
+            (total_threads_needed + wg_size - 1) / wg_size * wg_size;
+        uint32_t total_wg_count = total_threads_dispatched / wg_size;
+
+        int buffer_index_kernel = allreduce_small_buffer_index;
+        allreduce_small_buffer_index++;
+        allreduce_small_buffer_index %= TRIPLE_BUFFER;
+
+        // pure scalar kernel
+        e = queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for<class Allreduce_small_kernel_scalar<data_type>>(
+                    sycl::nd_range<1>( total_threads_dispatched, wg_size), [=](sycl::nd_item<1> idx2) [[intel::reqd_sub_group_size(wg_size)]] {
+                    uint32_t idx = idx2.get_global_id();
+                    uint32_t offset __attribute__((unused)) = idx * kernel_inner_loop_scalar;
+
+                    //to do:
+                    //O3 compiler optimization: not much difference after the change.
+                    //tune the fence: good perf improvements
+                    //tune the cacheability for each IO message: no noticeable improvement
+                    //tune the thread size: not much improvements
+                    //tune the polling freq
+
+                    if (idx < total_threads_needed) {
+                        //do copy from input buffer to temp buffer.
+                        data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+                        local_temp_ptr +=
+                            (buffer_index_kernel *
+                             size_per_buffer_kernel); //point to the correct buffer inside the triple buffer
+                        gpu_kernel_copy((char *)(local_temp_ptr + offset),
+                                        (const char *)((data_type *)in_buffer + offset),
+                                        kernel_inner_loop_scalar * sizeof(data_type));
+                        //since each threads are copying small chunks of data to temp buffer, all the threads needs to sync globally using atomics within this rank
+                    }
+
+                    int *local_sync_ptr = (int *)temp_sync_buffer[temp_rank] +
+                                          (buffer_index_kernel * size_per_buffer_for_sync_kernel);
+                    //if there are more than 1 threads required per rank, then do the local sync within the rank first.
+                    uint32_t local_tid = idx2.get_local_linear_id();
+                    if (total_threads_needed > 1) {
+                        //sync locally within local GPU first.
+                        if (local_tid == 0) {
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(local_sync_ptr[0]);
+                            atomic_p += 1;
+
+                            //wait for all the local TG to sync. Then sync the other remote GPUs
+                            uint32_t val = atomic_p.load();
+                            //sycl::_V1::ext::oneapi::experimental::printf("HERE in: rank%d sync: %p idx:%d val: %d %d\n", temp_rank, local_sync_ptr, idx, val, total_wg_count);
+                            while (val < total_wg_count) {
+                                val = atomic_p.load();
+                            }
+                        }
+                        //idx2.barrier();
+                    }
+
+                    //once the local level sync is done, atomically write its counter to other remote gpus' atomic counter
+                    if (total_threads_dispatched >= temp_world) {
+                        if (idx < temp_world) {
+                            int *sync_ptr = (int *)temp_sync_buffer
+                                [idx]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                            sync_ptr += (buffer_index_kernel * size_per_buffer_for_sync_kernel);
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(sync_ptr[1]);
+                            atomic_p++;
+                        }
+                    }
+                    else if (idx ==
+                             0) //one thread in the local gpu notifies the remote gpu of its status.
+                    {
+                        for (uint32_t i = 0; i < temp_world; i++) {
+                            int *sync_ptr;
+
+                            sync_ptr = (int *)temp_sync_buffer
+                                [i]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                            sync_ptr += (buffer_index_kernel * size_per_buffer_for_sync_kernel);
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(sync_ptr[1]);
+                            atomic_p++;
+                        }
+                    }
+
+                    //once the local sync is done, retire useless threads
+                    if (idx >= total_threads_needed)
+                        return;
+
+                    //once all the local TGs are sync, do fence so that other GPU can see.
+                    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+
+                    //wait for completion of the atomic sync
+                    if (local_tid == 0) {
+                        sycl::atomic_ref<int,
+                                         sycl::memory_order::relaxed,
+                                         sycl::memory_scope::device,
+                                         sycl::access::address_space::global_space>
+                            atomic_p(local_sync_ptr[1]);
+                        uint32_t val = atomic_p.load();
+                        while (val < temp_world) {
+                            val = atomic_p.load();
+                        }
+                    }
+
+                    //reset the sync counter for the next allreduce session. Each rank reset's its own buffer
+                    if (idx ==
+                        0) //one thread in the local gpu notifies the remote gpu of its status.
+                    {
+                        int buffer_index_to_reset =
+                            (buffer_index_kernel + TRIPLE_BUFFER - 1) % TRIPLE_BUFFER;
+                        local_sync_ptr = (int *)temp_sync_buffer
+                            [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        local_sync_ptr += (buffer_index_to_reset * size_per_buffer_for_sync_kernel);
+                        local_sync_ptr[0] = local_sync_ptr[1] = 0;
+                    }
+
+                    //at this point, all the threads are done copying data from input buffer to temp buffer, do All reduce
+                    switch (temp_world) {
+                        case 2:
+                            reduce_kernel<data_type, 2>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                offset,
+                                (data_type *)out_buffer + offset);
+                            break;
+                        case 4:
+                            reduce_kernel<data_type, 4>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                offset,
+                                (data_type *)out_buffer + offset);
+                            break;
+                        case 6:
+                            reduce_kernel<data_type, 6>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                offset,
+                                (data_type *)out_buffer + offset);
+                            break;
+                        case 8:
+                            reduce_kernel<data_type, 8>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                offset,
+                                (data_type *)out_buffer + offset);
+                            break;
+                        case 10:
+                            reduce_kernel<data_type, 10>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                offset,
+                                (data_type *)out_buffer + offset);
+                            break;
+                        case 12:
+                            reduce_kernel<data_type, 12>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                offset,
+                                (data_type *)out_buffer + offset);
+                            break;
+                        case 14:
+                            reduce_kernel<data_type, 14>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                offset,
+                                (data_type *)out_buffer + offset);
+                            break;
+                        case 16:
+                            reduce_kernel<data_type, 16>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                offset,
+                                (data_type *)out_buffer + offset);
+                            break;
+                        default: assert(0);
+                    }
+                    });
+        });
+        //e.wait();
+        return e;
+    }
+
+private:
+    int rank{ ccl::utils::invalid_rank }, world{ ccl::utils::invalid_err_code };
+    int size_per_buffer{ ccl::utils::invalid_bytes_value };
+    int data_size_per_buffer{ ccl::utils::invalid_bytes_value };
+    ccl_stream *global_stream{};
+    ccl_comm *global_comm{};
+    ccl_comm *even_comm{};
+};
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl_noesimd.hpp b/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl_noesimd.hpp
new file mode 100644
index 000000000..5163a188f
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_small_sycl_noesimd.hpp
@@ -0,0 +1,710 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+#define MAX_REPETITION                 16
+#define SIMD                           128
+#define SIMD_ATOMIC                    16
+#define MAX_RANK                       16
+#define UNROLL_SIZE                    1
+#define TRIPLE_BUFFER                  3
+#define SYNC_BYTE                      (SIMD_ATOMIC * sizeof(int) * 2)
+#define ALIGNMENT_BYTE                 256
+#define EU_COUNT                       448
+#define THREADS_PER_EU                 8
+#define MAX_THREAD                     (EU_COUNT * THREADS_PER_EU)
+#define MAX_COUNT                      (SIMD * UNROLL_SIZE * kernel_inner_loop * MAX_THREAD)
+#define LOOP_COUNT_LIMIT               (1000000)
+#define DEBUG_DATA_SIZE                16
+#define DEBUG_THREAD_COUNT             2
+#define DEBUG_DUMP_TO_DEDICATED_OFFSET 1
+#define DEBUG                          0
+
+const int kernel_inner_loop = 1;
+const uint32_t vec_size = 16;
+
+template <typename dtype>
+class Allreduce_small_kernel_scalar;
+template <typename dtype>
+class Allreduce_small_kernel_block;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK, uint32_t max_buffer = 1024 /*KB*/>
+class sycl_allreducer_small : public sycl_coll_base<data_type> {
+public:
+    sycl_allreducer_small() : sycl_coll_base<data_type>() {
+        buffer_index = 0;
+        size_per_buffer = 0;
+    }
+
+    void init(sycl::queue &queue,
+              ccl_comm *comm,
+              ccl_stream *stream,
+              uint32_t rank_in,
+              uint32_t world_in) {
+        rank = rank_in;
+        world = world_in;
+        // temporal buffer used for allreduce temporal use only.
+        data_size_per_buffer = MAX_COUNT;
+        data_size_per_buffer =
+            ((data_size_per_buffer * sizeof(data_type) + ALIGNMENT_BYTE - 1) / ALIGNMENT_BYTE) *
+            ALIGNMENT_BYTE / sizeof(data_type); //aligned size
+        size_per_buffer = data_size_per_buffer * sizeof(data_type) + SYNC_BYTE;
+        void *local_triple_buffer = sycl::malloc_device(size_per_buffer * TRIPLE_BUFFER, queue);
+        int wg_size = 1;
+
+        auto e = queue.memset(local_triple_buffer, 0, size_per_buffer * TRIPLE_BUFFER);
+        e.wait();
+        this->exchange_peer_ipc_mem(queue,
+                                    comm,
+                                    local_triple_buffer,
+                                    rank,
+                                    world,
+                                    data_size_per_buffer,
+                                    (void **)buffers,
+                                    (void **)sync_buffer,
+                                    offsets,
+                                    ipc_handle);
+        this->initialized = true;
+
+        //dummy kernel to avoid hang. The hang happens when there is no dummy kernel and allreduce() is called right after init().
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for(sycl::nd_range<1>({ 1 }, wg_size), [=](sycl::item<1> idx) {
+
+            });
+        });
+        e.wait();
+    }
+    ccl::event allreduce(sycl::queue &queue,
+                         const void *in_buffer,
+                         void *out_buffer,
+                         uint32_t size,
+                         int repetition,
+                         bool print_en) {
+        sycl::event e;
+
+        if (repetition > MAX_REPETITION) {
+            printf("error: repetition cannot be larger than %d\n", MAX_REPETITION);
+            exit(-1);
+        }
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        int r;
+        assert(this->initialized == true);
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+        int size_per_buffer_kernel = size_per_buffer;
+
+        int max_wg_size __attribute__((unused)) =
+            queue.get_device().get_info<cl::sycl::info::device::max_work_group_size>(); // 1024
+        const int wg_size = 32;
+        // assert(wg_size <= max_wg_size); TODO
+
+        const int subgroup_size __attribute__((unused)) = 32;
+
+        int num_vecs = 1;
+        if (size >= 262144)
+            num_vecs = 4;
+        else if (size > 32768)
+            num_vecs = 2;
+
+        uint32_t v = vec_size * num_vecs;
+        if (size <= 2048)
+            v = 1;
+
+        assert(wg_size >= subgroup_size);
+
+        uint32_t total_threads_needed = size > v ? (size + v - 1) / v : 1;
+        uint32_t total_threads_dispatched =
+            (total_threads_needed + wg_size - 1) / wg_size * wg_size;
+        uint32_t total_wg_count = total_threads_dispatched / wg_size;
+
+        for (r = 0; r < repetition; r++) {
+            int buffer_index_kernel = buffer_index;
+            buffer_index++;
+            buffer_index %= TRIPLE_BUFFER;
+
+            if (v == 1) {
+                e = queue.submit([&](sycl::handler &cgh) {
+                cgh.parallel_for<class Allreduce_small_kernel_scalar<data_type>>(
+                    sycl::nd_range<1>( total_threads_dispatched, wg_size), [=](sycl::nd_item<1> idx2) {
+                    uint32_t idx = idx2.get_global_id();
+                    //uint32_t idx = idx2.get_linear_id();
+
+                    uint32_t offset __attribute__((unused)) = idx;
+
+                    //to do:
+                    //O3 compiler optimization: not much difference after the change.
+                    //tune the fence: good perf improvements
+                    //tune the cacheability for each IO message: no noticeable improvement
+                    //tune the thread size: not much improvements
+                    //tune the polling freq
+
+                    if (idx < total_threads_needed) {
+                        //do copy from input buffer to temp buffer.
+                        data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+                        local_temp_ptr +=
+                            (buffer_index_kernel * size_per_buffer_kernel /
+                             sizeof(
+                                 data_type)); //point to the correct buffer inside the triple buffer
+                        data_type *dest_ptr = local_temp_ptr + offset;
+                        data_type *src_ptr = (data_type *)in_buffer + offset;
+                        *(dest_ptr) = *(src_ptr);
+                        //since each threads are copying small chunks of data to temp buffer, all the threads needs to sync globally using atomics within this rank
+                    }
+
+                    int *local_sync_ptr;
+                    local_sync_ptr = (int *)temp_sync_buffer
+                        [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                    local_sync_ptr += (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                    //if there are more than 1 threads required per rank, then do the local sync within the rank first.
+                    if (total_threads_needed > 1) {
+                        uint32_t local_tid = idx2.get_local_linear_id();
+                        if (local_tid == 0) {
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(local_sync_ptr[0]);
+                            atomic_p += 1;
+
+                            uint32_t val = atomic_p.load();
+                            while (val != total_wg_count) {
+                                val = atomic_p.load();
+                            }
+                        }
+                        idx2.barrier();
+                    }
+
+                    //once the local sync is done, retire useless threads
+                    if (idx >= total_threads_needed)
+                        return;
+
+                    //once the local level sync is done, atomically write its counter to other remote gpus' atomic counter
+                    if (idx ==
+                        0) //one thread in the local gpu notifies the remote gpu of its status.
+                    {
+                        uint32_t status0 = total_threads_needed;
+                        for (uint32_t i = 0; i < temp_world; i++) {
+                            int *sync_ptr;
+
+                            sync_ptr = (int *)temp_sync_buffer
+                                [i]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                            sync_ptr +=
+                                (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(sync_ptr[1]);
+                            atomic_p += status0;
+                        }
+                    }
+
+                    //once all the local TGs are sync, do fence so that other GPU can see.
+                    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+
+                    //wait for completion of the atomic sync
+                    sycl::atomic_ref<int,
+                                     sycl::memory_order::relaxed,
+                                     sycl::memory_scope::device,
+                                     sycl::access::address_space::global_space>
+                        atomic_p(local_sync_ptr[1]);
+                    uint32_t val = atomic_p.load();
+                    while (val != total_threads_needed * temp_world) {
+                        val = atomic_p.load();
+                    }
+
+                    //reset the sync counter for the next allreduce session. Each rank reset's its own buffer
+                    if (idx ==
+                        0) //one thread in the local gpu notifies the remote gpu of its status.
+                    {
+                        int buffer_index_to_reset =
+                            (buffer_index_kernel + TRIPLE_BUFFER - 1) % TRIPLE_BUFFER;
+                        local_sync_ptr = (int *)temp_sync_buffer
+                            [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        local_sync_ptr +=
+                            (buffer_index_to_reset * size_per_buffer_kernel / sizeof(int));
+                        /*
+                            sycl::atomic_ref<int, sycl::memory_order::seq_cst,
+                                sycl::memory_scope::system,
+                                sycl::access::address_space::global_space>
+                                atomic_p(local_sync_ptr[0]);
+			    */
+                        local_sync_ptr[0] = local_sync_ptr[1] = 0;
+                    }
+
+                    //at this point, all the threads are done copying data from input buffer to temp buffer.
+                    //do All reduce
+                    //data_type result;
+                    data_type result;
+                    for (int i = 0; i < kernel_inner_loop; i++) {
+                        if (temp_world == 4) {
+                            data_type *peer_ptr0 =
+                                (data_type *)(((int *)temp_buffer[0]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr1 =
+                                (data_type *)(((int *)temp_buffer[1]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr2 =
+                                (data_type *)(((int *)temp_buffer[2]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr3 =
+                                (data_type *)(((int *)temp_buffer[3]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            //do the actual reduction
+                            result = *peer_ptr0;
+                            result += *peer_ptr1;
+                            result += *peer_ptr2;
+                            result += *peer_ptr3;
+                        }
+                        else if (temp_world == 8) {
+                            data_type *peer_ptr0 =
+                                (data_type *)(((int *)temp_buffer[0]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr1 =
+                                (data_type *)(((int *)temp_buffer[1]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr2 =
+                                (data_type *)(((int *)temp_buffer[2]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr3 =
+                                (data_type *)(((int *)temp_buffer[3]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr4 =
+                                (data_type *)(((int *)temp_buffer[4]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr5 =
+                                (data_type *)(((int *)temp_buffer[5]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr6 =
+                                (data_type *)(((int *)temp_buffer[6]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            data_type *peer_ptr7 =
+                                (data_type *)(((int *)temp_buffer[7]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int))) +
+                                offset;
+                            //do the actual reduction
+                            result = *peer_ptr0;
+                            result += *peer_ptr1;
+                            result += *peer_ptr2;
+                            result += *peer_ptr3;
+                            result += *peer_ptr4;
+                            result += *peer_ptr5;
+                            result += *peer_ptr6;
+                            result += *peer_ptr7;
+                        }
+                        else //generic path for the remaining rank counts (e.g. 2 and 6; 4 and 8 are unrolled above). So there is no problem of overflowing the buffer.
+                        {
+                            data_type *peer_ptr[MAX_RANK];
+                            //do the actual reduction
+                            result = 0;
+#pragma unroll
+                            for (uint32_t r = 0; r < temp_world; r++) {
+                                peer_ptr[r] = (data_type *)(((int *)temp_buffer[r]) +
+                                                            (buffer_index_kernel *
+                                                             size_per_buffer_kernel / sizeof(int)));
+                                peer_ptr[r] += offset;
+                                result += *(peer_ptr[r]);
+                            }
+                        }
+
+                        //write out the results
+                        *((data_type *)out_buffer + offset) = result;
+                    } // end of for loop
+
+                    });
+                });
+                //e.wait();
+            }
+            else {
+                // block read kernel
+                e = queue.submit([&](sycl::handler &cgh) {
+                    //                cgh.depends_on(memcpy_event);
+                cgh.parallel_for<class Allreduce_small_kernel_block<data_type>>(
+                    sycl::nd_range<1>( total_threads_dispatched, wg_size), [=](sycl::nd_item<1> idx2) [[intel::reqd_sub_group_size(subgroup_size)]] {
+                    //                    sycl::nd_range<1>( total_threads_dispatched, wg_size), [=](sycl::nd_item<1> idx2) {
+
+                    uint32_t idx = idx2.get_global_id();
+                    //uint32_t idx = idx2.get_linear_id();
+                    sycl::sub_group sg = idx2.get_sub_group();
+                    const size_t sgSize __attribute__((unused)) = sg.get_local_range()[0];
+
+                    //using global_ptr = sycl::multi_ptr<data_type, sycl::access::address_space::global_space, sycl::access::decorated::yes>;
+                    using global_ptr =
+                        sycl::multi_ptr<data_type, sycl::access::address_space::global_space>;
+                    uint32_t offset __attribute__((unused)) = idx;
+                    int base = (offset / sgSize) * sgSize * vec_size * num_vecs;
+                    //int base = (idx2.get_group(0) * wg_size + sg.get_group_id()[0] * sgSize * subgroup_size;
+                    int use_block_rw = 1;
+                    uint32_t nelem = sgSize * vec_size * num_vecs;
+                    if (size - sg.get_group_id()[0] * sgSize * v < sgSize * v) {
+                        use_block_rw = 0;
+                        //nelem = size - sg.get_group_id()[0] * sgSize * vec_size;
+                        nelem = size - idx * v;
+                        if (nelem > v)
+                            nelem = v;
+                        else if (nelem < 0)
+                            nelem = 0;
+                    }
+
+                    //to do:
+                    //O3 compiler optimization: not much difference after the change.
+                    //tune the fence: good perf improvements
+                    //tune the cacheability for each IO message: no noticeable improvement
+                    //tune the thread size: not much improvements
+                    //tune the polling freq
+
+                    if (idx < total_threads_needed) {
+                        //do copy from input buffer to temp buffer.
+                        data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+                        local_temp_ptr +=
+                            (buffer_index_kernel * size_per_buffer_kernel /
+                             sizeof(
+                                 data_type)); //point to the correct buffer inside the triple buffer
+                        if (use_block_rw) {
+                            int b = base;
+                            sycl::vec<data_type, vec_size> val;
+                            for (int m = 0; m < num_vecs; m++) {
+                                val = sg.load<vec_size>(global_ptr(&(((data_type *)in_buffer)[b])));
+                                sg.store<vec_size>(global_ptr(local_temp_ptr + b), val);
+                                b += sgSize * vec_size;
+                            }
+                        }
+                        else {
+                            data_type *dest_ptr = local_temp_ptr + offset * vec_size;
+                            data_type *src_ptr = (data_type *)in_buffer + offset * vec_size;
+#pragma unroll
+                            for (uint32_t n = 0; n < nelem; n++) {
+                                *(dest_ptr + n) = *(src_ptr + n);
+                            }
+                        }
+                        //since each thread copies a small chunk of data to the temp buffer, all the threads need to sync globally using atomics within this rank
+                    }
+
+                    int *local_sync_ptr;
+                    local_sync_ptr = (int *)temp_sync_buffer
+                        [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                    local_sync_ptr += (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                    //if there are more than 1 threads required per rank, then do the local sync within the rank first.
+                    if (total_threads_needed > 1) {
+                        uint32_t local_tid = idx2.get_local_linear_id();
+                        //sync locally within local GPU first.
+                        //sycl::_V1::ext::oneapi::experimental::printf("HERE in: rank%d %d\n", temp_rank, local_tid);
+                        if (local_tid == 0) {
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(local_sync_ptr[0]);
+                            atomic_p += 1;
+
+                            //wait for all the local TG to sync. Then sync the other remote GPUs
+                            uint32_t val = atomic_p.load();
+                            while (val != total_wg_count) {
+                                val = atomic_p.load();
+                            }
+                        }
+                        idx2.barrier();
+                    }
+
+                    //once the local sync is done, retire useless threads
+                    if (idx >= total_threads_needed)
+                        return;
+
+                    //once the local level sync is done, atomically write its counter to other remote gpus' atomic counter
+                    if (idx ==
+                        0) //one thread in the local gpu notifies the remote gpu of its status.
+                    {
+                        uint32_t status0 = total_threads_needed;
+                        for (uint32_t i = 0; i < temp_world; i++) {
+                            int *sync_ptr;
+
+                            sync_ptr = (int *)temp_sync_buffer
+                                [i]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                            sync_ptr +=
+                                (buffer_index_kernel * size_per_buffer_kernel / sizeof(int));
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(sync_ptr[1]);
+                            atomic_p += status0;
+                        }
+                    }
+
+                    //once all the local TGs are synced, do a fence so that other GPUs can see.
+                    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+
+                    //wait for completion of the atomic sync
+                    sycl::atomic_ref<int,
+                                     sycl::memory_order::relaxed,
+                                     sycl::memory_scope::device,
+                                     sycl::access::address_space::global_space>
+                        atomic_p(local_sync_ptr[1]);
+                    uint32_t val = atomic_p.load();
+                    while (val != total_threads_needed * temp_world) {
+                        val = atomic_p.load();
+                    }
+
+                    //reset the sync counter for the next allreduce session. Each rank resets its own buffer
+                    if (idx ==
+                        0) //one thread in the local gpu notifies the remote gpu of its status.
+                    {
+                        int buffer_index_to_reset =
+                            (buffer_index_kernel + TRIPLE_BUFFER - 1) % TRIPLE_BUFFER;
+                        local_sync_ptr = (int *)temp_sync_buffer
+                            [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        local_sync_ptr +=
+                            (buffer_index_to_reset * size_per_buffer_kernel / sizeof(int));
+
+                        local_sync_ptr[0] = local_sync_ptr[1] = 0;
+                    }
+
+                    //at this point, all the threads are done copying data from input buffer to temp buffer.
+                    //do All reduce
+                    for (int i = 0; i < kernel_inner_loop; i++) {
+                        if (temp_world == 4) {
+                            data_type *peer_ptr0 =
+                                (data_type *)(((int *)temp_buffer[0]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr1 =
+                                (data_type *)(((int *)temp_buffer[1]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr2 =
+                                (data_type *)(((int *)temp_buffer[2]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr3 =
+                                (data_type *)(((int *)temp_buffer[3]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            if (use_block_rw) {
+                                int b = base;
+                                sycl::vec<data_type, vec_size> result;
+                                for (int m = 0; m < num_vecs; m++) {
+                                    result = sg.load<vec_size>(global_ptr(peer_ptr0 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr1 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr2 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr3 + b));
+                                    sg.store<vec_size>(global_ptr((data_type *)out_buffer + b),
+                                                       result);
+                                    b += sgSize * vec_size;
+                                }
+                            }
+                            else {
+                                peer_ptr0 += offset * vec_size * num_vecs;
+                                peer_ptr1 += offset * vec_size * num_vecs;
+                                peer_ptr2 += offset * vec_size * num_vecs;
+                                peer_ptr3 += offset * vec_size * num_vecs;
+                                data_type *dest =
+                                    (data_type *)out_buffer + offset * vec_size * num_vecs;
+                                data_type res;
+#pragma unroll
+                                for (uint32_t n = 0; n < nelem; n++) {
+                                    res = *(peer_ptr0++);
+                                    res += *(peer_ptr1++);
+                                    res += *(peer_ptr2++);
+                                    res += *(peer_ptr3++);
+                                    *dest = res;
+                                    dest++;
+                                }
+                            }
+                        }
+                        else if (temp_world == 8) {
+                            data_type *peer_ptr0 =
+                                (data_type *)(((int *)temp_buffer[0]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr1 =
+                                (data_type *)(((int *)temp_buffer[1]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr2 =
+                                (data_type *)(((int *)temp_buffer[2]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr3 =
+                                (data_type *)(((int *)temp_buffer[3]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr4 =
+                                (data_type *)(((int *)temp_buffer[4]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr5 =
+                                (data_type *)(((int *)temp_buffer[5]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr6 =
+                                (data_type *)(((int *)temp_buffer[6]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            data_type *peer_ptr7 =
+                                (data_type *)(((int *)temp_buffer[7]) +
+                                              (buffer_index_kernel * size_per_buffer_kernel /
+                                               sizeof(int)));
+                            if (use_block_rw) {
+                                int b = base;
+                                sycl::vec<data_type, vec_size> result;
+                                for (int m = 0; m < num_vecs; m++) {
+                                    result = sg.load<vec_size>(global_ptr(peer_ptr0 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr1 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr2 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr3 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr4 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr5 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr6 + b));
+                                    result += sg.load<vec_size>(global_ptr(peer_ptr7 + b));
+                                    sg.store<vec_size>(global_ptr((data_type *)out_buffer + b),
+                                                       result);
+                                    b += sgSize * vec_size;
+                                }
+                            }
+                            else {
+                                peer_ptr0 += offset * vec_size * num_vecs;
+                                peer_ptr1 += offset * vec_size * num_vecs;
+                                peer_ptr2 += offset * vec_size * num_vecs;
+                                peer_ptr3 += offset * vec_size * num_vecs;
+                                peer_ptr4 += offset * vec_size * num_vecs;
+                                peer_ptr5 += offset * vec_size * num_vecs;
+                                peer_ptr6 += offset * vec_size * num_vecs;
+                                peer_ptr7 += offset * vec_size * num_vecs;
+                                data_type *dest =
+                                    (data_type *)out_buffer + offset * vec_size * num_vecs;
+                                data_type res;
+#pragma unroll
+                                for (uint32_t n = 0; n < nelem; n++) {
+                                    res = *(peer_ptr0++);
+                                    res += *(peer_ptr1++);
+                                    res += *(peer_ptr2++);
+                                    res += *(peer_ptr3++);
+                                    res += *(peer_ptr4++);
+                                    res += *(peer_ptr5++);
+                                    res += *(peer_ptr6++);
+                                    res += *(peer_ptr7++);
+                                    *dest = res;
+                                    dest++;
+                                }
+                            }
+                        }
+                        else //generic path for the remaining rank counts (e.g. 2 and 6). So there is no problem of overflowing the buffer.
+                        {
+                            data_type *peer_ptr[MAX_RANK];
+#pragma unroll
+                            for (uint32_t r = 0; r < temp_world; r++) {
+                                peer_ptr[r] = (data_type *)(((int *)temp_buffer[r]) +
+                                                            (buffer_index_kernel *
+                                                             size_per_buffer_kernel / sizeof(int)));
+                            }
+                            if (use_block_rw) {
+                                int b = base;
+                                for (int m = 0; m < num_vecs; m++) {
+                                    sycl::vec<data_type, vec_size> result(0);
+                                        //accumulate from rank 0: starting at r = 1 here dropped peer 0's data from the reduction
+#pragma unroll
+                                    for (uint32_t r = 0; r < temp_world; r++) {
+                                        result += sg.load<vec_size>(global_ptr(peer_ptr[r] + b));
+                                    }
+                                    sg.store<vec_size>(global_ptr((data_type *)out_buffer + b),
+                                                       result);
+                                    b += sgSize * vec_size;
+                                }
+                            }
+                            else {
+#pragma unroll
+                                for (uint32_t r = 0; r < temp_world; r++) {
+                                    peer_ptr[r] += offset * vec_size * num_vecs;
+                                }
+                                data_type *dest =
+                                    (data_type *)out_buffer + offset * vec_size * num_vecs;
+                                for (uint32_t n = 0; n < nelem; n++) {
+                                    data_type res = 0;
+#pragma unroll
+                                    for (uint32_t r = 0; r < temp_world; r++) {
+                                        res += *(peer_ptr[r]++);
+                                    }
+                                    *dest = res;
+                                    dest++;
+                                }
+                            }
+                        }
+
+                    } // end of for loop
+
+                    });
+                });
+            } // kernel
+
+        } // for (r = 0; r < repetition; r++)
+
+        return ccl::event::create_from_native(e);
+    }
+    void release(sycl::queue &queue) {
+        // Clean up, close/put ipc handles, free memory, etc.
+        auto l0_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(queue.get_context()); // Level Zero context backing this SYCL queue
+        for (uint32_t i = 0; i < world; i++) {
+            if (i != rank) { // remote ranks' buffers were obtained via IPC; subtracting offsets[i] recovers the base pointer to close — verify against the matching open in init
+                ZE_CALL(zeMemCloseIpcHandle, (l0_ctx, (char *)buffers[i] - offsets[i]));
+            }
+        }
+
+        sycl::free(buffers[rank], queue); // only this rank's own buffer was allocated locally with sycl::malloc
+        this->initialized = false;
+    }
+
+private:
+    void *buffers[max_rank];
+    void *sync_buffer[max_rank];
+    size_t offsets[max_rank];
+    ze_ipc_mem_handle_t ipc_handle[max_rank];
+    int rank, world;
+    int buffer_index;
+    int size_per_buffer;
+    int data_size_per_buffer;
+};
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_sycl.cpp b/src/coll/algorithms/allreduce/sycl/allreduce_sycl.cpp
new file mode 100644
index 000000000..97cb57201
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_sycl.cpp
@@ -0,0 +1,137 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+#include "coll/algorithms/allreduce/sycl/allreduce_sycl.hpp"
+#endif // defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+
+namespace ccl {
+namespace v1 {
+
+struct impl_dispatch { // functor that unwraps a public ccl API object to its internal impl handle
+    template <class Object>
+    const typename Object::impl_value_t& operator()(const Object& obj) {
+        return obj.get_impl();
+    }
+};
+
+event allreduce_sycl(sycl::queue q,
+                     const void* send_buf,
+                     void* recv_buf,
+                     size_t count,
+                     datatype dtype,
+                     reduction reduction, // NOTE(review): not consulted below; kernels appear to assume sum — confirm
+                     const communicator& comm,
+                     const stream& op_stream,
+                     const allreduce_attr& attr,
+                     const vector_class<event>& deps, // NOTE(review): deps are not consumed in this path — confirm callers synchronize upstream
+                     bool& done) {
+    ccl::event e;
+    done = true; // optimistic; reset to false below if no sycl kernel applies
+
+    uint32_t world = comm.size();
+    int rank = comm.rank();
+
+    auto ccl_dtype = ccl::global_data::get().dtypes->get(dtype);
+
+    if (world == 1) { // single-rank allreduce degenerates to a (possibly elided) local copy
+        sycl::event sycl_e;
+        if (send_buf != recv_buf) {
+            sycl_e = q.memcpy(recv_buf, send_buf, count * ccl_dtype.size());
+        }
+        return ccl::event::create_from_native(sycl_e);
+    }
+
+    ccl::impl_dispatch disp;
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    ccl_stream* global_stream = get_stream_ptr(disp(op_stream));
+    const bool is_single_tile = global_comm->get_pair_comm()->size() == 1; // pair communicator has exactly one rank
+    const bool has_all_vertices_connected =
+        global_comm->get_topo_manager().has_all_vertices_connected();
+    LOG_DEBUG("|CCL_SYCL| is_single_tile: ",
+              is_single_tile,
+              ", has_all_vertices_connected: ",
+              has_all_vertices_connected);
+
+    if (count * ccl_dtype.size() <= ccl::global_data::env().allreduce_small_size_threshold &&
+        has_all_vertices_connected) { // small path additionally requires full peer connectivity
+        init_allreduce_small(dtype, q, global_comm, global_stream, rank, world);
+
+#ifdef CCL_ENABLE_ITT
+        __itt_event coll_create_itt_event = ccl::profile::itt::event_get("CCL_ALLREDUCE_SMALL");
+        ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+        LOG_DEBUG("|CCL_SYCL| allreduce selects small kernel, count:", count, " datatype: ", dtype);
+        e = run_allreduce_small(dtype, q, send_buf, recv_buf, count);
+        LOG_DEBUG("|CCL_SYCL| allreduce selects small kernel, count:",
+                  count,
+                  " datatype: ",
+                  dtype,
+                  " done");
+#ifdef CCL_ENABLE_ITT
+        ccl::profile::itt::event_end(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+    }
+    else if ((count * ccl_dtype.size() <= ccl::global_data::env().allreduce_medium_size_threshold ||
+              (global_comm->size() == 2 && !ccl::global_data::env().allreduce_use_tmp_buf)) &&
+             !is_single_tile) { // medium message sizes
+        init_allreduce_medium(dtype, q, global_comm, global_stream, rank, world);
+
+#ifdef CCL_ENABLE_ITT
+        __itt_event coll_create_itt_event = ccl::profile::itt::event_get("CCL_ALLREDUCE_MEDIUM");
+        ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+        LOG_DEBUG(
+            "|CCL_SYCL| allreduce selects medium kernel, count:", count, " datatype: ", dtype);
+        e = run_allreduce_medium(dtype, q, send_buf, recv_buf, count);
+        LOG_DEBUG("|CCL_SYCL| allreduce selects medium kernel, count:",
+                  count,
+                  " datatype: ",
+                  dtype,
+                  " done");
+#ifdef CCL_ENABLE_ITT
+        ccl::profile::itt::event_end(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+    }
+    else if (!is_single_tile) { // large message sizes
+        init_allreduce_large(dtype, q, global_comm, global_stream, rank, world);
+
+#ifdef CCL_ENABLE_ITT
+        __itt_event coll_create_itt_event = ccl::profile::itt::event_get("CCL_ALLREDUCE_LARGE");
+        ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+        LOG_DEBUG("|CCL_SYCL| allreduce selects large kernel, count:", count, " datatype: ", dtype);
+        e = run_allreduce_large(dtype, q, send_buf, recv_buf, count);
+        LOG_DEBUG("|CCL_SYCL| allreduce selects large kernel, count:",
+                  count,
+                  " datatype: ",
+                  dtype,
+                  " done");
+#ifdef CCL_ENABLE_ITT
+        ccl::profile::itt::event_end(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+    }
+    else { // no sycl kernel matched (single-tile medium/large); caller must fall back
+        done = false;
+    }
+
+    return e;
+}
+
+} // namespace v1
+} // namespace ccl
diff --git a/src/coll/algorithms/allreduce/sycl/allreduce_sycl.hpp b/src/coll/algorithms/allreduce/sycl/allreduce_sycl.hpp
new file mode 100644
index 000000000..4d5432113
--- /dev/null
+++ b/src/coll/algorithms/allreduce/sycl/allreduce_sycl.hpp
@@ -0,0 +1,50 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#define SYCL_ALLREDUCE_FUNCTIONS(MSGSIZE) \
+    void init_allreduce_##MSGSIZE(ccl::datatype dtype, \
+                                  sycl::queue& queue, \
+                                  ccl_comm* comm, \
+                                  ccl_stream* stream, \
+                                  uint32_t rank_in, \
+                                  uint32_t world_in); \
+    ccl::event run_allreduce_##MSGSIZE( \
+        ccl::datatype dtype, sycl::queue q, const void* in_buf, void* out_buf, size_t count);
+
+SYCL_ALLREDUCE_FUNCTIONS(small)
+SYCL_ALLREDUCE_FUNCTIONS(medium)
+SYCL_ALLREDUCE_FUNCTIONS(large)
+
+namespace ccl {
+
+namespace v1 {
+
+event allreduce_sycl(sycl::queue q,
+                     const void* send_buf,
+                     void* recv_buf,
+                     size_t count,
+                     datatype dtype,
+                     reduction reduction,
+                     const communicator& comm,
+                     const stream& op_stream,
+                     const allreduce_attr& attr,
+                     const vector_class<event>& deps,
+                     bool& done);
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/coll/algorithms/barrier.cpp b/src/coll/algorithms/barrier/barrier.cpp
similarity index 100%
rename from src/coll/algorithms/barrier.cpp
rename to src/coll/algorithms/barrier/barrier.cpp
diff --git a/src/coll/algorithms/reduce.cpp b/src/coll/algorithms/reduce.cpp
index 41c5ec568..c10c1ab40 100644
--- a/src/coll/algorithms/reduce.cpp
+++ b/src/coll/algorithms/reduce.cpp
@@ -548,14 +548,14 @@ void get_counts_n_offsets_bidir(size_t count,
     even_comm_offset_bytes = main_block_count * even_comm_rank * dtype.size();
 }
 
-ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched,
-                                       ccl_buffer send_buf,
-                                       ccl_buffer recv_buf,
-                                       size_t count,
-                                       const ccl_datatype& dtype,
-                                       ccl::reduction op,
-                                       int root,
-                                       ccl_comm* comm) {
+ccl::status ccl_coll_build_topo_reduce_fill(ccl_sched* sched,
+                                            ccl_buffer send_buf,
+                                            ccl_buffer recv_buf,
+                                            size_t count,
+                                            const ccl_datatype& dtype,
+                                            ccl::reduction op,
+                                            int root,
+                                            ccl_comm* comm) {
     LOG_DEBUG("build gpu topo reduce");
 
     if (count == 0)
@@ -576,7 +576,8 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched,
     bool use_tmp_buf = !is_single_card;
     bool use_read_write_pipeline =
         ccl::global_data::env().reduce_scatter_monolithic_pipeline_kernel &&
-        even_comm->size() > 1 && pair_comm->size() > 1 && (int)count >= comm_size;
+        even_comm->size() > 1 && pair_comm->size() > 1 && (int)count >= comm_size &&
+        ccl::global_data::env().enable_ze_bidir_algo;
 
     // allocate tmp buff for write
     ccl_buffer tmp_write_buf;
@@ -666,10 +667,7 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched,
         in_buffers.push_back({ static_cast<void*>(ipc_event_pool), ccl::ze::ipc_mem_type::pool });
     }
 
-    // TODO: remove this if() condition. We want to always use a single list
-    if (is_rs_write) {
-        sched->try_enable_ze_single_list();
-    }
+    sched->try_enable_ze_single_list();
 
     ccl::add_handle_exchange(sched,
                              node_comm,
@@ -966,9 +964,33 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched,
     CCL_ASSERT(wait_events.size() == 1 && wait_events.back() != nullptr,
                "wait_events should have a single, valid event");
 
-    entry_factory::create<ze_execute_cmdlists_on_init_entry>(sched);
-
     return ccl::status::success;
 }
 
+ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched,
+                                       ccl_buffer send_buf,
+                                       ccl_buffer recv_buf,
+                                       size_t count,
+                                       const ccl_datatype& dtype,
+                                       ccl::reduction op,
+                                       int root,
+                                       ccl_comm* comm) {
+    return ccl_build_topo_uniform_buff_size_op(
+        sched,
+        send_buf,
+        recv_buf,
+        count,
+        dtype.size(),
+        ccl::global_data::env().reduce_pipe_chunk_count,
+        "REDUCE",
+        ccl::global_data::get().metrics_profiler->reduce_pipe,
+        [dtype, op, root, comm](ccl_sched* sched,
+                                ccl_buffer send_buf,
+                                ccl_buffer recv_buf,
+                                size_t count) -> ccl::status {
+            return ccl_coll_build_topo_reduce_fill(
+                sched, send_buf, recv_buf, count, dtype, op, root, comm);
+        });
+}
+
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/src/coll/algorithms/reduce_scatter.cpp b/src/coll/algorithms/reduce_scatter/reduce_scatter.cpp
similarity index 95%
rename from src/coll/algorithms/reduce_scatter.cpp
rename to src/coll/algorithms/reduce_scatter/reduce_scatter.cpp
index 498be0a38..5780f09cf 100644
--- a/src/coll/algorithms/reduce_scatter.cpp
+++ b/src/coll/algorithms/reduce_scatter/reduce_scatter.cpp
@@ -354,13 +354,13 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
 
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
 
-ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
-                                               ccl_buffer send_buf,
-                                               ccl_buffer recv_buf,
-                                               size_t recv_count,
-                                               const ccl_datatype& dtype,
-                                               ccl::reduction op,
-                                               ccl_comm* comm) {
+ccl::status ccl_coll_build_topo_reduce_scatter_fill(ccl_sched* sched,
+                                                    ccl_buffer send_buf,
+                                                    ccl_buffer recv_buf,
+                                                    size_t recv_count,
+                                                    const ccl_datatype& dtype,
+                                                    ccl::reduction op,
+                                                    ccl_comm* comm) {
     LOG_DEBUG("build topo reduce_scatter, recv_count ", recv_count);
     if (recv_count == 0) {
         return ccl::status::success;
@@ -392,10 +392,9 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
     };
 
     // TODO: fix - reduce_scatter pipeline uses xelink write which seems to fail with int8
-    // TODO: enable pipeline for scaleout
     const bool use_reduce_scatter_pipeline =
         ccl::global_data::env().reduce_scatter_monolithic_pipeline_kernel && pair_comm_size > 1 &&
-        dtype != ccl::datatype::int8 && is_single_node;
+        dtype != ccl::datatype::int8 && ccl::global_data::env().enable_ze_bidir_algo;
     LOG_DEBUG("topo/reduce_scatter pipeline ", use_reduce_scatter_pipeline);
 
     // optimized non-fallback algorithm is currently supported for bidirectional case
@@ -467,6 +466,16 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
     if (use_non_fallback_algo && use_reduce_scatter_pipeline) {
         ze_utils::alloc_tmp_bufs(
             sched, comm, tmp_bufs, in_buffers, tmp_buf_idx_start, count, dtype);
+        if (!is_single_node) {
+            // TODO: add device memory manager support or any mechanism that
+            // allows to control the memory consumption once the pipleine chunking is implemented
+            size_t tmp_buf_bytes = count * dtype.size();
+            ccl::alloc_param alloc_param(
+                tmp_buf_bytes, ccl::buffer_type::ze, ccl::buffer_place::device);
+            tmp_buf = sched->alloc_buffer(alloc_param);
+            // scaleout rearranges send_buf into tmp_buf and uses this rearranged buf as input
+            in_buffers[0] = { tmp_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }; // 0
+        }
     }
     else if (use_tmp_buf) {
         size_t tmp_buf_bytes = count * dtype.size();
@@ -627,9 +636,10 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
         ccl_buffer out_tmp_buf = *tmp_bufs.begin();
         if (even_comm_size > 1 && use_reduce_scatter_pipeline) {
             LOG_DEBUG("topo/scale_up/intra: use ze_a2a_pipeline_reduce_entry");
-            // readuce from local buffer
+            // reduce from remote even_comm peer buffers or from local buffer
+            ccl_buffer dst_recv_buf = is_single_node ? recv_buf : out_tmp_buf;
             auto entry = entry_factory::create<ze_a2a_pipeline_reduce_entry>(
-                sched, comm, recv_buf, tmp_bufs, count, dtype, op, wait_events);
+                sched, comm, dst_recv_buf, tmp_bufs, count, dtype, op, wait_events);
             clear_and_push_back(wait_events, entry->entry_event);
         }
         else if (even_comm_size > 1) {
@@ -730,7 +740,6 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
             }
         }
 
-        entry_factory::create<ze_execute_cmdlists_on_init_entry>(sched);
         return ccl::status::success;
     }
 
@@ -748,7 +757,6 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
                          ipc_event_count,
                          ", expected max ",
                          max_ipc_event_count);
-        entry_factory::create<ze_execute_cmdlists_on_init_entry>(sched);
         return ccl::status::success;
     }
     const ccl_buffer tmp_recv_buf = tmp_bufs.front();
@@ -935,8 +943,30 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
     sched->add_barrier();
     ccl::add_comm_barrier(sched, node_comm, wait_events, out_event);
 
-    entry_factory::create<ze_execute_cmdlists_on_init_entry>(sched);
     return ccl::status::success;
 }
 
+ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched,
+                                               ccl_buffer send_buf,
+                                               ccl_buffer recv_buf,
+                                               size_t count,
+                                               const ccl_datatype& dtype,
+                                               ccl::reduction op,
+                                               ccl_comm* comm) {
+    return ccl_build_topo_uniform_buff_size_op(
+        sched,
+        send_buf,
+        recv_buf,
+        count,
+        dtype.size(),
+        ccl::global_data::env().reduce_scatter_pipe_chunk_count,
+        "REDUCE_SCATTER",
+        ccl::global_data::get().metrics_profiler->reduce_scatter_pipe,
+        [dtype, op, comm](ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, size_t count)
+            -> ccl::status {
+            return ccl_coll_build_topo_reduce_scatter_fill(
+                sched, send_buf, recv_buf, count, dtype, op, comm);
+        });
+}
+
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/src/coll/algorithms/reduce_scatter/sycl/.clang-format b/src/coll/algorithms/reduce_scatter/sycl/.clang-format
new file mode 100644
index 000000000..726f33fd2
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/.clang-format
@@ -0,0 +1,145 @@
+---
+Language: Cpp
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: true
+AlignEscapedNewlines: DontAlign
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Empty
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: true
+  BeforeElse: true
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: false
+ColumnLimit: 115
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: true
+DisableFormat: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language: Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+  - Language: TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+ReflowComments: false
+SortIncludes: false
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth: 1
+UseTab: Never
+...
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.cpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.cpp
new file mode 100644
index 000000000..0773fdb42
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.cpp
@@ -0,0 +1,73 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.hpp"
+
+#define REDUCE_SCATTER_LARGE_API_DECL(TYPE) \
+    void init_reduce_scatter_large_##TYPE(ccl::datatype dtype, \
+                                          sycl::queue &queue, \
+                                          ccl_comm *comm, \
+                                          ccl_stream *stream, \
+                                          uint32_t rank_in, \
+                                          uint32_t world_in); \
+    ccl::event run_reduce_scatter_large_##TYPE(ccl::datatype dtype, \
+                                               sycl::queue queue, \
+                                               const void *send_buf, \
+                                               void *recv_buf, \
+                                               size_t recv_count, \
+                                               bool &done);
+
+REDUCE_SCATTER_LARGE_API_DECL(fp16);
+REDUCE_SCATTER_LARGE_API_DECL(bf16);
+REDUCE_SCATTER_LARGE_API_DECL(fp32);
+REDUCE_SCATTER_LARGE_API_DECL(int32);
+
+#define SWITCH_INIT_TYPE(TYPE, ccl_type) \
+    case ccl_type: init_reduce_scatter_large_##TYPE(dtype, queue, comm, stream, rank_in, world_in); break;
+
+void init_reduce_scatter_large(ccl::datatype dtype,
+                               sycl::queue &queue,
+                               ccl_comm *comm,
+                               ccl_stream *stream,
+                               uint32_t rank_in,
+                               uint32_t world_in) {
+    switch (dtype) {
+        SWITCH_INIT_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_INIT_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_INIT_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_INIT_TYPE(int32, ccl::datatype::int32)
+        default: assert(0);
+    }
+}
+
+#define SWITCH_RUN_TYPE(TYPE, ccl_type) \
+    case ccl_type: e = run_reduce_scatter_large_##TYPE(dtype, queue, send_buf, recv_buf, recv_count, done); break;
+
+ccl::event run_reduce_scatter_large(ccl::datatype dtype,
+                                    sycl::queue queue,
+                                    const void *send_buf,
+                                    void *recv_buf,
+                                    size_t recv_count,
+                                    bool &done) {
+    ccl::event e;
+    switch (dtype) {
+        SWITCH_RUN_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_RUN_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_RUN_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_RUN_TYPE(int32, ccl::datatype::int32)
+        default: assert(0);
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.hpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.hpp
new file mode 100644
index 000000000..1edb0b009
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.hpp
@@ -0,0 +1,1740 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+#define MAX_RANK            16
+#define SIMD_COMPUTE_MAX    256
+#define SIMD_COMPUTE        (SIMD_COMPUTE_MAX / sizeof(data_type))
+#define SIMD_SYNC           32
+#define UNROLL_SIZE         1
+#define SYNC_BYTE           (SIMD_SYNC * sizeof(int) * 2)
+#define ALIGNMENT_BYTE      256
+#define MAX_COUNT           (16 * 1024 * 1024 / sizeof(data_type))
+#define EU_COUNT_PER_RANK   512
+#define THREAD_COUNT_PER_EU 8
+#define HW_THREAD_COUNT     (EU_COUNT_PER_RANK * THREAD_COUNT_PER_EU)
+#define RANKS_PER_GPU       2
+#define NO_KERNEL           0
+#define FIRST_KERNEL        1
+#define SECOND_KERNEL       2
+#define THIRD_KERNEL        4
+
+#define NOCOPY_KERNEL_NUM   2
+#define NOCOPY_BUFFER_COUNT NOCOPY_KERNEL_NUM
+#define NOCOPY_LAST_KERNEL  SECOND_KERNEL
+
+#define COPY_KERNEL_NUM   3
+#define COPY_BUFFER_COUNT COPY_KERNEL_NUM
+#define COPY_LAST_KERNEL  THIRD_KERNEL
+
+#define RUN_FIRST_KERNEL \
+    if (sw_pipeline_kernel_state[ii] & FIRST_KERNEL) { \
+        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) { \
+            int index = idx + inner_iter * HW_THREAD_COUNT; \
+            if ((uint32_t)index >= total_threads_needed) \
+                break; \
+            switch (temp_world) { \
+                case 2: \
+                    local_copy<2, data_type>((int *)even_ranks, \
+                                             index, \
+                                             send_buf, \
+                                             recv_size, \
+                                             threads_already_processed[ii], \
+                                             (void **)temp_buffer, \
+                                             temp_rank, \
+                                             outer_iter, \
+                                             size_per_buffer_kernel, \
+                                             ii, \
+                                             threads_needed_per_chunk); \
+                    break; \
+                case 4: \
+                    local_copy<4, data_type>((int *)even_ranks, \
+                                             index, \
+                                             send_buf, \
+                                             recv_size, \
+                                             threads_already_processed[ii], \
+                                             (void **)temp_buffer, \
+                                             temp_rank, \
+                                             outer_iter, \
+                                             size_per_buffer_kernel, \
+                                             ii, \
+                                             threads_needed_per_chunk); \
+                    break; \
+                case 6: \
+                    local_copy<6, data_type>((int *)even_ranks, \
+                                             index, \
+                                             send_buf, \
+                                             recv_size, \
+                                             threads_already_processed[ii], \
+                                             (void **)temp_buffer, \
+                                             temp_rank, \
+                                             outer_iter, \
+                                             size_per_buffer_kernel, \
+                                             ii, \
+                                             threads_needed_per_chunk); \
+                    break; \
+                case 8: \
+                    local_copy<8, data_type>((int *)even_ranks, \
+                                             index, \
+                                             send_buf, \
+                                             recv_size, \
+                                             threads_already_processed[ii], \
+                                             (void **)temp_buffer, \
+                                             temp_rank, \
+                                             outer_iter, \
+                                             size_per_buffer_kernel, \
+                                             ii, \
+                                             threads_needed_per_chunk); \
+                    break; \
+                case 10: \
+                    local_copy<10, data_type>((int *)even_ranks, \
+                                              index, \
+                                              send_buf, \
+                                              recv_size, \
+                                              threads_already_processed[ii], \
+                                              (void **)temp_buffer, \
+                                              temp_rank, \
+                                              outer_iter, \
+                                              size_per_buffer_kernel, \
+                                              ii, \
+                                              threads_needed_per_chunk); \
+                    break; \
+                case 12: \
+                    local_copy<12, data_type>((int *)even_ranks, \
+                                              index, \
+                                              send_buf, \
+                                              recv_size, \
+                                              threads_already_processed[ii], \
+                                              (void **)temp_buffer, \
+                                              temp_rank, \
+                                              outer_iter, \
+                                              size_per_buffer_kernel, \
+                                              ii, \
+                                              threads_needed_per_chunk); \
+                    break; \
+                case 14: \
+                    local_copy<14, data_type>((int *)even_ranks, \
+                                              index, \
+                                              send_buf, \
+                                              recv_size, \
+                                              threads_already_processed[ii], \
+                                              (void **)temp_buffer, \
+                                              temp_rank, \
+                                              outer_iter, \
+                                              size_per_buffer_kernel, \
+                                              ii, \
+                                              threads_needed_per_chunk); \
+                    break; \
+                case 16: \
+                    local_copy<16, data_type>((int *)even_ranks, \
+                                              index, \
+                                              send_buf, \
+                                              recv_size, \
+                                              threads_already_processed[ii], \
+                                              (void **)temp_buffer, \
+                                              temp_rank, \
+                                              outer_iter, \
+                                              size_per_buffer_kernel, \
+                                              ii, \
+                                              threads_needed_per_chunk); \
+                    break; \
+                default: break; \
+            } \
+        } \
+    } // end of if FIRST_KERNEL
+
+#define RUN_SECOND_KERNEL \
+    if (sw_pipeline_kernel_state[ii] & FIRST_KERNEL) { \
+        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) { \
+            int index = idx + inner_iter * HW_THREAD_COUNT; \
+            if ((uint32_t)index >= total_threads_needed) \
+                break; \
+            switch (temp_world) { \
+                case 2: \
+                    nocopy_reduce_read_write<2, data_type>((int *)even_ranks, \
+                                                           my_rank_index, \
+                                                           index, \
+                                                           (void **)in_buffers, \
+                                                           out_buffer, \
+                                                           recv_size, \
+                                                           threads_already_processed[ii], \
+                                                           (void **)temp_buffer, \
+                                                           temp_rank, \
+                                                           outer_iter, \
+                                                           size_per_buffer_kernel, \
+                                                           ii, \
+                                                           threads_needed_per_chunk); \
+                    break; \
+                case 4: \
+                    nocopy_reduce_read_write<4, data_type>((int *)even_ranks, \
+                                                           my_rank_index, \
+                                                           index, \
+                                                           (void **)in_buffers, \
+                                                           out_buffer, \
+                                                           recv_size, \
+                                                           threads_already_processed[ii], \
+                                                           (void **)temp_buffer, \
+                                                           temp_rank, \
+                                                           outer_iter, \
+                                                           size_per_buffer_kernel, \
+                                                           ii, \
+                                                           threads_needed_per_chunk); \
+                    break; \
+                case 6: \
+                    nocopy_reduce_read_write<6, data_type>((int *)even_ranks, \
+                                                           my_rank_index, \
+                                                           index, \
+                                                           (void **)in_buffers, \
+                                                           out_buffer, \
+                                                           recv_size, \
+                                                           threads_already_processed[ii], \
+                                                           (void **)temp_buffer, \
+                                                           temp_rank, \
+                                                           outer_iter, \
+                                                           size_per_buffer_kernel, \
+                                                           ii, \
+                                                           threads_needed_per_chunk); \
+                    break; \
+                case 8: \
+                    nocopy_reduce_read_write<8, data_type>((int *)even_ranks, \
+                                                           my_rank_index, \
+                                                           index, \
+                                                           (void **)in_buffers, \
+                                                           out_buffer, \
+                                                           recv_size, \
+                                                           threads_already_processed[ii], \
+                                                           (void **)temp_buffer, \
+                                                           temp_rank, \
+                                                           outer_iter, \
+                                                           size_per_buffer_kernel, \
+                                                           ii, \
+                                                           threads_needed_per_chunk); \
+                    break; \
+                case 10: \
+                    nocopy_reduce_read_write<10, data_type>((int *)even_ranks, \
+                                                            my_rank_index, \
+                                                            index, \
+                                                            (void **)in_buffers, \
+                                                            out_buffer, \
+                                                            recv_size, \
+                                                            threads_already_processed[ii], \
+                                                            (void **)temp_buffer, \
+                                                            temp_rank, \
+                                                            outer_iter, \
+                                                            size_per_buffer_kernel, \
+                                                            ii, \
+                                                            threads_needed_per_chunk); \
+                    break; \
+                case 12: \
+                    nocopy_reduce_read_write<12, data_type>((int *)even_ranks, \
+                                                            my_rank_index, \
+                                                            index, \
+                                                            (void **)in_buffers, \
+                                                            out_buffer, \
+                                                            recv_size, \
+                                                            threads_already_processed[ii], \
+                                                            (void **)temp_buffer, \
+                                                            temp_rank, \
+                                                            outer_iter, \
+                                                            size_per_buffer_kernel, \
+                                                            ii, \
+                                                            threads_needed_per_chunk); \
+                    break; \
+                case 14: \
+                    nocopy_reduce_read_write<14, data_type>((int *)even_ranks, \
+                                                            my_rank_index, \
+                                                            index, \
+                                                            (void **)in_buffers, \
+                                                            out_buffer, \
+                                                            recv_size, \
+                                                            threads_already_processed[ii], \
+                                                            (void **)temp_buffer, \
+                                                            temp_rank, \
+                                                            outer_iter, \
+                                                            size_per_buffer_kernel, \
+                                                            ii, \
+                                                            threads_needed_per_chunk); \
+                    break; \
+                case 16: \
+                    nocopy_reduce_read_write<16, data_type>((int *)even_ranks, \
+                                                            my_rank_index, \
+                                                            index, \
+                                                            (void **)in_buffers, \
+                                                            out_buffer, \
+                                                            recv_size, \
+                                                            threads_already_processed[ii], \
+                                                            (void **)temp_buffer, \
+                                                            temp_rank, \
+                                                            outer_iter, \
+                                                            size_per_buffer_kernel, \
+                                                            ii, \
+                                                            threads_needed_per_chunk); \
+                    break; \
+                default: break; \
+            } \
+        } \
+    } // end of if SECOND_KERNEL
+
+#define RUN_THIRD_KERNEL \
+    if (sw_pipeline_kernel_state[ii] & SECOND_KERNEL) { \
+        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) { \
+            int index = idx + inner_iter * HW_THREAD_COUNT; \
+            if ((uint32_t)index >= total_threads_needed) \
+                break; \
+            switch (temp_world) { \
+                case 2: \
+                    all_sum<2, data_type>(index, \
+                                          send_buf, \
+                                          out_buffer, \
+                                          recv_size, \
+                                          threads_already_processed[ii], \
+                                          (void **)temp_buffer, \
+                                          temp_rank, \
+                                          outer_iter, \
+                                          size_per_buffer_kernel, \
+                                          ii, \
+                                          threads_needed_per_chunk); \
+                    break; \
+                case 4: \
+                    all_sum<4, data_type>(index, \
+                                          send_buf, \
+                                          out_buffer, \
+                                          recv_size, \
+                                          threads_already_processed[ii], \
+                                          (void **)temp_buffer, \
+                                          temp_rank, \
+                                          outer_iter, \
+                                          size_per_buffer_kernel, \
+                                          ii, \
+                                          threads_needed_per_chunk); \
+                    break; \
+                case 6: \
+                    all_sum<6, data_type>(index, \
+                                          send_buf, \
+                                          out_buffer, \
+                                          recv_size, \
+                                          threads_already_processed[ii], \
+                                          (void **)temp_buffer, \
+                                          temp_rank, \
+                                          outer_iter, \
+                                          size_per_buffer_kernel, \
+                                          ii, \
+                                          threads_needed_per_chunk); \
+                    break; \
+                case 8: \
+                    all_sum<8, data_type>(index, \
+                                          send_buf, \
+                                          out_buffer, \
+                                          recv_size, \
+                                          threads_already_processed[ii], \
+                                          (void **)temp_buffer, \
+                                          temp_rank, \
+                                          outer_iter, \
+                                          size_per_buffer_kernel, \
+                                          ii, \
+                                          threads_needed_per_chunk); \
+                    break; \
+                case 10: \
+                    all_sum<10, data_type>(index, \
+                                           send_buf, \
+                                           out_buffer, \
+                                           recv_size, \
+                                           threads_already_processed[ii], \
+                                           (void **)temp_buffer, \
+                                           temp_rank, \
+                                           outer_iter, \
+                                           size_per_buffer_kernel, \
+                                           ii, \
+                                           threads_needed_per_chunk); \
+                    break; \
+                case 12: \
+                    all_sum<12, data_type>(index, \
+                                           send_buf, \
+                                           out_buffer, \
+                                           recv_size, \
+                                           threads_already_processed[ii], \
+                                           (void **)temp_buffer, \
+                                           temp_rank, \
+                                           outer_iter, \
+                                           size_per_buffer_kernel, \
+                                           ii, \
+                                           threads_needed_per_chunk); \
+                    break; \
+                case 14: \
+                    all_sum<14, data_type>(index, \
+                                           send_buf, \
+                                           out_buffer, \
+                                           recv_size, \
+                                           threads_already_processed[ii], \
+                                           (void **)temp_buffer, \
+                                           temp_rank, \
+                                           outer_iter, \
+                                           size_per_buffer_kernel, \
+                                           ii, \
+                                           threads_needed_per_chunk); \
+                    break; \
+                case 16: \
+                    all_sum<16, data_type>(index, \
+                                           send_buf, \
+                                           out_buffer, \
+                                           recv_size, \
+                                           threads_already_processed[ii], \
+                                           (void **)temp_buffer, \
+                                           temp_rank, \
+                                           outer_iter, \
+                                           size_per_buffer_kernel, \
+                                           ii, \
+                                           threads_needed_per_chunk); \
+                    break; \
+                default: break; \
+            } \
+        } \
+    } // end of RUN_THIRD_KERNEL — NOTE(review): guard above tests SECOND_KERNEL, not THIRD_KERNEL; confirm intended
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_reduce_read_write(int *even_ranks,
+                              int my_rank_index,
+                              int idx,
+                              void **in_buffers,
+                              void *out_buffer,
+                              uint32_t recv_size,
+                              int threads_already_processed,
+                              void *temp_buffer[],
+                              uint32_t temp_rank,
+                              int outer_iter,
+                              int size_per_buffer_kernel,
+                              int buffer_index_kernel,
+                              uint32_t threads_needed_per_chunk) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int chunk_size = threads_needed_per_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    int read_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+
+    data_type *mdfi_ptr = (data_type *)in_buffers[temp_rank ^ 1];
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> mdfi_buffer;
+    data_type *local_ptr = (data_type *)in_buffers[temp_rank];
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> local_buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r]; // read this even rank's own chunk (unlike local_copy, no ^1 here)
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            mdfi_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>(mdfi_ptr + rr * recv_size + read_offset + i * SIMD_COMPUTE);
+            local_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>(local_ptr + rr * recv_size + read_offset + i * SIMD_COMPUTE);
+        }
+    }
+
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> sum;
+    if (even_ranks[0] == 0) {
+#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            sum.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) =
+                local_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) +
+                mdfi_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+        }
+    }
+    else {
+#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            sum.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) =
+                mdfi_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) +
+                local_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+        }
+    }
+
+    //store the result to the buffer
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r];
+        data_type *write_ptr = (data_type *)temp_buffer[rr];
+        write_ptr += size_per_buffer_kernel * buffer_index_kernel;
+        int out_offset = (temp_rank / 2) * chunk_size + idx * SIMD_COMPUTE * UNROLL_SIZE;
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::write_back> //scatter the partial sums into each destination rank's temp slot.
+                (write_ptr + out_offset + i * SIMD_COMPUTE,
+                 sum.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i));
+        }
+    }
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void local_copy(int *even_ranks,
+                int idx,
+                const void *in_buffer,
+                uint32_t recv_size,
+                int threads_already_processed,
+                void *temp_buffer[],
+                uint32_t temp_rank,
+                int outer_iter,
+                int size_per_buffer_kernel,
+                int buffer_index_kernel,
+                uint32_t threads_needed_per_chunk) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int chunk_size = threads_needed_per_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    // even rank copies odd chunks
+    int read_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r] ^ 1; // even rank copies odd chunks
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)in_buffer + rr * recv_size + read_offset +
+                                                     i * SIMD_COMPUTE);
+        }
+    }
+
+    // write to myrank's second half of the temp buffer
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    ptr += size_per_buffer_kernel * buffer_index_kernel;
+    ptr += chunk_size * TEMP_WORLD / 2 + idx * SIMD_COMPUTE * UNROLL_SIZE;
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::write_back>(
+                ptr + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+        }
+        ptr += chunk_size;
+    }
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void reduce_read_write(int *even_ranks,
+                       int my_rank_index,
+                       int idx,
+                       const void *in_buffer,
+                       void *out_buffer,
+                       uint32_t recv_size,
+                       int threads_already_processed,
+                       void *temp_buffer[],
+                       uint32_t temp_rank,
+                       int outer_iter,
+                       int size_per_buffer_kernel,
+                       int buffer_index_kernel,
+                       uint32_t threads_needed_per_chunk) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int chunk_size = threads_needed_per_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+
+    data_type *mdfi_ptr = (data_type *)temp_buffer[temp_rank ^ 1];
+    mdfi_ptr += size_per_buffer_kernel * buffer_index_kernel;
+    int mdfi_offset = chunk_size * TEMP_WORLD / 2 + idx * SIMD_COMPUTE * UNROLL_SIZE;
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> mdfi_buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            mdfi_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)mdfi_ptr + mdfi_offset + i * SIMD_COMPUTE);
+        }
+        mdfi_ptr += chunk_size;
+    }
+
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    int read_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> local_buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r];
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            local_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)in_buffer + rr * recv_size + read_offset +
+                                                     i * SIMD_COMPUTE);
+        }
+    }
+
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> sum;
+    if (even_ranks[0] == 0) {
+#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            sum.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) =
+                local_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) +
+                mdfi_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+        }
+    }
+    else {
+#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            sum.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) =
+                mdfi_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) +
+                local_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+        }
+    }
+
+    //store the result to the first half of the buffer
+    if (TEMP_WORLD > 2) {
+        //#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            int rr = even_ranks[r];
+            data_type *write_ptr = (data_type *)temp_buffer[rr];
+            write_ptr += size_per_buffer_kernel * buffer_index_kernel;
+            int out_offset = (temp_rank / 2) * chunk_size + idx * SIMD_COMPUTE * UNROLL_SIZE;
+#pragma unroll
+            for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+                lsc_block_store<data_type,
+                                SIMD_COMPUTE,
+                                lsc_data_size::default_size,
+                                cache_hint::uncached,
+                                cache_hint::write_back> //save the partial sums in the first half of the temp slot.
+                    (write_ptr + out_offset + i * SIMD_COMPUTE,
+                     sum.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i));
+            }
+        }
+    }
+    else {
+        // directly write to output
+        data_type *write_ptr = (data_type *)out_buffer;
+        write_ptr += (idx + threads_already_processed) * SIMD_COMPUTE * UNROLL_SIZE;
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::write_back> //TEMP_WORLD == 2: write the sum directly to the output buffer.
+                (write_ptr + i * SIMD_COMPUTE, sum.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+    }
+}
+
+template <uint32_t TEMP_WORLD, typename data_type>
+void all_sum(int idx,
+             const void *in_buffer,
+             void *out_buffer,
+             uint32_t recv_size,
+             int threads_already_processed,
+             void *temp_buffer[],
+             uint32_t temp_rank,
+             int outer_iter,
+             int size_per_buffer_kernel,
+             int buffer_index_kernel,
+             uint32_t threads_needed_per_chunk) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int chunk_size = threads_needed_per_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    //read the input data
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    ptr += size_per_buffer_kernel * buffer_index_kernel;
+    int read_offset = idx * SIMD_COMPUTE * UNROLL_SIZE;
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * UNROLL_SIZE * r + i * SIMD_COMPUTE) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::cached>(ptr + read_offset + i * SIMD_COMPUTE);
+        }
+        ptr += chunk_size;
+    }
+    simd<data_type, SIMD_COMPUTE *UNROLL_SIZE> sum = 0;
+#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#if 0
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            sum.template select<SIMD_COMPUTE, 1>(i * SIMD_COMPUTE) += buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i);
+        }
+#else
+        sum = sum + buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+#endif
+    }
+
+    //store the result
+    data_type *write_ptr = (data_type *)out_buffer;
+    int write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * UNROLL_SIZE;
+    if (write_offset + SIMD_COMPUTE * UNROLL_SIZE <= recv_size) {
+        write_ptr += write_offset;
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::write_back> //store the final reduced result to the output buffer.
+                (write_ptr + i * SIMD_COMPUTE, sum.template select<SIMD_COMPUTE, 1>(i * SIMD_COMPUTE));
+        }
+    }
+    else {
+        for (uint32_t i = write_offset; i < recv_size; i++)
+            *(write_ptr + i) = sum[i - write_offset];
+    }
+}
+
+template <typename dtype>
+class ReduceScatterLargeKernel;
+
+template <typename dtype>
+class ReduceScatterLargeNoCopyKernel;
+
+template <typename dtype>
+class ReduceScatterLargeKernel_GlobalSync;
+template <typename dtype>
+class ReduceScatterLargeKernel_LocalSync;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK, uint32_t max_buffer = 1024 /*KB*/>
+class sycl_reduce_scatter_large : public sycl_coll_base<data_type> {
+public:
+    sycl_reduce_scatter_large() : sycl_coll_base<data_type>() {
+        size_per_buffer = 0;
+        buffer_index = 0;
+    }
+
+    void init(sycl::queue &queue, ccl_comm *comm, ccl_stream *stream, uint32_t rank_in, uint32_t world_in) {
+        //using namespace __ESIMD_NS;
+        //using namespace __ESIMD_ENS;
+
+        rank = rank_in;
+        world = world_in;
+        // temporary buffer, used internally by this reduce_scatter implementation only.
+        size_t alloc_size;
+        if (ccl::global_data::env().reduce_scatter_use_tmp_buf) {
+            max_count_per_rank = (MAX_COUNT + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE) *
+                                 SIMD_COMPUTE * UNROLL_SIZE;
+            data_size_per_buffer = max_count_per_rank * world;
+            size_per_buffer = data_size_per_buffer * sizeof(data_type) + SYNC_BYTE;
+            alloc_size = size_per_buffer * COPY_BUFFER_COUNT;
+        }
+        else { // use half of the size
+            max_count_per_rank = (MAX_COUNT + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE) *
+                                 SIMD_COMPUTE * UNROLL_SIZE;
+            data_size_per_buffer = max_count_per_rank * world / 2;
+            size_per_buffer = data_size_per_buffer * sizeof(data_type) + SYNC_BYTE;
+            alloc_size = size_per_buffer * NOCOPY_BUFFER_COUNT;
+        }
+        void *local_buffer = sycl::malloc_device(alloc_size, queue);
+        auto e = queue.memset(local_buffer, 0, alloc_size);
+        e.wait();
+
+        // XXX: gain access to remote pointers
+        this->exchange_peer_ipc_mem(queue,
+                                    comm,
+                                    stream,
+                                    local_buffer,
+                                    NULL,
+                                    rank,
+                                    world,
+                                    data_size_per_buffer * sizeof(data_type),
+                                    (void **)buffers,
+                                    (void **)sync_buffer,
+                                    offsets,
+                                    ipc_handle,
+                                    NULL,
+                                    NULL /* mmap_buffers */,
+                                    false /* to_cache */);
+        this->initialized = true;
+
+        global_stream = stream;
+        global_comm = comm;
+        even_comm = global_comm->get_even_comm().get();
+    }
+
+    ccl::event reduce_scatter(sycl::queue &queue,
+                              const void *send_buf,
+                              void *out_buffer,
+                              uint32_t recv_size,
+                              bool &done) {
+        if (ccl::global_data::env().reduce_scatter_use_tmp_buf) {
+            return reduce_scatter_copy(queue, send_buf, out_buffer, recv_size, done);
+        }
+        else {
+            return reduce_scatter_nocopy(queue, send_buf, out_buffer, recv_size, done);
+        }
+    }
+
+private:
+    ccl::event reduce_scatter_copy(sycl::queue &queue,
+                                   const void *send_buf,
+                                   void *out_buffer,
+                                   uint32_t recv_size,
+                                   bool &done) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        done = true;
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+
+        if (recv_size / (SIMD_COMPUTE * UNROLL_SIZE) < temp_world) {
+            done = false;
+            return ccl::event::create_from_native(e);
+        }
+
+        int even_ranks[max_rank];
+        int my_rank_index = -1;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank)
+                my_rank_index = i;
+            //printf("even rank %d: %d neighbor: %d\n", i, even_ranks[i], even_ranks[i] ^ 1);
+        }
+        int size_per_buffer_kernel __attribute__((unused)) = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel __attribute__((unused)) =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        //int buffer_index_kernel = buffer_index;
+        int outerloop_iter_count; //Since 16 elements in the temp buffer are used to produce 8 output elements, the outer loop count must roughly double.
+        int sync_reset_counter = 0;
+        int buffer_index_kernel_for_sync = buffer_index;
+        int outer_iter;
+
+        //this is the outerloop count that requires full hw thread count.
+        //This doesnt include the outloop iteration that only needs partial thread count
+        outerloop_iter_count = recv_size / max_count_per_rank;
+
+        //uint32_t total_threads_needed_sync = 1;
+        int wg_size __attribute__((unused)) = 1;
+        int start, end;
+
+        //printf("[%d] max_count_per_rank: %d max_threads_per_MAX_COUNT: %d max_elements_per_MAX_COUNT: %d outerloop_iter_count: %d\n",
+        // temp_rank, max_count_per_rank, max_threads_per_MAX_COUNT, max_elements_per_MAX_COUNT, outerloop_iter_count);
+        //init the sw pipeline
+        int sw_pipeline_insert_index = 0;
+        int sw_pipeline_insert_counter = 0;
+        int sw_pipeline_kernel_state[COPY_KERNEL_NUM];
+        int threads_already_processed[COPY_KERNEL_NUM];
+        for (int i = 0; i < COPY_KERNEL_NUM; i++) {
+            threads_already_processed[i] = 0;
+            sw_pipeline_kernel_state[i] = NO_KERNEL;
+        }
+
+        int first_iter = 1;
+        for (int iter = 0; iter < 2; iter++) {
+            uint32_t total_threads_needed;
+            uint32_t threads_needed_per_chunk;
+            if (iter == 1) //if second iteration, then handle the partial usage of the temp buffer
+            {
+                //if there is little more left to compute, then finish them
+                if (outerloop_iter_count * max_count_per_rank < recv_size) {
+                    start = outerloop_iter_count;
+                    end = start + 1;
+                    //total_threads_needed = (recv_size - start * max_elements_per_MAX_COUNT + SIMD_COMPUTE * temp_world - 1) / (SIMD_COMPUTE * temp_world);
+                    uint32_t leftover = recv_size - outerloop_iter_count * max_count_per_rank;
+                    threads_needed_per_chunk =
+                        (leftover + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE);
+                }
+                else {
+                    break;
+                }
+            }
+            else {
+                start = 0;
+                end = outerloop_iter_count;
+                //total_threads_needed = max_threads_per_MAX_COUNT;
+                threads_needed_per_chunk = max_count_per_rank / (SIMD_COMPUTE * UNROLL_SIZE);
+
+                if (end == 0)
+                    continue; //there is nothing to do when end is 0 so check the next iter.
+            }
+            total_threads_needed = threads_needed_per_chunk;
+
+            int innerloop_iter_count = (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+
+            //printf("[%d] iter: %d outer_iter start: %d end: %d\n", temp_rank, iter, start, end);
+            for (outer_iter = start; outer_iter < end + COPY_KERNEL_NUM - 1; outer_iter++) {
+                //if more outer_iter remaining since there is more new processing to do, then insert them to the SW pipeline.
+                //During the sw pipeline tail, there is nothing to dispatch.
+                if (outer_iter < end) {
+                    sw_pipeline_kernel_state[sw_pipeline_insert_index] = FIRST_KERNEL;
+                    threads_already_processed[sw_pipeline_insert_index] = sw_pipeline_insert_counter;
+                    sw_pipeline_insert_index++;
+                    if (sw_pipeline_insert_index >= COPY_KERNEL_NUM) {
+                        sw_pipeline_insert_index =
+                            0; //By the time the index wraps arounds, the kernel that was in this slot previously has already completed.
+                    }
+                    sw_pipeline_insert_counter += threads_needed_per_chunk;
+                }
+
+                // printf("[%d] outer_iter: %d threads_already_processed: %d %d %d sw_pipeline_kernel_state: %x %x %x\n", temp_rank, outer_iter,
+                // threads_already_processed[0], threads_already_processed[1], threads_already_processed[2], sw_pipeline_kernel_state[0],
+                // sw_pipeline_kernel_state[1], sw_pipeline_kernel_state[2]);
+
+                if (!first_iter) {
+                    //sync all the ranks within the single GPU.
+                    e = global_sync(queue,
+                                    temp_rank,
+                                    temp_world,
+                                    size_per_buffer_for_sync_kernel * buffer_index_kernel_for_sync,
+                                    4,
+                                    1);
+                    sync_reset_counter++;
+                    buffer_index_kernel_for_sync++;
+                    buffer_index_kernel_for_sync %= COPY_BUFFER_COUNT;
+                }
+                first_iter = 0;
+
+                e = queue.submit([&](sycl::handler &cgh) {
+                        cgh.parallel_for<class ReduceScatterLargeKernel<data_type>>(
+                            sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                            {
+                            //check if there is any kernel in the SW pipelines. If yes, execute them.
+                            //to optimize, the order of loop i=0,1,2,.. can be shuffled so that different
+                            //ranks can do different kernels at particular time. The purpose is to better
+                            //balance the HW resource usage in the PVC node.
+                            for (int ii = 0; ii < COPY_KERNEL_NUM; ii++) {
+                                //wrap the SW pipeline index so that it is [0, KERNEL_NUM - 1]. Used instead of the expensive modulo.
+                                //sycl::_V1::ext::oneapi::experimental::printf("Kernel rank%d  %d - %x  innerloop_iter_count: %d\n", temp_rank, ii, sw_pipeline_kernel_state[ii], innerloop_iter_count);
+
+                                if (sw_pipeline_kernel_state[ii] & FIRST_KERNEL) {
+                                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                                        if ((uint32_t)index >= total_threads_needed)
+                                            break;
+                                        switch (temp_world) {
+                                            case 2:
+                                                local_copy<2, data_type>((int *)even_ranks,
+                                                                         index,
+                                                                         send_buf,
+                                                                         recv_size,
+                                                                         threads_already_processed[ii],
+                                                                         (void **)temp_buffer,
+                                                                         temp_rank,
+                                                                         outer_iter,
+                                                                         size_per_buffer_kernel,
+                                                                         ii,
+                                                                         threads_needed_per_chunk);
+                                                break;
+                                            case 4:
+                                                local_copy<4, data_type>((int *)even_ranks,
+                                                                         index,
+                                                                         send_buf,
+                                                                         recv_size,
+                                                                         threads_already_processed[ii],
+                                                                         (void **)temp_buffer,
+                                                                         temp_rank,
+                                                                         outer_iter,
+                                                                         size_per_buffer_kernel,
+                                                                         ii,
+                                                                         threads_needed_per_chunk);
+                                                break;
+                                            case 6:
+                                                local_copy<6, data_type>((int *)even_ranks,
+                                                                         index,
+                                                                         send_buf,
+                                                                         recv_size,
+                                                                         threads_already_processed[ii],
+                                                                         (void **)temp_buffer,
+                                                                         temp_rank,
+                                                                         outer_iter,
+                                                                         size_per_buffer_kernel,
+                                                                         ii,
+                                                                         threads_needed_per_chunk);
+                                                break;
+                                            case 8:
+                                                local_copy<8, data_type>((int *)even_ranks,
+                                                                         index,
+                                                                         send_buf,
+                                                                         recv_size,
+                                                                         threads_already_processed[ii],
+                                                                         (void **)temp_buffer,
+                                                                         temp_rank,
+                                                                         outer_iter,
+                                                                         size_per_buffer_kernel,
+                                                                         ii,
+                                                                         threads_needed_per_chunk);
+                                                break;
+                                            case 10:
+                                                local_copy<10, data_type>((int *)even_ranks,
+                                                                          index,
+                                                                          send_buf,
+                                                                          recv_size,
+                                                                          threads_already_processed[ii],
+                                                                          (void **)temp_buffer,
+                                                                          temp_rank,
+                                                                          outer_iter,
+                                                                          size_per_buffer_kernel,
+                                                                          ii,
+                                                                          threads_needed_per_chunk);
+                                                break;
+                                            case 12:
+                                                local_copy<12, data_type>((int *)even_ranks,
+                                                                          index,
+                                                                          send_buf,
+                                                                          recv_size,
+                                                                          threads_already_processed[ii],
+                                                                          (void **)temp_buffer,
+                                                                          temp_rank,
+                                                                          outer_iter,
+                                                                          size_per_buffer_kernel,
+                                                                          ii,
+                                                                          threads_needed_per_chunk);
+                                                break;
+                                            case 14:
+                                                local_copy<14, data_type>((int *)even_ranks,
+                                                                          index,
+                                                                          send_buf,
+                                                                          recv_size,
+                                                                          threads_already_processed[ii],
+                                                                          (void **)temp_buffer,
+                                                                          temp_rank,
+                                                                          outer_iter,
+                                                                          size_per_buffer_kernel,
+                                                                          ii,
+                                                                          threads_needed_per_chunk);
+                                                break;
+                                            case 16:
+                                                local_copy<16, data_type>((int *)even_ranks,
+                                                                          index,
+                                                                          send_buf,
+                                                                          recv_size,
+                                                                          threads_already_processed[ii],
+                                                                          (void **)temp_buffer,
+                                                                          temp_rank,
+                                                                          outer_iter,
+                                                                          size_per_buffer_kernel,
+                                                                          ii,
+                                                                          threads_needed_per_chunk);
+                                                break;
+                                            default: break;
+                                        }
+                                    }
+                                } // end of if FIRST_KERNEL
+                                if (sw_pipeline_kernel_state[ii] & SECOND_KERNEL) {
+                                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                                        if ((uint32_t)index >= total_threads_needed)
+                                            break;
+                                        switch (temp_world) {
+                                            case 2:
+                                                reduce_read_write<2, data_type>((int *)even_ranks,
+                                                                                my_rank_index,
+                                                                                index,
+                                                                                send_buf,
+                                                                                out_buffer,
+                                                                                recv_size,
+                                                                                threads_already_processed[ii],
+                                                                                (void **)temp_buffer,
+                                                                                temp_rank,
+                                                                                outer_iter,
+                                                                                size_per_buffer_kernel,
+                                                                                ii,
+                                                                                threads_needed_per_chunk);
+                                                break;
+                                            case 4:
+                                                reduce_read_write<4, data_type>((int *)even_ranks,
+                                                                                my_rank_index,
+                                                                                index,
+                                                                                send_buf,
+                                                                                out_buffer,
+                                                                                recv_size,
+                                                                                threads_already_processed[ii],
+                                                                                (void **)temp_buffer,
+                                                                                temp_rank,
+                                                                                outer_iter,
+                                                                                size_per_buffer_kernel,
+                                                                                ii,
+                                                                                threads_needed_per_chunk);
+                                                break;
+                                            case 6:
+                                                reduce_read_write<6, data_type>((int *)even_ranks,
+                                                                                my_rank_index,
+                                                                                index,
+                                                                                send_buf,
+                                                                                out_buffer,
+                                                                                recv_size,
+                                                                                threads_already_processed[ii],
+                                                                                (void **)temp_buffer,
+                                                                                temp_rank,
+                                                                                outer_iter,
+                                                                                size_per_buffer_kernel,
+                                                                                ii,
+                                                                                threads_needed_per_chunk);
+                                                break;
+                                            case 8:
+                                                reduce_read_write<8, data_type>((int *)even_ranks,
+                                                                                my_rank_index,
+                                                                                index,
+                                                                                send_buf,
+                                                                                out_buffer,
+                                                                                recv_size,
+                                                                                threads_already_processed[ii],
+                                                                                (void **)temp_buffer,
+                                                                                temp_rank,
+                                                                                outer_iter,
+                                                                                size_per_buffer_kernel,
+                                                                                ii,
+                                                                                threads_needed_per_chunk);
+                                                break;
+                                            case 10:
+                                                reduce_read_write<10, data_type>((int *)even_ranks,
+                                                                                 my_rank_index,
+                                                                                 index,
+                                                                                 send_buf,
+                                                                                 out_buffer,
+                                                                                 recv_size,
+                                                                                 threads_already_processed[ii],
+                                                                                 (void **)temp_buffer,
+                                                                                 temp_rank,
+                                                                                 outer_iter,
+                                                                                 size_per_buffer_kernel,
+                                                                                 ii,
+                                                                                 threads_needed_per_chunk);
+                                                break;
+                                            case 12:
+                                                reduce_read_write<12, data_type>((int *)even_ranks,
+                                                                                 my_rank_index,
+                                                                                 index,
+                                                                                 send_buf,
+                                                                                 out_buffer,
+                                                                                 recv_size,
+                                                                                 threads_already_processed[ii],
+                                                                                 (void **)temp_buffer,
+                                                                                 temp_rank,
+                                                                                 outer_iter,
+                                                                                 size_per_buffer_kernel,
+                                                                                 ii,
+                                                                                 threads_needed_per_chunk);
+                                                break;
+                                            case 14:
+                                                reduce_read_write<14, data_type>((int *)even_ranks,
+                                                                                 my_rank_index,
+                                                                                 index,
+                                                                                 send_buf,
+                                                                                 out_buffer,
+                                                                                 recv_size,
+                                                                                 threads_already_processed[ii],
+                                                                                 (void **)temp_buffer,
+                                                                                 temp_rank,
+                                                                                 outer_iter,
+                                                                                 size_per_buffer_kernel,
+                                                                                 ii,
+                                                                                 threads_needed_per_chunk);
+                                                break;
+                                            case 16:
+                                                reduce_read_write<16, data_type>((int *)even_ranks,
+                                                                                 my_rank_index,
+                                                                                 index,
+                                                                                 send_buf,
+                                                                                 out_buffer,
+                                                                                 recv_size,
+                                                                                 threads_already_processed[ii],
+                                                                                 (void **)temp_buffer,
+                                                                                 temp_rank,
+                                                                                 outer_iter,
+                                                                                 size_per_buffer_kernel,
+                                                                                 ii,
+                                                                                 threads_needed_per_chunk);
+                                                break;
+                                            default: break;
+                                        }
+                                    }
+                                } // end of if SECOND_KERNEL
+                                if ((sw_pipeline_kernel_state[ii] & THIRD_KERNEL) && temp_world > 2) {
+                                    for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                                        int index = idx + inner_iter * HW_THREAD_COUNT;
+                                        if ((uint32_t)index >= total_threads_needed)
+                                            break;
+                                        switch (temp_world) {
+                                            case 2:
+                                                all_sum<2, data_type>(index,
+                                                                      send_buf,
+                                                                      out_buffer,
+                                                                      recv_size,
+                                                                      threads_already_processed[ii],
+                                                                      (void **)temp_buffer,
+                                                                      temp_rank,
+                                                                      outer_iter,
+                                                                      size_per_buffer_kernel,
+                                                                      ii,
+                                                                      threads_needed_per_chunk);
+                                                break;
+                                            case 4:
+                                                all_sum<4, data_type>(index,
+                                                                      send_buf,
+                                                                      out_buffer,
+                                                                      recv_size,
+                                                                      threads_already_processed[ii],
+                                                                      (void **)temp_buffer,
+                                                                      temp_rank,
+                                                                      outer_iter,
+                                                                      size_per_buffer_kernel,
+                                                                      ii,
+                                                                      threads_needed_per_chunk);
+                                                break;
+                                            case 6:
+                                                all_sum<6, data_type>(index,
+                                                                      send_buf,
+                                                                      out_buffer,
+                                                                      recv_size,
+                                                                      threads_already_processed[ii],
+                                                                      (void **)temp_buffer,
+                                                                      temp_rank,
+                                                                      outer_iter,
+                                                                      size_per_buffer_kernel,
+                                                                      ii,
+                                                                      threads_needed_per_chunk);
+                                                break;
+                                            case 8:
+                                                all_sum<8, data_type>(index,
+                                                                      send_buf,
+                                                                      out_buffer,
+                                                                      recv_size,
+                                                                      threads_already_processed[ii],
+                                                                      (void **)temp_buffer,
+                                                                      temp_rank,
+                                                                      outer_iter,
+                                                                      size_per_buffer_kernel,
+                                                                      ii,
+                                                                      threads_needed_per_chunk);
+                                                break;
+                                            case 10:
+                                                all_sum<10, data_type>(index,
+                                                                       send_buf,
+                                                                       out_buffer,
+                                                                       recv_size,
+                                                                       threads_already_processed[ii],
+                                                                       (void **)temp_buffer,
+                                                                       temp_rank,
+                                                                       outer_iter,
+                                                                       size_per_buffer_kernel,
+                                                                       ii,
+                                                                       threads_needed_per_chunk);
+                                                break;
+                                            case 12:
+                                                all_sum<12, data_type>(index,
+                                                                       send_buf,
+                                                                       out_buffer,
+                                                                       recv_size,
+                                                                       threads_already_processed[ii],
+                                                                       (void **)temp_buffer,
+                                                                       temp_rank,
+                                                                       outer_iter,
+                                                                       size_per_buffer_kernel,
+                                                                       ii,
+                                                                       threads_needed_per_chunk);
+                                                break;
+                                            case 14:
+                                                all_sum<14, data_type>(index,
+                                                                       send_buf,
+                                                                       out_buffer,
+                                                                       recv_size,
+                                                                       threads_already_processed[ii],
+                                                                       (void **)temp_buffer,
+                                                                       temp_rank,
+                                                                       outer_iter,
+                                                                       size_per_buffer_kernel,
+                                                                       ii,
+                                                                       threads_needed_per_chunk);
+                                                break;
+                                            case 16:
+                                                all_sum<16, data_type>(index,
+                                                                       send_buf,
+                                                                       out_buffer,
+                                                                       recv_size,
+                                                                       threads_already_processed[ii],
+                                                                       (void **)temp_buffer,
+                                                                       temp_rank,
+                                                                       outer_iter,
+                                                                       size_per_buffer_kernel,
+                                                                       ii,
+                                                                       threads_needed_per_chunk);
+                                                break;
+                                            default: break;
+                                        }
+                                    }
+                                } // end of if THIRD_KERNEL
+                            } // end of for
+                         });//parallel_for
+                }); //submit()
+                //e.wait();
+
+                //update the sw pipeline process state so that next kernel will be processed in next round
+                for (int i = 0; i < COPY_KERNEL_NUM; i++) {
+                    if (sw_pipeline_kernel_state[i] & COPY_LAST_KERNEL)
+                        sw_pipeline_kernel_state[i] =
+                            0; //remove the kernel from the sw pipeline if it is fifth kernel. Everything is already executed.
+                    else
+                        sw_pipeline_kernel_state[i] <<= 1;
+                }
+
+                //std::cout << "rank" << temp_rank << " iter" << iter << " outer_iter" << outer_iter << " kernel1 done." << "\n";
+
+            } // end of outer_iter
+        } // end of for iter = 2
+
+        buffer_index += sync_reset_counter;
+        buffer_index %= COPY_BUFFER_COUNT;
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // Software-pipelined "no-copy" large reduce_scatter: peers read each
+    // other's send buffers directly via IPC-exchanged pointers instead of
+    // staging data through the copy buffers. Sets done=false (without doing
+    // any work) when recv_size is too small for this path so the caller can
+    // fall back to another algorithm.
+    // NOTE(review): the actual device work comes from the RUN_SECOND_KERNEL /
+    // RUN_THIRD_KERNEL macros defined elsewhere in this header; several host
+    // locals (even_ranks, my_rank_index, innerloop_iter_count, ...) appear
+    // unused here but are presumably captured by those macro bodies - confirm
+    // before removing any of them.
+    ccl::event reduce_scatter_nocopy(sycl::queue &queue,
+                                     const void *send_buf,
+                                     void *out_buffer,
+                                     uint32_t recv_size,
+                                     bool &done) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        done = true;
+        // Snapshot the per-rank pointers into plain local arrays so the [=]
+        // device lambda below can capture them by value.
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+
+        // Not enough data to give every rank at least one SIMD_COMPUTE *
+        // UNROLL_SIZE chunk - decline and let the caller pick another path.
+        if (recv_size / (SIMD_COMPUTE * UNROLL_SIZE) < temp_world) {
+            done = false;
+            return ccl::event::create_from_native(e);
+        }
+
+        // Map even_comm's local ranks to global ranks and remember this
+        // rank's index within that list.
+        int even_ranks[max_rank];
+        int my_rank_index = -1;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank)
+                my_rank_index = i;
+            //printf("even rank %d: %d neighbor: %d\n", i, even_ranks[i], even_ranks[i] ^ 1);
+        }
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel = size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        int outerloop_iter_count; //Since 16 elements in temp buffer is used to process 8 element output, the outer loop count must be doubled roughly.
+        int sync_reset_counter = 0;
+        int buffer_index_kernel_for_sync = buffer_index;
+        int outer_iter;
+
+        int max_elements_per_MAX_COUNT __attribute__((unused)) = (recv_size + SIMD_COMPUTE * UNROLL_SIZE - 1) /
+                                                                 (SIMD_COMPUTE * UNROLL_SIZE) * SIMD_COMPUTE *
+                                                                 UNROLL_SIZE;
+        int max_threads_per_MAX_COUNT __attribute__((unused)) = max_count_per_rank / (SIMD_COMPUTE * UNROLL_SIZE);
+
+        //this is the outerloop count that requires full hw thread count.
+        //This doesn't include the outer-loop iteration that only needs a partial thread count.
+        outerloop_iter_count = recv_size / max_count_per_rank;
+
+        //uint32_t total_threads_needed_sync = 1;
+        int wg_size __attribute__((unused)) = 1;
+        int start, end;
+
+        //printf("[%d] max_count_per_rank: %d max_threads_per_MAX_COUNT: %d max_elements_per_MAX_COUNT: %d outerloop_iter_count: %d\n",
+        // temp_rank, max_count_per_rank, max_threads_per_MAX_COUNT, max_elements_per_MAX_COUNT, outerloop_iter_count);
+        //init the sw pipeline: no stages in flight, nothing processed yet.
+        int sw_pipeline_insert_index = 0;
+        int sw_pipeline_insert_counter = 0;
+        int sw_pipeline_kernel_state[NOCOPY_KERNEL_NUM];
+        int threads_already_processed[NOCOPY_KERNEL_NUM];
+        for (int i = 0; i < NOCOPY_KERNEL_NUM; i++) {
+            threads_already_processed[i] = 0;
+            sw_pipeline_kernel_state[i] = NO_KERNEL;
+        }
+
+        //cpu_timer<1> ctimer;
+        //ctimer.start(0);
+        // Exchange IPC handles so every rank can read the peers' send buffers
+        // directly (results land in in_buffers).
+        void *in_buffers[max_rank];
+        this->exchange_peer_ipc_mem(queue,
+                                    global_comm,
+                                    global_stream,
+                                    (void **)send_buf,
+                                    NULL,
+                                    rank,
+                                    world,
+                                    0,
+                                    (void **)in_buffers,
+                                    NULL,
+                                    NULL,
+                                    NULL,
+                                    NULL);
+        //ctimer.stop(0);
+        //printf("exchange_peer_ipc_mem time: %fus\n", ctimer.get_us(0));
+
+        // iter 0 handles the full-size chunks; iter 1 handles the leftover
+        // partial chunk (if any).
+        int first_iter = 1;
+        for (int iter = 0; iter < 2; iter++) {
+            uint32_t total_threads_needed;
+            uint32_t threads_needed_per_chunk;
+            if (iter == 1) //if second iteration, then handle the partial usage of the temp buffer
+            {
+                //if there is little more left to compute, then finish them
+                if (outerloop_iter_count * max_count_per_rank < recv_size) {
+                    start = outerloop_iter_count;
+                    end = start + 1;
+                    //total_threads_needed = (recv_size - start * max_elements_per_MAX_COUNT + SIMD_COMPUTE * temp_world - 1) / (SIMD_COMPUTE * temp_world);
+                    uint32_t leftover = recv_size - outerloop_iter_count * max_count_per_rank;
+                    threads_needed_per_chunk =
+                        (leftover + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE);
+                }
+                else {
+                    break;
+                }
+            }
+            else {
+                start = 0;
+                end = outerloop_iter_count;
+                //total_threads_needed = max_threads_per_MAX_COUNT;
+                threads_needed_per_chunk = max_count_per_rank / (SIMD_COMPUTE * UNROLL_SIZE);
+
+                if (end == 0)
+                    continue; //there is nothing to do when end is 0 so check the next iter.
+            }
+            total_threads_needed = threads_needed_per_chunk;
+
+            int innerloop_iter_count = (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+
+            //printf("iter: %d outer_iter start: %d end: %d\n", iter, start, end);
+            // Run NOCOPY_KERNEL_NUM - 1 extra iterations so the SW pipeline
+            // can drain the stages that are still in flight.
+            for (outer_iter = start; outer_iter < end + NOCOPY_KERNEL_NUM - 1; outer_iter++) {
+                //if more outer_iter remaining since there is more new processing to do, then insert them to the SW pipeline.
+                //During the sw pipeline tail, there is nothing to dispatch.
+                if (outer_iter < end) {
+                    sw_pipeline_kernel_state[sw_pipeline_insert_index] = FIRST_KERNEL;
+                    threads_already_processed[sw_pipeline_insert_index] = sw_pipeline_insert_counter;
+                    sw_pipeline_insert_index++;
+                    if (sw_pipeline_insert_index >= NOCOPY_KERNEL_NUM) {
+                        sw_pipeline_insert_index =
+                            0; //By the time the index wraps arounds, the kernel that was in this slot previously has already completed.
+                    }
+                    sw_pipeline_insert_counter += threads_needed_per_chunk;
+                }
+
+                if (first_iter) {
+                    // First dispatch only needs the tile pair on this GPU to
+                    // be in sync; later rounds require a full global barrier.
+                    e = local_sync(queue,
+                                   temp_rank,
+                                   temp_world,
+                                   size_per_buffer_for_sync_kernel * buffer_index_kernel_for_sync,
+                                   0,
+                                   0);
+                }
+                else {
+                    //sync all the ranks within the single GPU.
+                    // NOTE(review): comment above looks stale - global_sync()
+                    // spins until all temp_world ranks arrive, not just the
+                    // ranks of one GPU.
+                    e = global_sync(queue,
+                                    temp_rank,
+                                    temp_world,
+                                    size_per_buffer_for_sync_kernel * buffer_index_kernel_for_sync,
+                                    4,
+                                    1);
+                    sync_reset_counter++;
+                    buffer_index_kernel_for_sync++;
+                    buffer_index_kernel_for_sync %= NOCOPY_BUFFER_COUNT;
+                }
+                first_iter = 0;
+
+                e = queue.submit([&](sycl::handler &cgh) {
+                        cgh.parallel_for<class ReduceScatterLargeNoCopyKernel<data_type>>(
+                            sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                            {
+                            //ESIMD kernel
+
+                            //check if there is any kernel in the SW pipelines. If yes, execute them.
+                            //to optimize, the order of loop i=0,1,2,.. can be shuffled so that different
+                            // ranks can do different kernels at particular time.
+                            // The purpose is to better balance the HW resource usage in the PVC node.
+                            for (int ii = 0; ii < NOCOPY_KERNEL_NUM; ii++) {
+                                RUN_SECOND_KERNEL
+                                RUN_THIRD_KERNEL
+
+                            } // end of for
+                         });//parallel_for
+                }); //submit()
+                //e.wait();
+
+                //update the sw pipeline process state so that next kernel will be processed in next round
+                for (int i = 0; i < NOCOPY_KERNEL_NUM; i++) {
+                    if (sw_pipeline_kernel_state[i] & NOCOPY_LAST_KERNEL)
+                        sw_pipeline_kernel_state[i] =
+                            0; //remove the kernel from the sw pipeline if it is fifth kernel. Everything is already executed.
+                    else
+                        sw_pipeline_kernel_state[i] <<= 1;
+                }
+
+                //std::cout << "rank" << temp_rank << " iter" << iter << " outer_iter" << outer_iter << " kernel1 done." << "\n";
+
+            } // end of outer_iter
+        } // end of for iter = 2
+
+        // Persist how many sync slots this call consumed so the next
+        // collective starts on a fresh (already reset) counter slot.
+        buffer_index += sync_reset_counter;
+        buffer_index %= NOCOPY_BUFFER_COUNT;
+
+        return ccl::event::create_from_native(e);
+    }
+
+    //sync all the ranks here before consuming the results.
+    // offset = size_per_buffer_for_sync_kernel * buffer_index_kernel
<doc_update>
+    // Cross-rank barrier. A single-thread ESIMD kernel increments lane
+    // `index` of every rank's atomic counter (located `offset` ints into
+    // each sync buffer), then spin-loads its own counter until all
+    // temp_world ranks have arrived. When `reset` is nonzero the counter is
+    // stored back to 0 so the same slot can be reused by a later call.
+    sycl::event global_sync(sycl::queue queue,
+                            int temp_rank,
+                            uint32_t temp_world,
+                            int offset,
+                            int index,
+                            int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        // Copy sync-buffer pointers to a local array for by-value capture.
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1;
+        int wg_size = 1;
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class ReduceScatterLargeKernel_GlobalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                simd<ushort, SIMD_SYNC> ramp;
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //since other ranks might still be doing local_sum, we need to sync ranks here.
+                //After the sync is done, the second half of the temp buffer will be replaced with new sum val.
+                // Only lane `index` participates in the atomic inc/load; all
+                // other SIMD lanes are masked off.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true;
+
+                //sync .
+                for (uint32_t i = 0; i < temp_world; i++) {
+                    int *sync_ptr = (int *)temp_sync_buffer[i] + offset;
+
+                    lsc_atomic_update<atomic_op::inc,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(sync_ptr, ramp, pred);
+                }
+
+                //wait for all the local TG to sync. Then sync the other remote GPUs
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                // Spin until every rank has bumped this rank's counter.
+                while (status0[index] != temp_world) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true;
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    // sync tiles in a GPU
+    // Tile-pair barrier. Increments lane `index` of the atomic counter on
+    // this rank and on its neighbor (temp_rank ^ 1, i.e. the other tile of
+    // the same GPU), then spins on the local counter until both
+    // RANKS_PER_GPU tiles have arrived. `reset` re-zeroes the counter for
+    // reuse, as in global_sync().
+    sycl::event local_sync(sycl::queue queue,
+                           int temp_rank,
+                           uint32_t temp_world,
+                           int offset,
+                           int index,
+                           int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        // Copy sync-buffer pointers to a local array for by-value capture.
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1;
+        int wg_size = 1;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class ReduceScatterLargeKernel_LocalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                simd<ushort, SIMD_SYNC> ramp;
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //sync only the rank pair within the same gpu.
+                // Only lane `index` participates; other lanes are masked off.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true;
+
+                //sync .
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank ^ 1] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+                sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+
+                //wait for all the local TG to sync. Then sync the other remote GPUs
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                // Spin until both tiles of this GPU have bumped the counter.
+                while (status0[index] != RANKS_PER_GPU) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true;
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    // Tear down IPC state: for every peer rank, close its IPC mapping
+    // ((char *)buffers[i] - offsets[i] recovers the base address that was
+    // IPC-opened), free this rank's own allocation, and mark the object
+    // uninitialized so it can be init()'ed again.
+    // NOTE(review): sync_buffer pointers are not freed separately here -
+    // presumably they alias the buffers[] allocations; confirm in init().
+    void release(sycl::queue &queue) {
+        // Clean up, close/put ipc handles, free memory, etc.
+        auto l0_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(queue.get_context());
+        for (int i = 0; i < world; i++) {
+            if (i != rank) {
+                ZE_CALL(zeMemCloseIpcHandle, (l0_ctx, (char *)buffers[i] - offsets[i]));
+            }
+        }
+
+        sycl::free(buffers[rank], queue);
+        this->initialized = false;
+    }
+
+private:
+    // Per-rank base pointers to the shared temp buffers (this rank's own
+    // allocation plus IPC-opened peer mappings; see release()).
+    void *buffers[max_rank];
+    // Per-rank pointers to the atomic counters used by local_sync/global_sync.
+    void *sync_buffer[max_rank];
+    // Offset of each peer pointer within its IPC allocation; subtracted back
+    // before zeMemCloseIpcHandle in release().
+    size_t offsets[max_rank];
+    ze_ipc_mem_handle_t ipc_handle[max_rank];
+    // NOTE(review): world is initialized with invalid_err_code rather than
+    // invalid_rank - looks like a copy/paste of a sentinel; confirm intended.
+    int rank{ ccl::utils::invalid_rank }, world{ ccl::utils::invalid_err_code };
+    // Byte size of one buffer slot; converted to element counts in kernels.
+    size_t size_per_buffer{ 0 };
+    int data_size_per_buffer{ ccl::utils::invalid_bytes_value };
+    uint32_t max_count_per_rank{ 0 };
+    // Round-robin index into the multi-buffered sync-counter slots.
+    int buffer_index{ ccl::utils::invalid_err_code };
+    ccl_stream *global_stream{};
+    ccl_comm *global_comm{};
+    ccl_comm *even_comm{};
+};
+
+// Generates the public entry points for one datatype suffix:
+//   init_reduce_scatter_large_<TYPE> - lazily initializes the per-type
+//                                      singleton rs_large_<TYPE>
+//   run_reduce_scatter_large_<TYPE>  - forwards to its reduce_scatter()
+// The rs_large_<TYPE> instances are defined in the per-type .cpp files that
+// invoke this macro.
+#define REDUCE_SCATTER_LARGE_API(TYPE) \
+    void init_reduce_scatter_large_##TYPE(ccl::datatype dtype, \
+                                          sycl::queue &queue, \
+                                          ccl_comm *comm, \
+                                          ccl_stream *stream, \
+                                          uint32_t rank_in, \
+                                          uint32_t world_in) { \
+        if (!rs_large_##TYPE.inited()) { \
+            LOG_INFO("invoking large reduce_scatter first time for datatype: ", dtype); \
+            rs_large_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+    } \
+\
+    ccl::event run_reduce_scatter_large_##TYPE(ccl::datatype dtype, \
+                                               sycl::queue queue, \
+                                               const void *send_buf, \
+                                               void *recv_buf, \
+                                               size_t recv_count, \
+                                               bool &done) { \
+        return rs_large_##TYPE.reduce_scatter(queue, send_buf, recv_buf, recv_count, done); \
+    }
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_bf16.cpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_bf16.cpp
new file mode 100644
index 000000000..ad35ca142
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_bf16.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.hpp"
+
+// Per-datatype singleton and entry points for bfloat16 large reduce_scatter.
+sycl_reduce_scatter_large<sycl::_V1::ext::oneapi::bfloat16> rs_large_bf16;
+
+REDUCE_SCATTER_LARGE_API(bf16);
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_fp16.cpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_fp16.cpp
new file mode 100644
index 000000000..aeb83cb69
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_fp16.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.hpp"
+
+// Per-datatype singleton and entry points for fp16 large reduce_scatter.
+sycl_reduce_scatter_large<sycl::half> rs_large_fp16;
+
+REDUCE_SCATTER_LARGE_API(fp16);
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_fp32.cpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_fp32.cpp
new file mode 100644
index 000000000..ff2bda744
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_fp32.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.hpp"
+
+// Per-datatype singleton and entry points for fp32 large reduce_scatter.
+sycl_reduce_scatter_large<float> rs_large_fp32;
+
+REDUCE_SCATTER_LARGE_API(fp32);
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_int32.cpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_int32.cpp
new file mode 100644
index 000000000..a622e94b2
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl_int32.cpp
@@ -0,0 +1,20 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_large_sycl.hpp"
+
+// Per-datatype singleton and entry points for int32 large reduce_scatter.
+sycl_reduce_scatter_large<int32_t> rs_large_int32;
+
+REDUCE_SCATTER_LARGE_API(int32);
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_medium_sycl.cpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_medium_sycl.cpp
new file mode 100644
index 000000000..106ca5550
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_medium_sycl.cpp
@@ -0,0 +1,66 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_medium_sycl.hpp"
+
+// Per-datatype singleton instances for the medium-size reduce_scatter path.
+sycl_reduce_scatter_medium<sycl::half> rs_medium_fp16;
+sycl_reduce_scatter_medium<sycl::_V1::ext::oneapi::bfloat16> rs_medium_bf16;
+sycl_reduce_scatter_medium<int32_t> rs_medium_int32;
+sycl_reduce_scatter_medium<float> rs_medium_fp32;
+
+// Expands to one switch case that lazily initializes the matching per-type
+// singleton (rs_medium_<TYPE>); used by init_reduce_scatter_medium below.
+#define SWITCH_INIT_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        if (!rs_medium_##TYPE.inited()) { \
+            LOG_INFO("invoking medium reduce_scatter first time for datatype: ", ccl_type); \
+            rs_medium_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+        break;
+
+// Lazily initializes the medium reduce_scatter implementation for the given
+// datatype (fp16/bf16/fp32/int32). Unsupported datatypes hit assert(0).
+void init_reduce_scatter_medium(ccl::datatype dtype,
+                                sycl::queue &queue,
+                                ccl_comm *comm,
+                                ccl_stream *stream,
+                                uint32_t rank_in,
+                                uint32_t world_in) {
+    switch (dtype) {
+        SWITCH_INIT_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_INIT_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_INIT_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_INIT_TYPE(int32, ccl::datatype::int32)
+        default: assert(0);
+    }
+}
+
+// Expands to one switch case that dispatches to the per-type singleton's
+// reduce_scatter(); used by run_reduce_scatter_medium below.
+#define SWITCH_RUN_TYPE(TYPE, ccl_type) \
+    case ccl_type: \
+        e = rs_medium_##TYPE.reduce_scatter(queue, send_buf, recv_buf, recv_count, 1, false, done); \
+        break;
+
+// Runs the medium-size reduce_scatter for the given datatype; `done` reports
+// whether the implementation handled the call.
+// NOTE(review): with NDEBUG the default branch's assert(0) compiles out and a
+// default-constructed event is returned - confirm callers never reach this
+// with an unsupported dtype.
+ccl::event run_reduce_scatter_medium(ccl::datatype dtype,
+                                     sycl::queue queue,
+                                     const void *send_buf,
+                                     void *recv_buf,
+                                     size_t recv_count,
+                                     bool &done) {
+    ccl::event e;
+    switch (dtype) {
+        SWITCH_RUN_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_RUN_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_RUN_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_RUN_TYPE(int32, ccl::datatype::int32)
+        default: assert(0);
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_medium_sycl.hpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_medium_sycl.hpp
new file mode 100644
index 000000000..7e9a9eb9a
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_medium_sycl.hpp
@@ -0,0 +1,1537 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+// Tuning constants for the medium-size SYCL reduce_scatter kernels.
+// NOTE: SIMD_COMPUTE references `data_type` at its expansion site, so these
+// macros are only meaningful inside the templated kernel code below.
+#define MAX_RANK         16
+#define SIMD_COMPUTE_MAX 256
+// Elements processed per SIMD op: 256 bytes' worth of data_type.
+#define SIMD_COMPUTE     (SIMD_COMPUTE_MAX / sizeof(data_type))
+#define SIMD_SYNC        32
+#define UNROLL_SIZE      1
+#define BUFFER_COUNT     2
+// Per-buffer sync area: SIMD_SYNC ints, double-buffered.
+#define SYNC_BYTE        (SIMD_SYNC * sizeof(int) * 2)
+#define ALIGNMENT_BYTE   256
+// Max element counts per rank for the nocopy (256 MB) and copy (32 MB) paths.
+#define NOCOPY_MAX_COUNT (256 * 1024 * 1024 / sizeof(data_type))
+#define COPY_MAX_COUNT   (32 * 1024 * 1024 / sizeof(data_type))
+//#define EU_COUNT_PER_RANK 448
+#define EU_COUNT_PER_RANK   512
+#define THREAD_COUNT_PER_EU 8
+// Total hardware threads assumed available per rank for persistent kernels.
+#define HW_THREAD_COUNT     (EU_COUNT_PER_RANK * THREAD_COUNT_PER_EU)
+#define RANKS_PER_GPU       2
+
+// nocopy version: only uses half of the buffer of the copy version.
+//
+// One work-item of the no-copy reduce step. For each even rank r it loads
+// SIMD_COMPUTE * UNROLL_SIZE elements of chunk even_ranks[r] directly from
+// this rank's input buffer (in_buffers[temp_rank]) and from the MDFI-paired
+// rank's input buffer (in_buffers[temp_rank ^ 1]), adds the two, and stores
+// the pairwise sum into the owning peer's temp buffer at this rank-pair's
+// slot ((temp_rank / 2) * chunk_size).
+//
+// even_ranks: map r -> global even-rank id for this tile pair.
+// idx: thread index within the current chunk; threads_already_processed
+// offsets it into the full buffer across outer iterations.
+// NOTE(review): my_rank_index, out_buffer, outer_iter, size_per_buffer_kernel
+// and buffer_index_kernel are currently unused here (offset application is
+// commented out) — presumably kept for interface parity with the copy path.
+template <uint32_t TEMP_WORLD, typename data_type>
+void nocopy_reduce_read_write(int *even_ranks,
+                              int my_rank_index,
+                              int idx,
+                              void **in_buffers,
+                              void *out_buffer,
+                              uint32_t recv_size,
+                              int threads_already_processed,
+                              void *temp_buffer[],
+                              uint32_t temp_rank,
+                              int outer_iter,
+                              int size_per_buffer_kernel,
+                              int buffer_index_kernel,
+                              uint32_t threads_needed_per_chunk) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int chunk_size = threads_needed_per_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    int read_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+
+    // Gather this thread's slices of every even-rank chunk from the paired
+    // rank's buffer (over MDFI) and from the local buffer.
+    data_type *mdfi_ptr = (data_type *)in_buffers[temp_rank ^ 1];
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> mdfi_buffer;
+    data_type *local_ptr = (data_type *)in_buffers[temp_rank];
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> local_buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r]; // even rank copies odd chunks
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            mdfi_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>(mdfi_ptr + rr * recv_size + read_offset + i * SIMD_COMPUTE);
+            local_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>(local_ptr + rr * recv_size + read_offset + i * SIMD_COMPUTE);
+        }
+    }
+
+    // The two branches differ only in operand order (local+mdfi vs mdfi+local);
+    // presumably this keeps the summation order identical across the rank pair
+    // for bitwise-reproducible low-precision results — TODO confirm.
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> sum;
+    if (even_ranks[0] == 0) {
+#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            sum.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) =
+                local_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) +
+                mdfi_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+        }
+    }
+    else {
+#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            sum.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) =
+                mdfi_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) +
+                local_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+        }
+    }
+
+    //store the result to the buffer
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r];
+        data_type *write_ptr = (data_type *)temp_buffer[rr];
+        //write_ptr += size_per_buffer_kernel * buffer_index_kernel;
+        int out_offset = (temp_rank / 2) * chunk_size + idx * SIMD_COMPUTE * UNROLL_SIZE;
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::write_back> //save the all sum in the second half of the temp slot.
+                (write_ptr + out_offset + i * SIMD_COMPUTE,
+                 sum.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i));
+        }
+    }
+}
+
+// One work-item of the staging step of the copy path: copies this thread's
+// slice of every chunk belonging to the MDFI-paired ranks (even_ranks[r] ^ 1)
+// from the local input buffer into the second half of this rank's own temp
+// buffer (offset chunk_size * TEMP_WORLD / 2), one chunk_size stride per r.
+// The paired rank later reads this staged data in reduce_read_write.
+// NOTE(review): even for an odd temp_rank this stages the chunks the *paired*
+// rank owns; the "even rank copies odd chunks" comments describe the even
+// member of the pair — confirm symmetry for odd ranks.
+// NOTE(review): outer_iter, size_per_buffer_kernel and buffer_index_kernel are
+// currently unused (double-buffer offset is commented out).
+template <uint32_t TEMP_WORLD, typename data_type>
+void local_copy(int *even_ranks,
+                int idx,
+                const void *in_buffer,
+                uint32_t recv_size,
+                int threads_already_processed,
+                void *temp_buffer[],
+                uint32_t temp_rank,
+                int outer_iter,
+                int size_per_buffer_kernel,
+                int buffer_index_kernel,
+                uint32_t threads_needed_per_chunk) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int chunk_size = threads_needed_per_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    // even rank copies odd chunks
+    int read_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+
+    // Load this thread's slice of each paired-rank chunk from the input.
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r] ^ 1; // even rank copies odd chunks
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)in_buffer + rr * recv_size + read_offset +
+                                                     i * SIMD_COMPUTE);
+        }
+    }
+
+    // write to myrank's second half of the temp buffer
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    //ptr += size_per_buffer_kernel * buffer_index_kernel;
+    ptr += chunk_size * TEMP_WORLD / 2 + idx * SIMD_COMPUTE * UNROLL_SIZE;
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::write_back>(
+                ptr + i * SIMD_COMPUTE,
+                buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + i * SIMD_COMPUTE));
+        }
+        ptr += chunk_size;
+    }
+}
+
+// One work-item of the pairwise-reduce step of the copy path. Reads the data
+// the MDFI-paired rank staged into the second half of its temp buffer
+// (temp_buffer[temp_rank ^ 1], offset chunk_size * TEMP_WORLD / 2) and this
+// rank's own chunks from in_buffer, adds them, then:
+//   - TEMP_WORLD > 2: scatters each partial sum into the owning even peer's
+//     temp buffer at this rank-pair's slot ((temp_rank / 2) * chunk_size),
+//     to be combined later by all_sum;
+//   - TEMP_WORLD == 2: the pairwise sum is already the final result, so it is
+//     written directly to out_buffer.
+// NOTE(review): my_rank_index, outer_iter, size_per_buffer_kernel and
+// buffer_index_kernel are currently unused (offset lines are commented out).
+template <uint32_t TEMP_WORLD, typename data_type>
+void reduce_read_write(int *even_ranks,
+                       int my_rank_index,
+                       int idx,
+                       const void *in_buffer,
+                       void *out_buffer,
+                       uint32_t recv_size,
+                       int threads_already_processed,
+                       void *temp_buffer[],
+                       uint32_t temp_rank,
+                       int outer_iter,
+                       int size_per_buffer_kernel,
+                       int buffer_index_kernel,
+                       uint32_t threads_needed_per_chunk) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int chunk_size = threads_needed_per_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+
+    // Load the paired rank's staged slices (one chunk_size stride per r).
+    data_type *mdfi_ptr = (data_type *)temp_buffer[temp_rank ^ 1];
+    //mdfi_ptr += size_per_buffer_kernel * buffer_index_kernel;
+    int mdfi_offset = chunk_size * TEMP_WORLD / 2 + idx * SIMD_COMPUTE * UNROLL_SIZE;
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> mdfi_buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            mdfi_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)mdfi_ptr + mdfi_offset + i * SIMD_COMPUTE);
+        }
+        mdfi_ptr += chunk_size;
+    }
+
+    // Load this rank's own slices of the even-rank chunks from the input.
+    int abs_offset_in_chunk = idx + threads_already_processed;
+    int read_offset = abs_offset_in_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> local_buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        int rr = even_ranks[r];
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            local_buffer.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::uncached>((data_type *)in_buffer + rr * recv_size + read_offset +
+                                                     i * SIMD_COMPUTE);
+        }
+    }
+
+    // Branches differ only in operand order (local+mdfi vs mdfi+local);
+    // presumably to match summation order across the rank pair for
+    // reproducible low-precision sums — TODO confirm.
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> sum;
+    if (even_ranks[0] == 0) {
+#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            sum.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) =
+                local_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) +
+                mdfi_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+        }
+    }
+    else {
+#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            sum.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) =
+                mdfi_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE) +
+                local_buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+        }
+    }
+
+    //store the result to the first half of the buffer
+    if (TEMP_WORLD > 2) {
+        //#pragma unroll
+        for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+            int rr = even_ranks[r];
+            data_type *write_ptr = (data_type *)temp_buffer[rr];
+            //write_ptr += size_per_buffer_kernel * buffer_index_kernel;
+            int out_offset = (temp_rank / 2) * chunk_size + idx * SIMD_COMPUTE * UNROLL_SIZE;
+#pragma unroll
+            for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+                lsc_block_store<data_type,
+                                SIMD_COMPUTE,
+                                lsc_data_size::default_size,
+                                cache_hint::uncached,
+                                cache_hint::write_back> //save the all sum in the second half of the temp slot.
+                    (write_ptr + out_offset + i * SIMD_COMPUTE,
+                     sum.template select<SIMD_COMPUTE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE + SIMD_COMPUTE * i));
+            }
+        }
+    }
+    else {
+        // directly write to output
+        data_type *write_ptr = (data_type *)out_buffer;
+        write_ptr += (idx + threads_already_processed) * SIMD_COMPUTE * UNROLL_SIZE;
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::write_back> //save the all sum in the second half of the temp slot.
+                (write_ptr + i * SIMD_COMPUTE, sum.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * i));
+        }
+    }
+}
+
+// Final step (TEMP_WORLD > 2): one work-item reads the TEMP_WORLD / 2 partial
+// sums that peers scattered into this rank's temp buffer (one chunk_size
+// stride per rank pair), adds them across pairs, and writes the final
+// reduce_scatter result for this rank into out_buffer. A partial tail — where
+// a full SIMD store would run past recv_size — is written element by element.
+// NOTE(review): in_buffer, outer_iter, size_per_buffer_kernel and
+// buffer_index_kernel are currently unused (double-buffer offset commented out).
+template <uint32_t TEMP_WORLD, typename data_type>
+void all_sum(int idx,
+             const void *in_buffer,
+             void *out_buffer,
+             uint32_t recv_size,
+             int threads_already_processed,
+             void *temp_buffer[],
+             uint32_t temp_rank,
+             int outer_iter,
+             int size_per_buffer_kernel,
+             int buffer_index_kernel,
+             uint32_t threads_needed_per_chunk) {
+    using namespace __ESIMD_NS;
+    using namespace __ESIMD_ENS;
+
+    int chunk_size = threads_needed_per_chunk * SIMD_COMPUTE * UNROLL_SIZE;
+    //read the input data
+    data_type *ptr = (data_type *)temp_buffer[temp_rank];
+    //ptr += size_per_buffer_kernel * buffer_index_kernel;
+    int read_offset = idx * SIMD_COMPUTE * UNROLL_SIZE;
+    simd<data_type, SIMD_COMPUTE * UNROLL_SIZE * TEMP_WORLD / 2> buffer;
+    //#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            buffer.template select<SIMD_COMPUTE, 1>(SIMD_COMPUTE * UNROLL_SIZE * r + i * SIMD_COMPUTE) =
+                lsc_block_load<data_type,
+                               SIMD_COMPUTE,
+                               lsc_data_size::default_size,
+                               cache_hint::uncached,
+                               cache_hint::cached>(ptr + read_offset + i * SIMD_COMPUTE);
+        }
+        ptr += chunk_size;
+    }
+    // Accumulate the partial sums from all rank pairs.
+    simd<data_type, SIMD_COMPUTE *UNROLL_SIZE> sum = 0;
+#pragma unroll
+    for (uint32_t r = 0; r < TEMP_WORLD / 2; r++) {
+        sum = sum + buffer.template select<SIMD_COMPUTE * UNROLL_SIZE, 1>(r * SIMD_COMPUTE * UNROLL_SIZE);
+    }
+
+    //store the result
+    data_type *write_ptr = (data_type *)out_buffer;
+    int write_offset = (idx + threads_already_processed) * SIMD_COMPUTE * UNROLL_SIZE;
+    if (write_offset + SIMD_COMPUTE * UNROLL_SIZE <= recv_size) {
+        write_ptr += write_offset;
+#pragma unroll
+        for (uint32_t i = 0; i < UNROLL_SIZE; i++) {
+            lsc_block_store<data_type,
+                            SIMD_COMPUTE,
+                            lsc_data_size::default_size,
+                            cache_hint::uncached,
+                            cache_hint::write_back> //save the all sum in the second half of the temp slot.
+                (write_ptr + i * SIMD_COMPUTE, sum.template select<SIMD_COMPUTE, 1>(i * SIMD_COMPUTE));
+        }
+    }
+    else {
+        // Tail: scalar stores so we never write past recv_size.
+        for (uint32_t i = write_offset; i < recv_size; i++)
+            *(write_ptr + i) = sum[i - write_offset];
+    }
+}
+
+// Forward declarations used only as unique SYCL kernel-name tags for the
+// parallel_for submissions below (one name per kernel stage and datatype).
+template <typename dtype>
+class ReduceScatterMediumKernel_local_copy;
+template <typename dtype>
+class ReduceScatterMediumKernel_reduce_read_write;
+template <typename dtype>
+class ReduceScatterMediumKernel_local_all_sum;
+
+template <typename dtype>
+class ReduceScatterMediumKernel_nocopy_reduce_read_write;
+template <typename dtype>
+class ReduceScatterMediumKernel_nocopy_local_all_sum;
+
+template <typename dtype>
+class ReduceScatterMediumKernel_GlobalSync;
+template <typename dtype>
+class ReduceScatterMediumKernel_LocalSync;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK, uint32_t max_buffer = 1024 /*KB*/>
+class sycl_reduce_scatter_medium : public sycl_coll_base<data_type> {
+public:
+    // Default constructor: zero the buffer bookkeeping; real setup (device
+    // allocation, IPC exchange) happens in init().
+    sycl_reduce_scatter_medium() : sycl_coll_base<data_type>() {
+        size_per_buffer = 0;
+        buffer_index = 0;
+        data_size_per_buffer = 0;
+    }
+
+    // One-time setup: allocates and zeroes the device-side temp buffer
+    // (data area + double-buffered sync bytes), exchanges IPC handles so every
+    // rank can address every peer's buffer, and caches comm/stream handles.
+    void init(sycl::queue &queue, ccl_comm *comm, ccl_stream *stream, uint32_t rank_in, uint32_t world_in) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+        rank = rank_in;
+        world = world_in;
+        // temporary buffer used internally by this reduce_scatter only.
+
+        // Round the per-rank capacity up to a whole number of SIMD work units.
+        max_count_per_rank = (COPY_MAX_COUNT + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE) *
+                             SIMD_COMPUTE * UNROLL_SIZE;
+        data_size_per_buffer = max_count_per_rank * world;
+        size_per_buffer = data_size_per_buffer * sizeof(data_type);
+        size_per_buffer = (size_per_buffer + ALIGNMENT_BYTE - 1) / ALIGNMENT_BYTE * ALIGNMENT_BYTE;
+
+        void *local_buffer = sycl::malloc_device(size_per_buffer + SYNC_BYTE * BUFFER_COUNT, queue);
+        //printf("allocate temp buffer: max_count_per_rank:%d size_per_buffer:%ld %ld\n", max_count_per_rank, size_per_buffer, (size_t)size_per_buffer + SYNC_BYTE * BUFFER_COUNT);
+        // Zero data and sync areas before any rank can observe them.
+        auto e = queue.memset(local_buffer, 0, size_per_buffer + SYNC_BYTE * BUFFER_COUNT);
+        e.wait();
+
+        // XXX: gain access to remote pointers
+        // NOTE(review): the size passed here is data_size_per_buffer *
+        // sizeof(data_type), i.e. without the alignment padding or the
+        // SYNC_BYTE * BUFFER_COUNT tail that was allocated — confirm the
+        // exchange covers the sync area too.
+        this->exchange_peer_ipc_mem(queue,
+                                    comm,
+                                    stream,
+                                    local_buffer,
+                                    NULL,
+                                    rank,
+                                    world,
+                                    data_size_per_buffer * sizeof(data_type),
+                                    (void **)buffers,
+                                    (void **)sync_buffer,
+                                    offsets,
+                                    ipc_handle,
+                                    NULL,
+                                    NULL /* mmap_buffers */,
+                                    false /* to_cache */);
+
+        this->initialized = true;
+
+        global_stream = stream;
+        global_comm = comm;
+        // even_comm groups the even-indexed ranks (one per GPU pair).
+        even_comm = global_comm->get_even_comm().get();
+    }
+
+    // Public entry point: currently always forwards to the copy-based
+    // implementation. `done` is set false when the input is too small for
+    // this algorithm, signalling the caller to fall back.
+    ccl::event reduce_scatter(sycl::queue &queue,
+                              const void *send_buf,
+                              void *out_buffer,
+                              uint32_t recv_size,
+                              int repetition,
+                              bool print_en,
+                              bool &done) {
+        return reduce_scatter_copy(queue, send_buf, out_buffer, recv_size, repetition, print_en, done);
+    }
+
+private:
+    ccl::event reduce_scatter_copy(sycl::queue &queue,
+                                   const void *send_buf,
+                                   void *out_buffer,
+                                   uint32_t recv_size,
+                                   int repetition,
+                                   bool print_en,
+                                   bool &done) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        done = true;
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        /*
+        void* temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) 
+        {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }*/
+
+        if (recv_size / (SIMD_COMPUTE * UNROLL_SIZE) < temp_world) {
+            done = false;
+            return ccl::event::create_from_native(e);
+        }
+
+        int even_ranks[max_rank];
+        int my_rank_index = -1;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank)
+                my_rank_index = i;
+            //printf("even rank %d: %d neighbor: %d\n", i, even_ranks[i], even_ranks[i] ^ 1);
+        }
+        int size_per_buffer_kernel __attribute__((unused)) = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel __attribute__((unused)) =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        int buffer_index_kernel __attribute__((unused)) = buffer_index;
+        int outerloop_iter_count; //Since 16 elements in temp buffer is used to process 8 element output, the outer loop count must be doubled roughly.
+        int outer_iter;
+        //todo:
+        //5. prefetch in persistent threads?
+        int max_elements_per_MAX_COUNT __attribute__((unused)) = (recv_size + SIMD_COMPUTE * UNROLL_SIZE - 1) /
+                                                                 (SIMD_COMPUTE * UNROLL_SIZE) * SIMD_COMPUTE *
+                                                                 UNROLL_SIZE;
+        int max_threads_per_MAX_COUNT __attribute__((unused)) = max_count_per_rank / (SIMD_COMPUTE * UNROLL_SIZE);
+        //this is the outerloop count that requires full hw thread count.
+        //This doesnt include the outloop iteration that only needs partial thread count
+        int threads_already_processed __attribute__((unused)) = 0;
+        outerloop_iter_count = (recv_size + max_count_per_rank - 1) / max_count_per_rank;
+        uint32_t total_threads_needed_sync __attribute__((unused)) = 1;
+
+        //printf("[%d] max_count_per_rank: %d max_threads_per_MAX_COUNT: %d max_elements_per_MAX_COUNT: %d outerloop_iter_count: %d\n",
+        // temp_rank, max_count_per_rank, max_threads_per_MAX_COUNT, max_elements_per_MAX_COUNT, outerloop_iter_count);
+        for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++) {
+            uint32_t threads_needed_per_chunk __attribute__((unused));
+            uint32_t total_threads_needed __attribute__((unused));
+            if ((outer_iter + 1) * max_count_per_rank < recv_size) {
+                threads_needed_per_chunk = max_count_per_rank / (SIMD_COMPUTE * UNROLL_SIZE);
+            }
+            else {
+                uint32_t leftover = recv_size - outer_iter * max_count_per_rank;
+                threads_needed_per_chunk =
+                    (leftover + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE);
+            }
+            int wg_size __attribute__((unused)) = 1;
+            total_threads_needed = threads_needed_per_chunk;
+
+            int innerloop_iter_count __attribute__((unused)) =
+                (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+                //printf("outer_iter=%d outerloop_iter_count: %d total_threads_needed: %d threads_needed_per_chunk:
+                // %d innerloop_iter_count: %d persist_threads_needed: %d\n", outer_iter, outerloop_iter_count,
+                // total_threads_needed, threads_needed_per_chunk, innerloop_iter_count, persist_threads_needed);
+
+#define KERNEL_EXEC_MAP (1 + 2 + 4 + 8 + 16)
+
+#if KERNEL_EXEC_MAP & 1
+            // local copy half of the data to tmp buffer
+            queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class ReduceScatterMediumKernel_local_copy<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        uint32_t idx = idx2.get_global_id();
+                        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++) {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= total_threads_needed)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    local_copy<2, data_type>((int *)even_ranks,
+                                                             index,
+                                                             send_buf,
+                                                             recv_size,
+                                                             threads_already_processed,
+                                                             (void **)temp_buffer,
+                                                             temp_rank,
+                                                             outer_iter,
+                                                             size_per_buffer_kernel,
+                                                             buffer_index_kernel,
+                                                             threads_needed_per_chunk);
+                                    break;
+                                case 4:
+                                    local_copy<4, data_type>((int *)even_ranks,
+                                                             index,
+                                                             send_buf,
+                                                             recv_size,
+                                                             threads_already_processed,
+                                                             (void **)temp_buffer,
+                                                             temp_rank,
+                                                             outer_iter,
+                                                             size_per_buffer_kernel,
+                                                             buffer_index_kernel,
+                                                             threads_needed_per_chunk);
+                                    break;
+                                case 6:
+                                    local_copy<6, data_type>((int *)even_ranks,
+                                                             index,
+                                                             send_buf,
+                                                             recv_size,
+                                                             threads_already_processed,
+                                                             (void **)temp_buffer,
+                                                             temp_rank,
+                                                             outer_iter,
+                                                             size_per_buffer_kernel,
+                                                             buffer_index_kernel,
+                                                             threads_needed_per_chunk);
+                                    break;
+                                case 8:
+                                    local_copy<8, data_type>((int *)even_ranks,
+                                                             index,
+                                                             send_buf,
+                                                             recv_size,
+                                                             threads_already_processed,
+                                                             (void **)temp_buffer,
+                                                             temp_rank,
+                                                             outer_iter,
+                                                             size_per_buffer_kernel,
+                                                             buffer_index_kernel,
+                                                             threads_needed_per_chunk);
+                                    break;
+                                case 10:
+                                    local_copy<10, data_type>((int *)even_ranks,
+                                                              index,
+                                                              send_buf,
+                                                              recv_size,
+                                                              threads_already_processed,
+                                                              (void **)temp_buffer,
+                                                              temp_rank,
+                                                              outer_iter,
+                                                              size_per_buffer_kernel,
+                                                              buffer_index_kernel,
+                                                              threads_needed_per_chunk);
+                                    break;
+                                case 12:
+                                    local_copy<12, data_type>((int *)even_ranks,
+                                                              index,
+                                                              send_buf,
+                                                              recv_size,
+                                                              threads_already_processed,
+                                                              (void **)temp_buffer,
+                                                              temp_rank,
+                                                              outer_iter,
+                                                              size_per_buffer_kernel,
+                                                              buffer_index_kernel,
+                                                              threads_needed_per_chunk);
+                                    break;
+                                case 14:
+                                    local_copy<14, data_type>((int *)even_ranks,
+                                                              index,
+                                                              send_buf,
+                                                              recv_size,
+                                                              threads_already_processed,
+                                                              (void **)temp_buffer,
+                                                              temp_rank,
+                                                              outer_iter,
+                                                              size_per_buffer_kernel,
+                                                              buffer_index_kernel,
+                                                              threads_needed_per_chunk);
+                                    break;
+                                case 16:
+                                    local_copy<16, data_type>((int *)even_ranks,
+                                                              index,
+                                                              send_buf,
+                                                              recv_size,
+                                                              threads_already_processed,
+                                                              (void **)temp_buffer,
+                                                              temp_rank,
+                                                              outer_iter,
+                                                              size_per_buffer_kernel,
+                                                              buffer_index_kernel,
+                                                              threads_needed_per_chunk);
+                                    break;
+                                default: break;
+                            }
+                        }
+                        });//parallel_for
+            }); //submit()
+            //printf("kernel0\n");
+#endif
+#if KERNEL_EXEC_MAP & 2
+            //sync all the ranks within the single GPU.
+            e = local_sync(queue, temp_rank, temp_world, SYNC_BYTE * buffer_index_kernel, 0, 0);
+            //printf("kernel1\n");
+#endif
+#if KERNEL_EXEC_MAP & 4
+            //local reduction kernel
+            e = queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class ReduceScatterMediumKernel_reduce_read_write<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        uint32_t idx = idx2.get_global_id();
+                        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++)
+                        //for (int inner_iter = 0; inner_iter < 1; inner_iter++)
+                        {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= total_threads_needed)
+                                break;
+                            //                                int index = idx;
+
+                            switch (temp_world) {
+                                case 2:
+                                    reduce_read_write<2, data_type>((int *)even_ranks,
+                                                                    my_rank_index,
+                                                                    index,
+                                                                    send_buf,
+                                                                    out_buffer,
+                                                                    recv_size,
+                                                                    threads_already_processed,
+                                                                    (void **)temp_buffer,
+                                                                    temp_rank,
+                                                                    outer_iter,
+                                                                    size_per_buffer_kernel,
+                                                                    buffer_index_kernel,
+                                                                    threads_needed_per_chunk);
+                                    break;
+                                case 4:
+                                    reduce_read_write<4, data_type>((int *)even_ranks,
+                                                                    my_rank_index,
+                                                                    index,
+                                                                    send_buf,
+                                                                    out_buffer,
+                                                                    recv_size,
+                                                                    threads_already_processed,
+                                                                    (void **)temp_buffer,
+                                                                    temp_rank,
+                                                                    outer_iter,
+                                                                    size_per_buffer_kernel,
+                                                                    buffer_index_kernel,
+                                                                    threads_needed_per_chunk);
+                                    break;
+                                case 6:
+                                    reduce_read_write<6, data_type>((int *)even_ranks,
+                                                                    my_rank_index,
+                                                                    index,
+                                                                    send_buf,
+                                                                    out_buffer,
+                                                                    recv_size,
+                                                                    threads_already_processed,
+                                                                    (void **)temp_buffer,
+                                                                    temp_rank,
+                                                                    outer_iter,
+                                                                    size_per_buffer_kernel,
+                                                                    buffer_index_kernel,
+                                                                    threads_needed_per_chunk);
+                                    break;
+                                case 8:
+                                    reduce_read_write<8, data_type>((int *)even_ranks,
+                                                                    my_rank_index,
+                                                                    index,
+                                                                    send_buf,
+                                                                    out_buffer,
+                                                                    recv_size,
+                                                                    threads_already_processed,
+                                                                    (void **)temp_buffer,
+                                                                    temp_rank,
+                                                                    outer_iter,
+                                                                    size_per_buffer_kernel,
+                                                                    buffer_index_kernel,
+                                                                    threads_needed_per_chunk);
+                                    break;
+                                case 10:
+                                    reduce_read_write<10, data_type>((int *)even_ranks,
+                                                                     my_rank_index,
+                                                                     index,
+                                                                     send_buf,
+                                                                     out_buffer,
+                                                                     recv_size,
+                                                                     threads_already_processed,
+                                                                     (void **)temp_buffer,
+                                                                     temp_rank,
+                                                                     outer_iter,
+                                                                     size_per_buffer_kernel,
+                                                                     buffer_index_kernel,
+                                                                     threads_needed_per_chunk);
+                                    break;
+                                case 12:
+                                    reduce_read_write<12, data_type>((int *)even_ranks,
+                                                                     my_rank_index,
+                                                                     index,
+                                                                     send_buf,
+                                                                     out_buffer,
+                                                                     recv_size,
+                                                                     threads_already_processed,
+                                                                     (void **)temp_buffer,
+                                                                     temp_rank,
+                                                                     outer_iter,
+                                                                     size_per_buffer_kernel,
+                                                                     buffer_index_kernel,
+                                                                     threads_needed_per_chunk);
+                                    break;
+                                case 14:
+                                    reduce_read_write<14, data_type>((int *)even_ranks,
+                                                                     my_rank_index,
+                                                                     index,
+                                                                     send_buf,
+                                                                     out_buffer,
+                                                                     recv_size,
+                                                                     threads_already_processed,
+                                                                     (void **)temp_buffer,
+                                                                     temp_rank,
+                                                                     outer_iter,
+                                                                     size_per_buffer_kernel,
+                                                                     buffer_index_kernel,
+                                                                     threads_needed_per_chunk);
+                                    break;
+                                case 16:
+                                    reduce_read_write<16, data_type>((int *)even_ranks,
+                                                                     my_rank_index,
+                                                                     index,
+                                                                     send_buf,
+                                                                     out_buffer,
+                                                                     recv_size,
+                                                                     threads_already_processed,
+                                                                     (void **)temp_buffer,
+                                                                     temp_rank,
+                                                                     outer_iter,
+                                                                     size_per_buffer_kernel,
+                                                                     buffer_index_kernel,
+                                                                     threads_needed_per_chunk);
+                                    break;
+                                default: break;
+                            }
+                        }
+                        });//parallel_for
+            }); //submit()
+            //printf("kernel2\n");
+#endif
+#if KERNEL_EXEC_MAP & 8
+            //sync all the ranks here before consuming the results.
+            e = global_sync(queue, temp_rank, temp_world, SYNC_BYTE * buffer_index_kernel, 1, 1);
+            //printf("kernel3\n");
+#endif
+#if KERNEL_EXEC_MAP & 16
+            if (temp_world > 2) {
+                int innerloop_local_sum_iter_count __attribute__((unused)) =
+                    (threads_needed_per_chunk + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+                uint32_t persist_local_sum_threads_needed = threads_needed_per_chunk;
+                if (persist_local_sum_threads_needed > HW_THREAD_COUNT)
+                    persist_local_sum_threads_needed = HW_THREAD_COUNT;
+                //local reduction kernel
+                e = queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class ReduceScatterMediumKernel_local_all_sum<data_type>>(
+                        sycl::nd_range<1>({ persist_local_sum_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        uint32_t idx = idx2.get_global_id();
+                        for (int inner_iter = 0; inner_iter < innerloop_local_sum_iter_count; inner_iter++) {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= threads_needed_per_chunk)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    all_sum<2, data_type>(index,
+                                                          send_buf,
+                                                          out_buffer,
+                                                          recv_size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          threads_needed_per_chunk);
+                                    break;
+                                case 4:
+                                    all_sum<4, data_type>(index,
+                                                          send_buf,
+                                                          out_buffer,
+                                                          recv_size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          threads_needed_per_chunk);
+                                    break;
+                                case 6:
+                                    all_sum<6, data_type>(index,
+                                                          send_buf,
+                                                          out_buffer,
+                                                          recv_size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          threads_needed_per_chunk);
+                                    break;
+                                case 8:
+                                    all_sum<8, data_type>(index,
+                                                          send_buf,
+                                                          out_buffer,
+                                                          recv_size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          threads_needed_per_chunk);
+                                    break;
+                                case 10:
+                                    all_sum<10, data_type>(index,
+                                                           send_buf,
+                                                           out_buffer,
+                                                           recv_size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           threads_needed_per_chunk);
+                                    break;
+                                case 12:
+                                    all_sum<12, data_type>(index,
+                                                           send_buf,
+                                                           out_buffer,
+                                                           recv_size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           threads_needed_per_chunk);
+                                    break;
+                                case 14:
+                                    all_sum<14, data_type>(index,
+                                                           send_buf,
+                                                           out_buffer,
+                                                           recv_size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           threads_needed_per_chunk);
+                                    break;
+                                case 16:
+                                    all_sum<16, data_type>(index,
+                                                           send_buf,
+                                                           out_buffer,
+                                                           recv_size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           threads_needed_per_chunk);
+                                    break;
+                                default: break;
+                            }
+                        }
+                        });//parallel_for
+                }); //submit()
+                //printf("kernel4\n");
+            } // end if
+#endif
+            threads_already_processed += threads_needed_per_chunk;
+            buffer_index++;
+            buffer_index %= BUFFER_COUNT;
+            buffer_index_kernel = buffer_index;
+        } //for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++)
+
+        return ccl::event::create_from_native(e);
+    }
+
+    ccl::event reduce_scatter_nocopy(sycl::queue &queue,
+                                     const void *send_buf,
+                                     void *out_buffer,
+                                     uint32_t recv_size,
+                                     int repetition,
+                                     bool print_en,
+                                     bool &done) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+        assert(this->initialized == true);
+        done = true;
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        /*
+        void* temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) 
+        {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }*/
+
+        if (recv_size / (SIMD_COMPUTE * UNROLL_SIZE) < temp_world) {
+            done = false;
+            return ccl::event::create_from_native(e);
+        }
+#if 0
+	if (recv_size > max_count_per_rank) {
+            //printf("reduce-scatter medium fallback recv_size: %d max_count_per_rank: %d \n", recv_size, max_count_per_rank);
+            done = false;
+	    return ccl::event::create_from_native(e);
+	}
+#endif
+        int even_ranks[max_rank];
+        int my_rank_index = -1;
+        for (int i = 0; i < world / 2; i++) {
+            even_ranks[i] = even_comm->get_global_rank(i);
+            if (even_ranks[i] == (int)temp_rank)
+                my_rank_index = i;
+            //printf("even rank %d: %d neighbor: %d\n", i, even_ranks[i], even_ranks[i] ^ 1);
+        }
+
+        int size_per_buffer_kernel __attribute__((unused)) = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel __attribute__((unused)) =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+        int buffer_index_kernel __attribute__((unused)) = buffer_index;
+        int outerloop_iter_count; //Since 16 elements in the temp buffer are used to process 8 output elements, the outer loop count must be roughly doubled.
+        int outer_iter;
+        //todo:
+        //5. prefetch in persistent threads?
+        int max_elements_per_MAX_COUNT __attribute__((unused)) = (recv_size + SIMD_COMPUTE * UNROLL_SIZE - 1) /
+                                                                 (SIMD_COMPUTE * UNROLL_SIZE) * SIMD_COMPUTE *
+                                                                 UNROLL_SIZE;
+        int max_threads_per_MAX_COUNT __attribute__((unused)) = max_count_per_rank / (SIMD_COMPUTE * UNROLL_SIZE);
+
+        int threads_already_processed __attribute__((unused)) = 0;
+        outerloop_iter_count =
+            (recv_size + max_count_per_rank - 1) /
+            max_count_per_rank; //this is the outer-loop count that requires the full hw thread count. This doesn't include the outer-loop iteration that only needs a partial thread count
+        uint32_t total_threads_needed_sync __attribute__((unused)) = 1;
+
+        void *in_buffers[max_rank];
+        this->exchange_peer_ipc_mem(queue,
+                                    global_comm,
+                                    global_stream,
+                                    (void **)send_buf,
+                                    NULL,
+                                    rank,
+                                    world,
+                                    0,
+                                    (void **)in_buffers,
+                                    NULL,
+                                    NULL,
+                                    NULL,
+                                    NULL);
+
+        //must sync tiles in the single GPU.
+        e = local_sync(queue, temp_rank, temp_world, SYNC_BYTE * buffer_index_kernel, 0, 0);
+
+        //printf("[%d] max_count_per_rank: %d max_threads_per_MAX_COUNT: %d max_elements_per_MAX_COUNT: %d outerloop_iter_count: %d\n",
+        //temp_rank, max_count_per_rank, max_threads_per_MAX_COUNT, max_elements_per_MAX_COUNT, outerloop_iter_count);
+        for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++) {
+            uint32_t threads_needed_per_chunk __attribute__((unused));
+            uint32_t total_threads_needed __attribute__((unused));
+            if ((outer_iter + 1) * max_count_per_rank < recv_size) {
+                threads_needed_per_chunk = max_count_per_rank / (SIMD_COMPUTE * UNROLL_SIZE);
+            }
+            else {
+                uint32_t leftover = recv_size - outer_iter * max_count_per_rank;
+                threads_needed_per_chunk =
+                    (leftover + SIMD_COMPUTE * UNROLL_SIZE - 1) / (SIMD_COMPUTE * UNROLL_SIZE);
+            }
+            int wg_size __attribute__((unused)) = 1;
+            total_threads_needed = threads_needed_per_chunk;
+
+            int innerloop_iter_count __attribute__((unused)) =
+                (total_threads_needed + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+
+            uint32_t persist_threads_needed = total_threads_needed;
+            if (persist_threads_needed > HW_THREAD_COUNT)
+                persist_threads_needed = HW_THREAD_COUNT;
+            //printf("outer_iter=%d outerloop_iter_count: %d total_threads_needed: %d threads_needed_per_chunk: %d innerloop_iter_count:
+            //%d persist_threads_needed: %d\n", outer_iter, outerloop_iter_count, total_threads_needed, threads_needed_per_chunk, innerloop_iter_count, persist_threads_needed);
+
+            //local reduction kernel
+            e = queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class ReduceScatterMediumKernel_nocopy_reduce_read_write<data_type>>(
+                        sycl::nd_range<1>({ persist_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        uint32_t idx = idx2.get_global_id();
+                        for (int inner_iter = 0; inner_iter < innerloop_iter_count; inner_iter++)
+                        //for (int inner_iter = 0; inner_iter < 1; inner_iter++)
+                        {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= total_threads_needed)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    nocopy_reduce_read_write<2, data_type>((int *)even_ranks,
+                                                                           my_rank_index,
+                                                                           index,
+                                                                           (void **)in_buffers,
+                                                                           out_buffer,
+                                                                           recv_size,
+                                                                           threads_already_processed,
+                                                                           (void **)temp_buffer,
+                                                                           temp_rank,
+                                                                           outer_iter,
+                                                                           size_per_buffer_kernel,
+                                                                           buffer_index_kernel,
+                                                                           threads_needed_per_chunk);
+                                    break;
+                                case 4:
+                                    nocopy_reduce_read_write<4, data_type>((int *)even_ranks,
+                                                                           my_rank_index,
+                                                                           index,
+                                                                           (void **)in_buffers,
+                                                                           out_buffer,
+                                                                           recv_size,
+                                                                           threads_already_processed,
+                                                                           (void **)temp_buffer,
+                                                                           temp_rank,
+                                                                           outer_iter,
+                                                                           size_per_buffer_kernel,
+                                                                           buffer_index_kernel,
+                                                                           threads_needed_per_chunk);
+                                    break;
+                                case 6:
+                                    nocopy_reduce_read_write<6, data_type>((int *)even_ranks,
+                                                                           my_rank_index,
+                                                                           index,
+                                                                           (void **)in_buffers,
+                                                                           out_buffer,
+                                                                           recv_size,
+                                                                           threads_already_processed,
+                                                                           (void **)temp_buffer,
+                                                                           temp_rank,
+                                                                           outer_iter,
+                                                                           size_per_buffer_kernel,
+                                                                           buffer_index_kernel,
+                                                                           threads_needed_per_chunk);
+                                    break;
+                                case 8:
+                                    nocopy_reduce_read_write<8, data_type>((int *)even_ranks,
+                                                                           my_rank_index,
+                                                                           index,
+                                                                           (void **)in_buffers,
+                                                                           out_buffer,
+                                                                           recv_size,
+                                                                           threads_already_processed,
+                                                                           (void **)temp_buffer,
+                                                                           temp_rank,
+                                                                           outer_iter,
+                                                                           size_per_buffer_kernel,
+                                                                           buffer_index_kernel,
+                                                                           threads_needed_per_chunk);
+                                    break;
+                                case 10:
+                                    nocopy_reduce_read_write<10, data_type>((int *)even_ranks,
+                                                                            my_rank_index,
+                                                                            index,
+                                                                            (void **)in_buffers,
+                                                                            out_buffer,
+                                                                            recv_size,
+                                                                            threads_already_processed,
+                                                                            (void **)temp_buffer,
+                                                                            temp_rank,
+                                                                            outer_iter,
+                                                                            size_per_buffer_kernel,
+                                                                            buffer_index_kernel,
+                                                                            threads_needed_per_chunk);
+                                    break;
+                                case 12:
+                                    nocopy_reduce_read_write<12, data_type>((int *)even_ranks,
+                                                                            my_rank_index,
+                                                                            index,
+                                                                            (void **)in_buffers,
+                                                                            out_buffer,
+                                                                            recv_size,
+                                                                            threads_already_processed,
+                                                                            (void **)temp_buffer,
+                                                                            temp_rank,
+                                                                            outer_iter,
+                                                                            size_per_buffer_kernel,
+                                                                            buffer_index_kernel,
+                                                                            threads_needed_per_chunk);
+                                    break;
+                                case 14:
+                                    nocopy_reduce_read_write<14, data_type>((int *)even_ranks,
+                                                                            my_rank_index,
+                                                                            index,
+                                                                            (void **)in_buffers,
+                                                                            out_buffer,
+                                                                            recv_size,
+                                                                            threads_already_processed,
+                                                                            (void **)temp_buffer,
+                                                                            temp_rank,
+                                                                            outer_iter,
+                                                                            size_per_buffer_kernel,
+                                                                            buffer_index_kernel,
+                                                                            threads_needed_per_chunk);
+                                    break;
+                                case 16:
+                                    nocopy_reduce_read_write<16, data_type>((int *)even_ranks,
+                                                                            my_rank_index,
+                                                                            index,
+                                                                            (void **)in_buffers,
+                                                                            out_buffer,
+                                                                            recv_size,
+                                                                            threads_already_processed,
+                                                                            (void **)temp_buffer,
+                                                                            temp_rank,
+                                                                            outer_iter,
+                                                                            size_per_buffer_kernel,
+                                                                            buffer_index_kernel,
+                                                                            threads_needed_per_chunk);
+                                    break;
+                                default: break;
+                            }
+                        }
+                        });//parallel_for
+            }); //submit()
+            //printf("kernel2\n");
+
+            //sync all the ranks here before consuming the results.
+            e = global_sync(queue, temp_rank, temp_world, SYNC_BYTE * buffer_index_kernel, 1, 1);
+            //printf("kernel3\n");
+
+            if (temp_world > 2) {
+                int innerloop_local_sum_iter_count __attribute__((unused)) =
+                    (threads_needed_per_chunk + HW_THREAD_COUNT - 1) / HW_THREAD_COUNT;
+                uint32_t persist_local_sum_threads_needed = threads_needed_per_chunk;
+                if (persist_local_sum_threads_needed > HW_THREAD_COUNT)
+                    persist_local_sum_threads_needed = HW_THREAD_COUNT;
+                //local reduction kernel
+                e = queue.submit([&](sycl::handler &cgh) {
+                    cgh.parallel_for<class ReduceScatterMediumKernel_nocopy_local_all_sum<data_type>>(
+                        sycl::nd_range<1>({ persist_local_sum_threads_needed }, wg_size), [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL
+                        {
+                        //ESIMD kernel
+                        uint32_t idx = idx2.get_global_id();
+                        for (int inner_iter = 0; inner_iter < innerloop_local_sum_iter_count; inner_iter++) {
+                            int index = idx + inner_iter * HW_THREAD_COUNT;
+                            if ((uint32_t)index >= threads_needed_per_chunk)
+                                break;
+
+                            switch (temp_world) {
+                                case 2:
+                                    all_sum<2, data_type>(index,
+                                                          send_buf,
+                                                          out_buffer,
+                                                          recv_size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          threads_needed_per_chunk);
+                                    break;
+                                case 4:
+                                    all_sum<4, data_type>(index,
+                                                          send_buf,
+                                                          out_buffer,
+                                                          recv_size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          threads_needed_per_chunk);
+                                    break;
+                                case 6:
+                                    all_sum<6, data_type>(index,
+                                                          send_buf,
+                                                          out_buffer,
+                                                          recv_size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          threads_needed_per_chunk);
+                                    break;
+                                case 8:
+                                    all_sum<8, data_type>(index,
+                                                          send_buf,
+                                                          out_buffer,
+                                                          recv_size,
+                                                          threads_already_processed,
+                                                          (void **)temp_buffer,
+                                                          temp_rank,
+                                                          outer_iter,
+                                                          size_per_buffer_kernel,
+                                                          buffer_index_kernel,
+                                                          threads_needed_per_chunk);
+                                    break;
+                                case 10:
+                                    all_sum<10, data_type>(index,
+                                                           send_buf,
+                                                           out_buffer,
+                                                           recv_size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           threads_needed_per_chunk);
+                                    break;
+                                case 12:
+                                    all_sum<12, data_type>(index,
+                                                           send_buf,
+                                                           out_buffer,
+                                                           recv_size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           threads_needed_per_chunk);
+                                    break;
+                                case 14:
+                                    all_sum<14, data_type>(index,
+                                                           send_buf,
+                                                           out_buffer,
+                                                           recv_size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           threads_needed_per_chunk);
+                                    break;
+                                case 16:
+                                    all_sum<16, data_type>(index,
+                                                           send_buf,
+                                                           out_buffer,
+                                                           recv_size,
+                                                           threads_already_processed,
+                                                           (void **)temp_buffer,
+                                                           temp_rank,
+                                                           outer_iter,
+                                                           size_per_buffer_kernel,
+                                                           buffer_index_kernel,
+                                                           threads_needed_per_chunk);
+                                    break;
+                                default: break;
+                            }
+                        }
+                        });//parallel_for
+                }); //submit()
+                //printf("kernel4\n");
+            } // end if
+
+            threads_already_processed += threads_needed_per_chunk;
+            buffer_index++;
+            buffer_index %= BUFFER_COUNT;
+            buffer_index_kernel = buffer_index;
+        } //for (outer_iter = 0; outer_iter < outerloop_iter_count; outer_iter++)
+
+        return ccl::event::create_from_native(e);
+    }
+
+    // Synchronize all ranks before consuming the results of the previous kernel.
+    // offset = size_per_buffer_for_sync_kernel * buffer_index_kernel
+    sycl::event global_sync(sycl::queue queue,
+                            int temp_rank,
+                            uint32_t temp_world,
+                            int offset,
+                            int index,
+                            int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        void *temp_sync_buffer[max_rank]; // local copy so the kernel lambda captures pointers by value
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1; // a single HW thread performs the whole sync
+        int wg_size = 1;
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class ReduceScatterMediumKernel_GlobalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                //ESIMD kernel
+                simd<ushort, SIMD_SYNC> ramp; // byte offsets of the int counters within the sync slot
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //since other ranks might still be doing local_sum, we need to sync ranks here.
+                //After the sync is done, the second half of the temp buffer will be replaced with new sum val.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true; // only the counter at lane 'index' participates
+
+                //announce arrival: atomically increment this rank's slot in every rank's counter.
+                for (uint32_t i = 0; i < temp_world; i++) {
+                    int *sync_ptr = (int *)temp_sync_buffer[i] + offset;
+                    ////never true. Used to force dependency with prev kernel
+                    //if (total_threads_needed_sync == 0x7fffffff)
+                    //    sync_ptr = temp_buffer[0];
+                    lsc_atomic_update<atomic_op::inc,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(sync_ptr, ramp, pred);
+                }
+
+                //spin on our own counter until all temp_world ranks have arrived.
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                while (status0[index] != temp_world) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true; // reset all lanes, not just 'index'
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    // Synchronize the pair of tile ranks sharing one GPU (pair partner = temp_rank ^ 1).
+    sycl::event local_sync(sycl::queue queue,
+                           int temp_rank,
+                           uint32_t temp_world,
+                           int offset,
+                           int index,
+                           int reset) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        void *temp_sync_buffer[max_rank]; // local copy so the kernel lambda captures pointers by value
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+        sycl::event e;
+        uint32_t total_threads_needed_sync = 1; // a single HW thread performs the whole sync
+        int wg_size = 1;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<class ReduceScatterMediumKernel_LocalSync<data_type>>(
+                sycl::nd_range<1>({ total_threads_needed_sync }, wg_size), [=](sycl::item<1> idx) SYCL_ESIMD_KERNEL
+                {
+                //ESIMD kernel
+                simd<ushort, SIMD_SYNC> ramp; // byte offsets of the int counters within the sync slot
+#pragma unroll
+                for (uint32_t i = 0; i < SIMD_SYNC; i++) {
+                    ramp[i] = i * sizeof(int);
+                }
+
+                //sync only the rank pair within the same gpu.
+                simd_mask<SIMD_SYNC> pred;
+                simd<int, SIMD_SYNC> status0;
+                pred = false;
+                pred[index] = true; // only the counter at lane 'index' participates
+
+                //announce arrival on both tiles of this GPU: partner first, then our own.
+                int *sync_ptr = (int *)temp_sync_buffer[temp_rank ^ 1] + offset; // partner tile's counter
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+                sync_ptr = (int *)temp_sync_buffer[temp_rank] + offset;
+                lsc_atomic_update<atomic_op::inc,
+                                  int,
+                                  SIMD_SYNC,
+                                  lsc_data_size::default_size,
+                                  cache_hint::none,
+                                  cache_hint::none>(sync_ptr, ramp, pred);
+
+                //spin on our own counter until both tiles (RANKS_PER_GPU) have arrived.
+                status0 = lsc_atomic_update<atomic_op::load,
+                                            int,
+                                            SIMD_SYNC,
+                                            lsc_data_size::default_size,
+                                            cache_hint::none,
+                                            cache_hint::none>(sync_ptr, ramp, pred);
+                while (status0[index] != RANKS_PER_GPU) {
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_SYNC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(sync_ptr, ramp, pred);
+                }
+                if (reset) {
+                    //init the atomic counter to 0 for the next run
+                    status0 = 0;
+                    pred = true; // reset all lanes, not just 'index'
+                    lsc_atomic_update<atomic_op::store,
+                                      int,
+                                      SIMD_SYNC,
+                                      lsc_data_size::default_size,
+                                      cache_hint::none,
+                                      cache_hint::none>(
+                        sync_ptr, ramp, status0, pred); //initialize the counter for the next run
+                }
+                });//parallel_for
+        }); //submit()
+        return e;
+    }
+
+    void release(sycl::queue &queue) {
+        // Clean up: close peers' IPC handles, free our own allocation, mark uninitialized.
+        auto l0_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(queue.get_context());
+        for (int i = 0; i < world; i++) {
+            if (i != rank) { // only peer buffers were opened via IPC
+                ZE_CALL(zeMemCloseIpcHandle, (l0_ctx, (char *)buffers[i] - offsets[i])); // subtract offset to recover the mapping's base address
+            }
+        }
+
+        sycl::free(buffers[rank], queue); // only our own rank's buffer was allocated locally
+        this->initialized = false;
+    }
+
+private:
+    void *buffers[max_rank]; // per-rank temp buffer base; peers are IPC mappings (see release())
+    void *sync_buffer[max_rank]; // per-rank atomic sync counters used by global_sync/local_sync
+    size_t offsets[max_rank]; // offset of each peer buffer inside its IPC allocation (see release())
+    ze_ipc_mem_handle_t ipc_handle[max_rank]; // presumably the exchanged L0 IPC handles -- confirm in init(), not visible here
+    int rank{ ccl::utils::invalid_rank }, world{ ccl::utils::invalid_err_code };
+    size_t size_per_buffer{ 0 }; // todo align size_t or int for all algos
+    int data_size_per_buffer{ ccl::utils::invalid_bytes_value };
+    uint32_t max_count_per_rank{ 0 };
+    int buffer_index{ ccl::utils::invalid_err_code }; // cycles modulo BUFFER_COUNT across calls
+    ccl_stream *global_stream{};
+    ccl_comm *global_comm{};
+    ccl_comm *even_comm{};
+};
+
+#define REDUCE_SCATTER_MEDIUM_API(TYPE) /* stamps out init_/run_ entry points for one datatype's singleton */ \
+    void init_reduce_scatter_medium_##TYPE(ccl::datatype dtype, \
+                                           sycl::queue &queue, \
+                                           ccl_comm *comm, \
+                                           ccl_stream *stream, \
+                                           uint32_t rank_in, \
+                                           uint32_t world_in) { \
+        if (!rs_medium_##TYPE.inited()) { /* one-time lazy initialization */ \
+            LOG_INFO("invoking medium reduce_scatter first time for datatype: ", dtype); \
+            rs_medium_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+    } \
+\
+    ccl::event run_reduce_scatter_medium_##TYPE(ccl::datatype dtype, \
+                                                sycl::queue queue, \
+                                                const void *send_buf, \
+                                                void *recv_buf, \
+                                                size_t recv_count, \
+                                                bool &done) { \
+        return rs_medium_##TYPE.reduce_scatter(queue, send_buf, recv_buf, recv_count, done); \
+    } // end REDUCE_SCATTER_MEDIUM_API
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_small_sycl.cpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_small_sycl.cpp
new file mode 100644
index 000000000..93807e6ae
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_small_sycl.cpp
@@ -0,0 +1,64 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_small_sycl.hpp"
+
+sycl_reduce_scatter_small<sycl::half> rs_small_fp16; // global per-datatype singletons, selected by switch below
+sycl_reduce_scatter_small<sycl::_V1::ext::oneapi::bfloat16> rs_small_bf16;
+sycl_reduce_scatter_small<int32_t> rs_small_int32;
+sycl_reduce_scatter_small<float> rs_small_fp32;
+
+#define SWITCH_INIT_TYPE(TYPE, ccl_type) /* case: lazily init the singleton matching ccl_type */ \
+    case ccl_type: \
+        if (!rs_small_##TYPE.inited()) { \
+            LOG_INFO("invoking small reduce_scatter first time for datatype: ", ccl_type); \
+            rs_small_##TYPE.init(queue, comm, stream, rank_in, world_in); \
+        } \
+        break;
+
+void init_reduce_scatter_small(ccl::datatype dtype, // one-time init of the per-datatype instance; throws on unsupported dtype
+                               sycl::queue &queue,
+                               ccl_comm *comm,
+                               ccl_stream *stream,
+                               uint32_t rank_in,
+                               uint32_t world_in) {
+    switch (dtype) {
+        SWITCH_INIT_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_INIT_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_INIT_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_INIT_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for reduce_scatter"); assert(0);
+    }
+}
+
+#define SWITCH_RUN_TYPE(TYPE, ccl_type) /* case: dispatch to the singleton matching ccl_type */ \
+    case ccl_type: e = rs_small_##TYPE.reduce_scatter(queue, send_buf, recv_buf, dtype, recv_count, done); break;
+
+ccl::event run_reduce_scatter_small(ccl::datatype dtype, // run small reduce_scatter; 'done' reports whether this path handled it
+                                    sycl::queue queue,
+                                    const void *send_buf,
+                                    void *recv_buf,
+                                    size_t recv_count,
+                                    bool &done) {
+    ccl::event e;
+    switch (dtype) {
+        SWITCH_RUN_TYPE(fp16, ccl::datatype::float16)
+        SWITCH_RUN_TYPE(bf16, ccl::datatype::bfloat16)
+        SWITCH_RUN_TYPE(fp32, ccl::datatype::float32)
+        SWITCH_RUN_TYPE(int32, ccl::datatype::int32)
+        default: CCL_THROW("unsupported datatype for reduce_scatter"); assert(0);
+    }
+    return e;
+}
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_small_sycl.hpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_small_sycl.hpp
new file mode 100644
index 000000000..560705f62
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_small_sycl.hpp
@@ -0,0 +1,951 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+#define SIMD_MAX              256
+#define SIMD                  (SIMD_MAX / sizeof(data_type))
+#define SIMD_ATOMIC           16
+#define MAX_RANK              16
+#define UNROLL_SIZE           1
+#define TRIPLE_BUFFER         3
+#define SYNC_BYTE             (SIMD_ATOMIC * sizeof(int) * 2)
+#define ALIGNMENT_BYTE        256
+#define EU_COUNT              512
+#define THREADS_PER_EU        8
+#define MAX_THREAD            (EU_COUNT * THREADS_PER_EU)
+#define MAX_KERNEL_LOOP_COUNT 4
+#define MAX_COUNT             (SIMD * UNROLL_SIZE * MAX_KERNEL_LOOP_COUNT * MAX_THREAD)
+#define LOOP_COUNT_LIMIT      (1000000)
+
+template <typename data_type, uint32_t N, int kernel_inner_loop_scalar>
+ESIMD_INLINE void reduce_kernel(void **temp_buffer, int buf_offset, int offset, data_type result[]) {
+    data_type peer[N][kernel_inner_loop_scalar];
+//    gpu_kernel_copy((char*)result, (const char *)((data_type *)(temp_buffer[0]) + buf_offset + offset), kernel_inner_loop_scalar * sizeof(data_type));
+#pragma unroll
+    for (uint32_t r = 0; r < N; r++) {
+        data_type *peer_ptr = (data_type *)(temp_buffer[r]) + buf_offset + offset;
+        gpu_kernel_copy((char *)peer[r], (const char *)peer_ptr, kernel_inner_loop_scalar * sizeof(data_type));
+    }
+    gpu_kernel_copy((char *)result, (const char *)peer[0], kernel_inner_loop_scalar * sizeof(data_type));
+#pragma unroll
+    for (uint32_t r = 1; r < N; r++) {
+        for (int j = 0; j < kernel_inner_loop_scalar; j++)
+            result[j] += peer[r][j];
+    }
+}
+
+template <typename dtype, int kernel_inner_loop>
+class Reduce_scatter_small_kernel;
+template <typename dtype, int kernel_inner_loop_scalar>
+class Reduce_scatter_small_kernel_scalar;
+
+template <typename data_type, uint32_t max_rank = MAX_RANK, uint32_t max_buffer = 1024 /*KB*/>
+class sycl_reduce_scatter_small : public sycl_coll_base<data_type> {
+public:
+    sycl_reduce_scatter_small() : sycl_coll_base<data_type>() {
+        buffer_index = 0;
+        size_per_buffer = 0;
+    }
+
+    void init(sycl::queue &queue, ccl_comm *comm, ccl_stream *stream, uint32_t rank_in, uint32_t world_in) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        rank = rank_in;
+        world = world_in;
+        // temporary buffer used internally by reduce_scatter only.
+        data_size_per_buffer = ((MAX_COUNT + SIMD * UNROLL_SIZE * MAX_KERNEL_LOOP_COUNT - 1) /
+                                (SIMD * UNROLL_SIZE * MAX_KERNEL_LOOP_COUNT)) *
+                               SIMD * UNROLL_SIZE * MAX_KERNEL_LOOP_COUNT;
+        data_size_per_buffer = ((data_size_per_buffer * sizeof(data_type) + ALIGNMENT_BYTE - 1) / ALIGNMENT_BYTE) *
+                               ALIGNMENT_BYTE / sizeof(data_type); //aligned size
+        size_per_buffer = data_size_per_buffer * sizeof(data_type) + SYNC_BYTE;
+        void *local_triple_buffer = sycl::malloc_device(size_per_buffer * TRIPLE_BUFFER, queue);
+
+        auto e = queue.memset(local_triple_buffer, 0, size_per_buffer * TRIPLE_BUFFER);
+        e.wait();
+        this->exchange_peer_ipc_mem(queue,
+                                    comm,
+                                    stream,
+                                    local_triple_buffer,
+                                    NULL,
+                                    rank,
+                                    world,
+                                    data_size_per_buffer * sizeof(data_type),
+                                    (void **)buffers,
+                                    (void **)sync_buffer,
+                                    offsets,
+                                    ipc_handle,
+                                    NULL,
+                                    NULL /* mmap_buffers */,
+                                    false /* to_cache */);
+
+        this->initialized = true;
+    }
+
+    ccl::event reduce_scatter(sycl::queue &queue,
+                              const void *send_buf,
+                              void *out_buffer,
+                              ccl::datatype dtype,
+                              uint32_t recv_size,
+                              bool &done) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        assert(this->initialized == true);
+
+        uint32_t total_count = recv_size * world;
+        if (total_count > MAX_COUNT) {
+            done = false;
+            return ccl::event::create_from_native(e);
+        }
+
+        if (total_count * sizeof(data_type) <= 8192) {
+            e = reduce_scatter_scalar<4>(queue, send_buf, out_buffer, dtype, recv_size, done);
+        }
+        else if (total_count * sizeof(data_type) <= 524288) {
+            e = reduce_scatter_esimd<1>(queue, send_buf, out_buffer, dtype, recv_size, done);
+        }
+        else {
+            e = reduce_scatter_esimd<2>(queue, send_buf, out_buffer, dtype, recv_size, done);
+        }
+        return ccl::event::create_from_native(e);
+    }
+
+    template <int kernel_inner_loop>
+    sycl::event reduce_scatter_esimd(sycl::queue &queue,
+                                     const void *send_buf,
+                                     void *out_buffer,
+                                     ccl::datatype dtype,
+                                     uint32_t recv_size,
+                                     bool &done) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+
+        uint32_t total_count = recv_size * world;
+        if (total_count > MAX_COUNT) {
+            done = false;
+            return e;
+        }
+
+        done = true;
+
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+
+        int size_per_buffer_kernel __attribute__((unused)) = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel __attribute__((unused)) =
+            size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+
+        int buffer_index_kernel = buffer_index;
+        buffer_index++;
+        buffer_index %= TRIPLE_BUFFER;
+
+        uint32_t total_threads_needed = (total_count + SIMD * UNROLL_SIZE * kernel_inner_loop - 1) /
+                                        (SIMD * UNROLL_SIZE * kernel_inner_loop); //ceiling
+        int wg_size = 16;
+        uint32_t total_threads_dispatched = (total_threads_needed + wg_size - 1) / wg_size * wg_size;
+        uint32_t total_wg_count = total_threads_dispatched / wg_size;
+
+        uint32_t total_threads_needed_for_reduce = (recv_size + SIMD * UNROLL_SIZE * kernel_inner_loop - 1) /
+                                                   (SIMD * UNROLL_SIZE * kernel_inner_loop); //ceiling
+
+        //e[r] = queue.submit([&](sycl::handler& cgh) {
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<Reduce_scatter_small_kernel<data_type, kernel_inner_loop>>(
+                sycl::nd_range<1>({ total_threads_dispatched }, wg_size),
+                [=](sycl::nd_item<1> idx2) SYCL_ESIMD_KERNEL {
+                    //slm_init(1024);
+                    uint32_t idx = idx2.get_global_id();
+
+                    //ESIMD kernel
+                    uint32_t offset = idx * SIMD * UNROLL_SIZE * kernel_inner_loop;
+                    simd<data_type, max_rank * SIMD * UNROLL_SIZE> buffer; //64 registers
+                    simd<data_type, SIMD * UNROLL_SIZE> buffer_small;
+                    simd<ushort, SIMD_ATOMIC> ramp;
+                    simd_mask<SIMD_ATOMIC> pred;
+                    simd<int, SIMD_ATOMIC> status0;
+                    int *local_sync_ptr;
+
+#pragma unroll
+                    for (uint32_t i = 0; i < SIMD_ATOMIC; i++) {
+                        ramp[i] = i * sizeof(int);
+                    }
+
+                    //process the input only if the thread is useful
+                    if (idx < total_threads_needed) {
+                        //do copy from input buffer to temp buffer.
+                        for (int i = 0; i < kernel_inner_loop; i++) {
+                            if (offset + i * SIMD * UNROLL_SIZE > total_count)
+                                break;
+#pragma unroll
+                            for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                                buffer_small.template select<SIMD, 1>(unroll_i * SIMD) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::cached,
+                                                   cache_hint::cached>((data_type *)send_buf + offset +
+                                                                       unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                            }
+
+                            //use the temp buffer for the current rank to copy the data to.
+                            data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+                            //point to the correct buffer inside the triple buffer
+                            local_temp_ptr += buffer_index_kernel * size_per_buffer_kernel;
+
+#pragma unroll
+                            for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                                lsc_block_store<data_type,
+                                                SIMD,
+                                                lsc_data_size::default_size,
+                                                cache_hint::uncached,
+                                                cache_hint::uncached>(
+                                    (data_type *)local_temp_ptr + offset + unroll_i * SIMD +
+                                        i * SIMD * UNROLL_SIZE,
+                                    buffer_small.template select<SIMD, 1>(unroll_i * SIMD));
+                            }
+                        }
+                        //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+
+                        //since each thread copies only a small chunk of data to the temp buffer, all the threads need to sync globally using atomics within this rank
+                    }
+
+                    //sync locally within local GPU first.
+                    //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                    local_sync_ptr = (int *)temp_sync_buffer[temp_rank];
+                    local_sync_ptr += buffer_index_kernel * size_per_buffer_for_sync_kernel;
+
+                    //if there are more than 1 threads required per rank, then do the local sync within the rank first.
+                    if (total_threads_needed > 1) {
+                        //do local sync in two steps. First using TG barrier. Then global L3 atomics.
+                        uint32_t local_tid = idx2.get_local_linear_id();
+
+                        pred = false;
+                        pred[0] = true;
+                        if (local_tid == 0) {
+                            status0 = lsc_atomic_update<atomic_op::inc,
+                                                        int,
+                                                        SIMD_ATOMIC,
+                                                        lsc_data_size::default_size,
+                                                        cache_hint::none,
+                                                        cache_hint::none>(local_sync_ptr, ramp, pred);
+                            //wait for all the local TG to sync. Then sync the other remote GPUs
+                            while (status0[0] != total_wg_count) {
+                                status0 = lsc_atomic_update<atomic_op::load,
+                                                            int,
+                                                            SIMD_ATOMIC,
+                                                            lsc_data_size::default_size,
+                                                            cache_hint::none,
+                                                            cache_hint::none>(local_sync_ptr, ramp, pred);
+                            }
+                        }
+                        barrier();
+                    }
+
+                    //once the local level sync is done, atomically write its counter to other remote gpus' atomic counter
+                    pred = false;
+                    pred[1] = true; //use different lane for the remote gpu sync
+                    if (total_threads_dispatched >= temp_world) {
+                        if (idx < temp_world) {
+                            status0 = total_threads_needed;
+                            int *sync_ptr = (int *)temp_sync_buffer[idx];
+                            sync_ptr += buffer_index_kernel * size_per_buffer_for_sync_kernel;
+                            lsc_atomic_update<atomic_op::add,
+                                              int,
+                                              SIMD_ATOMIC,
+                                              lsc_data_size::default_size,
+                                              cache_hint::none,
+                                              cache_hint::none>(sync_ptr, ramp, status0, pred);
+                        }
+                    }
+                    else if (idx == 0) { //one thread in the local gpu notifies the remote gpu of its status.
+                        status0 = total_threads_needed;
+                        for (uint32_t i = 0; i < temp_world; i++) {
+                            int *sync_ptr;
+                            sync_ptr = (int *)temp_sync_buffer
+                                [i]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                            sync_ptr += buffer_index_kernel * size_per_buffer_for_sync_kernel;
+                            lsc_atomic_update<atomic_op::add,
+                                              int,
+                                              SIMD_ATOMIC,
+                                              lsc_data_size::default_size,
+                                              cache_hint::none,
+                                              cache_hint::none>(sync_ptr, ramp, status0, pred);
+                        }
+                    }
+
+                    //once the local sync is done, retire useless threads
+                    if (idx >= total_threads_needed)
+                        return;
+
+                    //once all the local TGs are sync, do fence so that other GPU can see.
+                    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+
+                    //wait for completion of the atomic sync
+                    status0 = lsc_atomic_update<atomic_op::load,
+                                                int,
+                                                SIMD_ATOMIC,
+                                                lsc_data_size::default_size,
+                                                cache_hint::none,
+                                                cache_hint::none>(local_sync_ptr, ramp, pred);
+                    while (status0[1] != total_threads_needed * temp_world) {
+                        status0 = lsc_atomic_update<atomic_op::load,
+                                                    int,
+                                                    SIMD_ATOMIC,
+                                                    lsc_data_size::default_size,
+                                                    cache_hint::none,
+                                                    cache_hint::none>(local_sync_ptr, ramp, pred);
+                    }
+
+                    //reset the sync counter for the next reduce_scatter session. Each rank resets its own buffer
+                    if (idx == 0) { //one thread in the local gpu notifies the remote gpu of its status.
+                        int buffer_index_to_reset = (buffer_index_kernel + 2) % 3;
+                        status0 = 0;
+                        pred = true;
+                        //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        local_sync_ptr = (int *)temp_sync_buffer[temp_rank];
+                        local_sync_ptr += buffer_index_to_reset * size_per_buffer_for_sync_kernel;
+                        lsc_atomic_update<atomic_op::store,
+                                          int,
+                                          SIMD_ATOMIC,
+                                          lsc_data_size::default_size,
+                                          cache_hint::none,
+                                          cache_hint::none>(
+                            local_sync_ptr, ramp, status0, pred); //reset the first half of sync buffer
+                    }
+
+                    if (idx > total_threads_needed_for_reduce)
+                        return;
+
+                    //at this point, all the threads are done copying data from input buffer to temp buffer.
+                    //do partial reduce
+                    uint32_t send_offset = recv_size * temp_rank + idx * SIMD * UNROLL_SIZE * kernel_inner_loop;
+
+                    simd<data_type, SIMD * UNROLL_SIZE> result;
+                    for (int i = 0; i < kernel_inner_loop; i++) {
+                        if (temp_world == 4) {
+                            data_type *peer_ptr0 =
+                                ((data_type *)temp_buffer[0]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr1 =
+                                ((data_type *)temp_buffer[1]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr2 =
+                                ((data_type *)temp_buffer[2]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr3 =
+                                ((data_type *)temp_buffer[3]) + buffer_index_kernel * size_per_buffer_kernel;
+
+#pragma unroll
+                            for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 0 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr0 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 1 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr1 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 2 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr2 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 3 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr3 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                            }
+                            //do the actual reduction
+                            result = 0;
+#pragma unroll
+                            for (int r = 0; r < 4; r++) {
+                                //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                                result =
+                                    result + buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            }
+                        }
+                        else if (temp_world == 8) {
+                            data_type *peer_ptr0 =
+                                ((data_type *)temp_buffer[0]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr1 =
+                                ((data_type *)temp_buffer[1]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr2 =
+                                ((data_type *)temp_buffer[2]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr3 =
+                                ((data_type *)temp_buffer[3]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr4 =
+                                ((data_type *)temp_buffer[4]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr5 =
+                                ((data_type *)temp_buffer[5]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr6 =
+                                ((data_type *)temp_buffer[6]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr7 =
+                                ((data_type *)temp_buffer[7]) + buffer_index_kernel * size_per_buffer_kernel;
+
+#pragma unroll
+                            for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 0 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr0 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 1 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr1 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 2 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr2 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 3 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr3 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 4 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr4 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 5 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr5 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 6 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr6 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 7 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr7 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                            }
+                            //do the actual reduction
+                            result = 0;
+#pragma unroll
+                            for (int r = 0; r < 8; r++) {
+                                //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                                result =
+                                    result + buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            }
+                        }
+                        else if (temp_world == 16) {
+                            //first 8 ranks processing
+                            data_type *peer_ptr0 =
+                                ((data_type *)temp_buffer[0]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr1 =
+                                ((data_type *)temp_buffer[1]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr2 =
+                                ((data_type *)temp_buffer[2]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr3 =
+                                ((data_type *)temp_buffer[3]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr4 =
+                                ((data_type *)temp_buffer[4]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr5 =
+                                ((data_type *)temp_buffer[5]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr6 =
+                                ((data_type *)temp_buffer[6]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr7 =
+                                ((data_type *)temp_buffer[7]) + buffer_index_kernel * size_per_buffer_kernel;
+                            //second 8 ranks processing
+                            data_type *peer_ptr8 =
+                                ((data_type *)temp_buffer[8]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr9 =
+                                ((data_type *)temp_buffer[9]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr10 =
+                                ((data_type *)temp_buffer[10]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr11 =
+                                ((data_type *)temp_buffer[11]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr12 =
+                                ((data_type *)temp_buffer[12]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr13 =
+                                ((data_type *)temp_buffer[13]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr14 =
+                                ((data_type *)temp_buffer[14]) + buffer_index_kernel * size_per_buffer_kernel;
+                            data_type *peer_ptr15 =
+                                ((data_type *)temp_buffer[15]) + buffer_index_kernel * size_per_buffer_kernel;
+
+#pragma unroll
+                            for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 0 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr0 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 1 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr1 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 2 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr2 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 3 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr3 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 4 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr4 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 5 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr5 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 6 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr6 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 7 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr7 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 8 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr8 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 9 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr9 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 10 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr10 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 11 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr11 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 12 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr12 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 13 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr13 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 14 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr14 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                buffer.template select<SIMD, 1>(unroll_i * SIMD + 15 * SIMD * UNROLL_SIZE) =
+                                    lsc_block_load<data_type,
+                                                   SIMD,
+                                                   lsc_data_size::default_size,
+                                                   cache_hint::uncached,
+                                                   cache_hint::uncached>(peer_ptr15 + send_offset +
+                                                                         unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                            }
+                            //do the actual reduction
+                            result = 0;
+#pragma unroll
+                            for (int r = 0; r < 16; r++) {
+                                //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                                result =
+                                    result + buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            }
+                        }
+                        else { //fewer than 16 ranks (2, 4, ..., 14). So there is no problem of overflowing the buffer.
+                            for (uint32_t r = 0; r < temp_world; r++) {
+                                data_type *peer_ptr =
+                                    ((data_type *)temp_buffer[r]) + buffer_index_kernel * size_per_buffer_kernel;
+#pragma unroll
+                                for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                                    buffer.template select<SIMD, 1>(unroll_i * SIMD + r * SIMD * UNROLL_SIZE) =
+                                        lsc_block_load<data_type,
+                                                       SIMD,
+                                                       lsc_data_size::default_size,
+                                                       cache_hint::uncached,
+                                                       cache_hint::uncached>(
+                                            peer_ptr + send_offset + unroll_i * SIMD + i * SIMD * UNROLL_SIZE);
+                                }
+                            }
+                            //do the actual reduction
+                            result = 0;
+                            for (uint32_t r = 0; r < temp_world; r++) {
+                                //result += buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                                result =
+                                    result + buffer.template select<SIMD * UNROLL_SIZE, 1>(r * SIMD * UNROLL_SIZE);
+                            }
+                        }
+
+                        //write out the results
+                        if (offset + i * SIMD * UNROLL_SIZE + UNROLL_SIZE * SIMD <= recv_size) {
+#pragma unroll
+                            for (int unroll_i = 0; unroll_i < UNROLL_SIZE; unroll_i++) {
+                                lsc_block_store<data_type,
+                                                SIMD,
+                                                lsc_data_size::default_size,
+                                                cache_hint::write_back,
+                                                cache_hint::write_back>(
+                                    (data_type *)out_buffer + offset + unroll_i * SIMD + i * SIMD * UNROLL_SIZE,
+                                    result.template select<SIMD, 1>(unroll_i * SIMD));
+                            }
+                        }
+                        else if (offset + i * SIMD * UNROLL_SIZE < recv_size) {
+                            int count = recv_size - (offset + i * SIMD * UNROLL_SIZE);
+                            for (int c = 0; c < count; c++) {
+                                ((data_type *)out_buffer)[offset + i * SIMD * UNROLL_SIZE + c] = result[c];
+                            }
+                        }
+                        else
+                            break;
+                    }
+                });
+        });
+        //e.wait();
+        return e;
+    }
+
+    template <int kernel_inner_loop_scalar>
+    sycl::event reduce_scatter_scalar(sycl::queue &queue,
+                                      const void *send_buf,
+                                      void *out_buffer,
+                                      ccl::datatype dtype,
+                                      uint32_t recv_size,
+                                      bool &done) {
+        using namespace __ESIMD_NS;
+        using namespace __ESIMD_ENS;
+
+        sycl::event e;
+        uint32_t temp_rank = rank;
+        uint32_t temp_world = world;
+
+        assert(this->initialized == true);
+
+        uint32_t total_count = recv_size * world;
+
+        done = true;
+
+        void *temp_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_buffer[i] = buffers[i];
+        }
+        void *temp_sync_buffer[max_rank];
+        for (int i = 0; i < world; i++) {
+            temp_sync_buffer[i] = sync_buffer[i];
+        }
+
+        int size_per_buffer_kernel = size_per_buffer / sizeof(data_type);
+        int size_per_buffer_for_sync_kernel = size_per_buffer_kernel / (sizeof(int) / sizeof(data_type));
+
+        const int wg_size = 16;
+
+        uint32_t total_threads_needed_for_reduce =
+            (recv_size + kernel_inner_loop_scalar - 1) / kernel_inner_loop_scalar;
+        uint32_t total_threads_needed;
+        uint32_t copy_count;
+        if (total_threads_needed_for_reduce > MAX_THREAD) {
+            total_threads_needed = total_threads_needed_for_reduce;
+            copy_count = temp_world;
+        }
+        else {
+            total_threads_needed = (total_count + kernel_inner_loop_scalar - 1) / kernel_inner_loop_scalar;
+            copy_count = 1;
+        }
+        uint32_t total_threads_dispatched = (total_threads_needed + wg_size - 1) / wg_size * wg_size;
+        uint32_t total_wg_count = total_threads_dispatched / wg_size;
+
+        int buffer_index_kernel = buffer_index;
+        buffer_index++;
+        buffer_index %= TRIPLE_BUFFER;
+
+        e = queue.submit([&](sycl::handler &cgh) {
+            cgh.parallel_for<Reduce_scatter_small_kernel_scalar<data_type, kernel_inner_loop_scalar>>(
+                sycl::nd_range<1>({ total_threads_dispatched }, wg_size),
+                [=](sycl::nd_item<1> idx2) [[intel::reqd_sub_group_size(wg_size)]] {
+                    //slm_init(1024);
+                    uint32_t idx = idx2.get_global_id();
+
+                    //ESIMD kernel
+                    int *local_sync_ptr;
+
+                    //use the temp buffer for the current rank to copy the data to.
+                    data_type *local_temp_ptr = (data_type *)temp_buffer[temp_rank];
+                    local_temp_ptr +=
+                        (buffer_index_kernel *
+                         size_per_buffer_kernel); //point to the correct buffer inside the triple buffer
+
+                    //process the input only if the thread is useful
+                    if (idx < total_threads_needed) {
+                        uint32_t offset __attribute__((unused)) = idx * kernel_inner_loop_scalar * copy_count;
+                        //*(local_temp_ptr + idx) = *((data_type *)send_buf + idx);
+                        if (offset + kernel_inner_loop_scalar * copy_count <= total_count) {
+                            gpu_kernel_copy((char *)(local_temp_ptr + offset),
+                                            (const char *)((data_type *)send_buf + offset),
+                                            kernel_inner_loop_scalar * copy_count * sizeof(data_type));
+                        }
+                        else {
+                            int count = total_count - offset;
+                            gpu_kernel_copy((char *)(local_temp_ptr + offset),
+                                            (const char *)((data_type *)send_buf + offset),
+                                            count * sizeof(data_type));
+                        }
+                    }
+
+                    //since each thread copies only a small chunk of data to the temp buffer, all threads need to sync globally using atomics within this rank
+
+                    //sync locally within local GPU first.
+                    local_sync_ptr = (int *)temp_sync_buffer
+                        [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                    local_sync_ptr += (buffer_index_kernel * size_per_buffer_for_sync_kernel);
+
+                    //if more than one thread is required per rank, then do the local sync within the rank first.
+                    if (total_threads_needed > 1) {
+                        //do local sync via global L3 atomics (the TG-barrier step below is currently disabled).
+                        uint32_t local_tid = idx2.get_local_linear_id();
+                        if (local_tid == 0) {
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(local_sync_ptr[0]);
+                            atomic_p += 1;
+
+                            //wait for all the local TG to sync. Then sync the other remote GPUs
+                            uint32_t val = atomic_p.load();
+                            while (val != total_wg_count) {
+                                val = atomic_p.load();
+                            }
+                        }
+                        //idx2.barrier();
+                    }
+
+                    //once the local-level sync is done, atomically increment the sync counters of the other remote GPUs
+                    if (total_threads_dispatched >= temp_world) {
+                        if (idx < temp_world) {
+                            int *sync_ptr = (int *)temp_sync_buffer[idx];
+                            sync_ptr += (buffer_index_kernel * size_per_buffer_for_sync_kernel);
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(sync_ptr[1]);
+                            atomic_p++;
+                        }
+                    }
+                    else if (idx == 0) { //one thread in the local gpu notifies the remote gpu of its status.
+                        for (uint32_t i = 0; i < temp_world; i++) {
+                            int *sync_ptr;
+                            sync_ptr = (int *)temp_sync_buffer
+                                [i]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                            sync_ptr += (buffer_index_kernel * size_per_buffer_for_sync_kernel);
+                            sycl::atomic_ref<int,
+                                             sycl::memory_order::relaxed,
+                                             sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>
+                                atomic_p(sync_ptr[1]);
+                            atomic_p++;
+                        }
+                    }
+
+                    //once the local sync is done, retire useless threads
+                    if (idx >= total_threads_needed)
+                        return;
+
+                    //once all the local TGs are sync, do fence so that other GPU can see.
+                    //lsc_fence<lsc_memory_kind::untyped_global, lsc_fence_op::none, lsc_scope::gpus>();
+
+                    //wait for completion of the atomic sync
+                    sycl::atomic_ref<int,
+                                     sycl::memory_order::relaxed,
+                                     sycl::memory_scope::device,
+                                     sycl::access::address_space::global_space>
+                        atomic_p(local_sync_ptr[1]);
+                    uint32_t val = atomic_p.load();
+                    while (val < temp_world) {
+                        val = atomic_p.load();
+                    }
+
+                    //reset the sync counter for the next reduce_scatter invocation. Each rank resets its own buffer
+                    if (idx == 0) { //one thread per rank resets this rank's own sync counters for reuse.
+                        int buffer_index_to_reset = (buffer_index_kernel + 2) % 3;
+                        local_sync_ptr = (int *)temp_sync_buffer
+                            [temp_rank]; //the buffer might be located in remote GPU. But during the atomics, local L2 should be utilized.
+                        local_sync_ptr += (buffer_index_to_reset * size_per_buffer_for_sync_kernel);
+                        local_sync_ptr[0] = local_sync_ptr[1] = 0;
+                    }
+
+                    //at this point, all the threads are done copying data from input buffer to temp buffer.
+                    if (idx >= total_threads_needed_for_reduce)
+                        return;
+
+                    //data_type result[kernel_inner_loop_scalar];
+                    uint32_t send_offset = temp_rank * recv_size + idx * kernel_inner_loop_scalar;
+
+                    data_type *out_ptr = (data_type *)out_buffer + idx * kernel_inner_loop_scalar;
+                    switch (temp_world) {
+                        case 2:
+                            reduce_kernel<data_type, 2, kernel_inner_loop_scalar>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                send_offset,
+                                out_ptr);
+                            break;
+                        case 4:
+                            reduce_kernel<data_type, 4, kernel_inner_loop_scalar>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                send_offset,
+                                out_ptr);
+                            break;
+                        case 6:
+                            reduce_kernel<data_type, 6, kernel_inner_loop_scalar>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                send_offset,
+                                out_ptr);
+                            break;
+                        case 8:
+                            reduce_kernel<data_type, 8, kernel_inner_loop_scalar>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                send_offset,
+                                out_ptr);
+                            break;
+                        case 10:
+                            reduce_kernel<data_type, 10, kernel_inner_loop_scalar>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                send_offset,
+                                out_ptr);
+                            break;
+                        case 12:
+                            reduce_kernel<data_type, 12, kernel_inner_loop_scalar>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                send_offset,
+                                out_ptr);
+                            break;
+                        case 14:
+                            reduce_kernel<data_type, 14, kernel_inner_loop_scalar>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                send_offset,
+                                out_ptr);
+                            break;
+                        case 16:
+                            reduce_kernel<data_type, 16, kernel_inner_loop_scalar>(
+                                (void **)temp_buffer,
+                                buffer_index_kernel * size_per_buffer_kernel,
+                                send_offset,
+                                out_ptr);
+                            break;
+                        default: assert(0);
+                    }
+                });
+        });
+        return e;
+    }
+
+private:
+    void *buffers[max_rank]{};
+    void *sync_buffer[max_rank]{};
+    size_t offsets[max_rank]{};
+    ze_ipc_mem_handle_t ipc_handle[max_rank]{};
+    int rank{ ccl::utils::invalid_rank }, world{ ccl::utils::invalid_err_code };
+    int buffer_index{ ccl::utils::invalid_err_code };
+    int size_per_buffer{ ccl::utils::invalid_bytes_value };
+    int data_size_per_buffer{ ccl::utils::invalid_bytes_value };
+};
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.cpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.cpp
new file mode 100644
index 000000000..8bbc22bdf
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.cpp
@@ -0,0 +1,136 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "coll/algorithms/utils/sycl_coll_base.hpp"
+
+#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+#include "coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.hpp"
+#endif // defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+
+namespace ccl {
+namespace v1 {
+
+struct impl_dispatch {
+    template <class Object>
+    const typename Object::impl_value_t& operator()(const Object& obj) {
+        return obj.get_impl();
+    }
+};
+
+ccl::event reduce_scatter_sycl(sycl::queue q,
+                               const void* send_buf,
+                               void* recv_buf,
+                               size_t recv_count,
+                               datatype dtype,
+                               reduction reduction,
+                               const ccl::communicator& comm,
+                               const stream& op_stream,
+                               bool& done) {
+    ccl::event e;
+    done = true;
+
+    uint32_t world = comm.size();
+    int rank = comm.rank();
+
+    auto ccl_dtype = ccl::global_data::get().dtypes->get(dtype);
+
+    if (world == 1) {
+        sycl::event sycl_e;
+        if (send_buf != recv_buf) {
+            sycl_e = q.memcpy(recv_buf, send_buf, recv_count * ccl_dtype.size());
+        }
+        return ccl::event::create_from_native(sycl_e);
+    }
+
+    ccl::impl_dispatch disp;
+    std::shared_ptr<ccl::comm_interface> disp_comm = disp(comm);
+    ccl_comm* global_comm = (ccl_comm*)(disp_comm.get());
+    ccl_stream* global_stream = get_stream_ptr(disp(op_stream));
+
+    const bool is_single_tile = global_comm->get_pair_comm()->size() == 1;
+    const bool has_all_vertices_connected = global_comm->get_topo_manager().has_all_vertices_connected();
+    LOG_DEBUG("|CCL_SYCL| has_all_vertices_connected", has_all_vertices_connected);
+
+    if (recv_count * world * ccl_dtype.size() <= ccl::global_data::env().reduce_scatter_small_size_threshold &&
+        has_all_vertices_connected) {
+        if ((recv_count * ccl_dtype.size()) % 4 == 0 || recv_count * ccl_dtype.size() == 2) {
+            init_reduce_scatter_small(dtype, q, global_comm, global_stream, rank, world);
+
+#ifdef CCL_ENABLE_ITT
+            __itt_event coll_create_itt_event = ccl::profile::itt::event_get("CCL_REDUCE_SCATTER_SMALL");
+            ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+            LOG_DEBUG(
+                "|CCL_SYCL| reduce_scatter selects small kernel, recv_count:", recv_count, " datatype: ", dtype);
+            e = run_reduce_scatter_small(dtype, q, send_buf, recv_buf, recv_count, done);
+            LOG_DEBUG("|CCL_SYCL| reduce_scatter selects small kernel, recv_count:",
+                      recv_count,
+                      " datatype: ",
+                      dtype,
+                      "done");
+#ifdef CCL_ENABLE_ITT
+            ccl::profile::itt::event_end(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+        }
+        else {
+            done = false;
+        }
+    }
+    else if (recv_count * world * ccl_dtype.size() <=
+                 ccl::global_data::env().reduce_scatter_medium_size_threshold &&
+             !is_single_tile) {
+        if ((recv_count * ccl_dtype.size()) % 4 == 0) {
+            init_reduce_scatter_medium(dtype, q, global_comm, global_stream, rank, world);
+
+#ifdef CCL_ENABLE_ITT
+            __itt_event coll_create_itt_event = ccl::profile::itt::event_get("CCL_REDUCE_SCATTER_MEDIUM");
+            ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+            LOG_DEBUG("|CCL_SYCL| reduce_scatter selects medium kernel: count:", recv_count, " datatype: ", dtype);
+            e = run_reduce_scatter_medium(dtype, q, send_buf, recv_buf, recv_count, done);
+#ifdef CCL_ENABLE_ITT
+            ccl::profile::itt::event_end(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+        }
+        else {
+            done = false;
+        }
+    }
+    else if (!is_single_tile) {
+        if ((recv_count * ccl_dtype.size()) % 4 == 0) {
+            init_reduce_scatter_large(dtype, q, global_comm, global_stream, rank, world);
+
+#ifdef CCL_ENABLE_ITT
+            __itt_event coll_create_itt_event = ccl::profile::itt::event_get("CCL_REDUCE_SCATTER_LARGE");
+            ccl::profile::itt::event_start(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+            LOG_DEBUG("|CCL_SYCL| reduce_scatter selects large kernel: count:", recv_count, " datatype: ", dtype);
+            e = run_reduce_scatter_large(dtype, q, send_buf, recv_buf, recv_count, done);
+#ifdef CCL_ENABLE_ITT
+            ccl::profile::itt::event_end(coll_create_itt_event);
+#endif // CCL_ENABLE_ITT
+        }
+        else {
+            done = false;
+        }
+    }
+    else {
+        done = false;
+    }
+    return e;
+}
+
+} // namespace v1
+} // namespace ccl
diff --git a/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.hpp b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.hpp
new file mode 100644
index 000000000..890ad78a6
--- /dev/null
+++ b/src/coll/algorithms/reduce_scatter/sycl/reduce_scatter_sycl.hpp
@@ -0,0 +1,46 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#define SYCL_REDUCE_SCATTER_FUNCTIONS(MSGSIZE) \
+    void init_reduce_scatter_##MSGSIZE(ccl::datatype dtype, \
+                                       sycl::queue &queue, \
+                                       ccl_comm *comm, \
+                                       ccl_stream *stream, \
+                                       uint32_t rank_in, \
+                                       uint32_t world_in); \
+    ccl::event run_reduce_scatter_##MSGSIZE( \
+        ccl::datatype dtype, sycl::queue q, const void *send_buf, void *rev_buf, size_t recv_count, bool &done);
+
+SYCL_REDUCE_SCATTER_FUNCTIONS(small)
+SYCL_REDUCE_SCATTER_FUNCTIONS(medium)
+SYCL_REDUCE_SCATTER_FUNCTIONS(large)
+
+namespace ccl {
+namespace v1 {
+
+ccl::event reduce_scatter_sycl(sycl::queue q,
+                               const void *send_buf,
+                               void *recv_buf,
+                               size_t recv_count,
+                               datatype dtype,
+                               reduction reduction,
+                               const ccl::communicator &comm,
+                               const stream &op_stream,
+                               bool &done);
+
+}
+} // namespace ccl
diff --git a/src/coll/algorithms/utils/sycl_coll_base.hpp b/src/coll/algorithms/utils/sycl_coll_base.hpp
new file mode 100644
index 000000000..71d624bec
--- /dev/null
+++ b/src/coll/algorithms/utils/sycl_coll_base.hpp
@@ -0,0 +1,225 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <system_error>
+
+#include <sycl/sycl.hpp>
+#include <ext/intel/esimd.hpp>
+
+#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+#include "comm/comm_interface.hpp"
+#endif //#if defined(CCL_ENABLE_ZE) || defined(CCL_ENABLE_SYCL)
+
+#include "comm/comm.hpp"
+#include "coll/coll_util.hpp"
+#include "sched/entry/factory/entry_factory.hpp"
+#include "ccl_api_functions_generators.hpp"
+#include "common/global/global.hpp"
+#include "common/api_wrapper/mpi_api_wrapper.hpp"
+
+// TODO: timers can re used, but place in more general place
+class timer {
+public:
+    virtual double get_us(uint32_t i) const = 0;
+    virtual int size() const = 0;
+};
+
+template <uint32_t steps_per_instance = 1>
+class gpu_timer : timer {
+    std::array<sycl::event, steps_per_instance> v_events;
+
+public:
+    inline void record(uint32_t i, sycl::event e) {
+        v_events[i] = e;
+    }
+    double get_us(uint32_t i) const {
+        auto start =
+            v_events[i].template get_profiling_info<sycl::info::event_profiling::command_start>();
+        auto end =
+            v_events[i].template get_profiling_info<sycl::info::event_profiling::command_end>();
+        return (end - start) / 1000.0;
+    }
+    double get_start_us(uint32_t i) const {
+        auto start =
+            v_events[i].template get_profiling_info<sycl::info::event_profiling::command_start>();
+        return start / 1000.0;
+    }
+    double get_end_us(uint32_t i) const {
+        auto end =
+            v_events[i].template get_profiling_info<sycl::info::event_profiling::command_end>();
+        return end / 1000.0;
+    }
+    int size() const {
+        return steps_per_instance;
+    }
+};
+
+template <uint32_t steps_per_instance = 1>
+class cpu_timer : timer {
+    std::array<std::chrono::time_point<std::chrono::steady_clock>, steps_per_instance> v_start,
+        v_end;
+
+public:
+    inline void start(uint32_t i) {
+        v_start[i] = std::chrono::steady_clock::now();
+    }
+    inline void stop(uint32_t i) {
+        v_end[i] = std::chrono::steady_clock::now();
+    }
+    double get_us(uint32_t i) const {
+        using namespace std::chrono;
+        return duration_cast<microseconds>(v_end[i] - v_start[i]).count();
+    }
+    int size() const {
+        return steps_per_instance;
+    }
+};
+
+inline void gpu_kernel_copy(char *d, const char *s, size_t n) {
+    while (n >= 8) {
+        *(int64_t *)d = *(int64_t *)s;
+        d += 8;
+        s += 8;
+        n -= 8;
+    }
+    if (n & 4) {
+        *(int *)d = *(int *)s;
+        d += 4;
+        s += 4;
+        n -= 4;
+    }
+    if (n & 2) {
+        *(int16_t *)d = *(int16_t *)s;
+        d += 2;
+        s += 2;
+        n -= 2;
+    }
+    if (n == 1) {
+        *(char *)d = *(char *)s;
+    }
+}
+
+template <typename data_type>
+struct sycl_coll_base {
+public:
+    sycl_coll_base() {
+        initialized = false;
+        sched = NULL;
+    }
+
+    inline int inited() {
+        return initialized;
+    }
+
+protected:
+    void exchange_peer_ipc_mem(sycl::queue &queue,
+                               ccl_comm *comm,
+                               ccl_stream *stream,
+                               void *send_ptr,
+                               void *recv_ptr,
+                               int rank,
+                               int world,
+                               int data_size_per_buffer,
+                               void **send_buffers,
+                               void **sync_buffer,
+                               size_t *offsets,
+                               ze_ipc_mem_handle_t *ipc_handle,
+                               void **recv_buffers,
+                               void **mmap_buffers = NULL,
+                               bool to_cache = true) {
+        // use infrastructure
+        // 10us to create a sched
+        // 6-14us to create exchange_entry
+        // 80-128us  to call update
+        // 10us to fill buffers
+        // 20-30us to free
+        ccl_comm *node_comm = comm->get_node_comm().get();
+        std::vector<ze_handle_exchange_entry::mem_desc_t> in_buffers;
+
+        in_buffers = {
+            { send_ptr, ccl::ze::ipc_mem_type::memory },
+        };
+        if (recv_ptr && send_ptr != recv_ptr) {
+            in_buffers.push_back({ recv_ptr, ccl::ze::ipc_mem_type::memory });
+        }
+
+        if (!sched) {
+            ccl_coll_param param{};
+            param.comm = comm;
+            param.stream = stream;
+            ccl_coll_attr attr{};
+            sched = ccl_sched::create(param, attr);
+        }
+
+        ccl::utils::pt2pt_handle_exchange_info info = {};
+        int skip_rank = ccl_comm::invalid_rank;
+        ze_handle_exchange_entry *exchange_entry =
+            new ze_handle_exchange_entry(sched, node_comm, in_buffers, skip_rank, info);
+        exchange_entry->update(); //    128us
+
+        size_t send_buf_idx = 0;
+        std::vector<ccl_buffer> peer_send_bufs(world - 1);
+        for (int i = 0; i < world - 1; i++) {
+            int peer_rank = (rank + i + 1) % world;
+            sched->get_memory().handle_manager.get(peer_rank,
+                                                   send_buf_idx,
+                                                   peer_send_bufs[i],
+                                                   node_comm,
+                                                   false /*pt2pt_op*/,
+                                                   to_cache);
+            send_buffers[peer_rank] = peer_send_bufs[i].get_ptr();
+            CCL_THROW_IF_NOT(send_buffers[peer_rank], "null IPC buffer is received");
+        }
+        send_buffers[rank] = send_ptr;
+        if (sync_buffer) {
+            for (int i = 0; i < world; i++) {
+                sync_buffer[i] = (char *)send_buffers[i] + data_size_per_buffer;
+            }
+        }
+        if (recv_ptr) {
+            if (send_ptr != recv_ptr) {
+                size_t recv_buf_idx = 1;
+                std::vector<ccl_buffer> peer_recv_bufs(world - 1);
+                for (int i = 0; i < world - 1; i++) {
+                    int peer_rank = (rank + i + 1) % world;
+                    sched->get_memory().handle_manager.get(peer_rank,
+                                                           recv_buf_idx,
+                                                           peer_recv_bufs[i],
+                                                           node_comm,
+                                                           false /*pt2pt_op*/,
+                                                           to_cache);
+                    recv_buffers[peer_rank] = peer_recv_bufs[i].get_ptr();
+                    CCL_THROW_IF_NOT(recv_buffers[peer_rank], "null IPC buffer is received");
+                }
+                recv_buffers[rank] = recv_ptr;
+            }
+            else {
+                for (int i = 0; i < world; i++) {
+                    recv_buffers[i] = send_buffers[i];
+                }
+            }
+        }
+        delete exchange_entry;
+        sched->clear_memory();
+    }
+
+    bool initialized;
+    ccl_sched *sched;
+};
diff --git a/src/coll/coll.cpp b/src/coll/coll.cpp
index aa443a09d..a32971be6 100644
--- a/src/coll/coll.cpp
+++ b/src/coll/coll.cpp
@@ -60,8 +60,8 @@
 #include "sched/sched_timer.hpp"
 #include "unordered_coll/unordered_coll.hpp"
 
-ccl_request* allgatherv_case(ccl_coll_param& param) {
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+ccl_request* exec_single_rank_inplace_coll(const ccl_coll_param& param) {
     std::vector<sycl::event> events{};
     for (size_t idx = 0; idx < param.deps.size(); idx++) {
         events.push_back(param.deps[idx].get_native());
@@ -80,7 +80,43 @@ ccl_request* allgatherv_case(ccl_coll_param& param) {
         req->set_native_event(ev);
         return req;
     }
+    return nullptr;
+}
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
+ccl_request* exec_single_rank_coll(const ccl_coll_param& param) {
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    if (param.is_inplace()) {
+        LOG_DEBUG("single rank: inplace case, coll: ", ccl_coll_type_to_str(param.ctype));
+        return exec_single_rank_inplace_coll(param);
+    }
+    else {
+        std::vector<sycl::event> events{};
+        if (!ccl::is_queue_in_order(param.stream)) {
+            for (size_t idx = 0; idx < param.deps.size(); idx++) {
+                events.push_back(param.deps[idx].get_native());
+            }
+        }
+
+        sycl::queue sycl_stream = param.stream->get_native_stream();
+        ccl_coll_param dummy_param{};
+        dummy_param.comm = param.comm;
+        auto dummy_sched = ccl_sched::create(dummy_param, {});
+        auto req = dummy_sched->get_request();
+
+        auto event = sycl_stream.memcpy(param.recv_bufs[0],
+                                        param.send_bufs[0],
+                                        param.send_counts[0] * param.dtype.size(),
+                                        events);
+        event.wait();
+        req->set_native_event(event);
+
+        LOG_DEBUG("single rank: out-of-place case, coll: ", ccl_coll_type_to_str(param.ctype));
+        return req;
+    }
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+    CCL_THROW_IF_NOT(
+        "single rank case for: ", ccl_coll_type_to_str(param.ctype), "is not supported");
     return nullptr;
 }
 
@@ -95,6 +131,32 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
 #endif // CCL_ENABLE_ITT
 
 #ifdef CCL_ENABLE_SYCL
+    /* 0. set dependencies for the collective */
+    // Submit a barrier if necessary to sync queue. The event from the barrier is added
+    // to other deps
+    // The main purpose of the barrier is to sync user's in-order queue with our out-of-order
+    // queue, so we don't execute anything before the user's tasks are completed.
+    // We don't really need anything like this for the case when user has out-of-order queue as
+    // there is no ordering requirement unless dependencies are explicitly provided and which we
+    // handle as well.
+    bool is_queue_in_order = ccl::is_queue_in_order(param.stream);
+    if (is_queue_in_order) {
+        // TODO: it would be nice to pass here all the dependencies as parameters to submit_barrier
+        // and get a single event to use later.
+        try {
+            // Note: submit_barrier with empty event vector doesn't do anything and just return an
+            // empty event as opposed to submit_barrier without paramers which submits a full
+            // queue barrier. And there is a bug which leads to a crash if empty sycl event is
+            // passed to the function.
+            auto sycl_event = ccl::utils::submit_barrier(param.stream->get_native_stream());
+            param.deps.push_back(ccl::create_event(sycl_event));
+        }
+        catch (ccl::exception&) {
+            LOG_WARN("Failed to submit sycl barrier in front of CCL collectives."
+                     "This might lead to the incorrect results");
+        }
+    }
+
     if (ccl::global_data::env().enable_op_sync)
         attr.synchronous = 1;
 
@@ -102,14 +164,6 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
     // this is needed because OFI transport means we need ring barrier
     if (ccl::global_data::env().atl_transport == ccl_atl_ofi)
         attr.synchronous = 1;
-
-    auto is_inorder_queue = ccl::is_queue_in_order(param.stream);
-    if ((param.ctype == ccl_coll_send || param.ctype == ccl_coll_recv) && is_inorder_queue)
-        attr.synchronous = 1;
-
-    auto enable_sycl_output_event_saved = ccl::global_data::env().enable_sycl_output_event;
-    ccl::enable_sycl_output_barrier_in_order_queue(param.stream);
-
 #endif // CCL_ENABLE_SYCL
 
     LOG_DEBUG("\n{\n",
@@ -125,8 +179,30 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
 
     ccl::global_data& data = ccl::global_data::get();
 
-    if (param.ctype == ccl_coll_allgatherv && param.comm->size() == 1 && param.is_inplace()) {
-        return allgatherv_case(param);
+    ccl_selector_param selector_param{};
+    selector_param.ctype = param.ctype;
+    selector_param.count = param.count;
+    if (param.ctype == ccl_coll_allgatherv) {
+        selector_param.count = param.send_count;
+    }
+    selector_param.recv_counts =
+        const_cast<size_t*>(reinterpret_cast<const size_t*>(param.recv_counts.data()));
+    selector_param.dtype = param.dtype;
+    selector_param.comm = param.comm;
+    selector_param.stream = param.stream;
+    selector_param.buf = (param.send_buf) ? param.send_buf.get_ptr() : param.recv_buf.get_ptr();
+    selector_param.is_vector_buf = attr.is_vector_buf;
+#ifdef CCL_ENABLE_SYCL
+    selector_param.is_sycl_buf = attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
+    selector_param.hint_algo = param.hint_algo;
+    selector_param.peer_rank = param.peer_rank;
+    selector_param.is_scaleout = param.is_scaleout;
+
+    if (!(param.ctype == ccl_coll_barrier || param.ctype == ccl_coll_send ||
+          param.ctype == ccl_coll_recv) &&
+        param.stream && param.comm->size() == 1 && ccl_is_device_side_algo(selector_param)) {
+        return exec_single_rank_coll(param);
     }
 
     /* 1. decide whether schedule should be postponed (this includes caching and starting) */
@@ -160,8 +236,9 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
     if (!postpone_schedule &&
         ccl::global_data::env().enable_fusion
 #ifdef CCL_ENABLE_SYCL
-        // TODO: enable fusion for async case with CCL_SYCL_OUTPUT_EVENT
-        && (attr.synchronous || !ccl::utils::should_use_sycl_output_event(sched->coll_param.stream))
+        // TODO: enable fusion for async case with sycl output event or in_order sycl queue
+        && (attr.synchronous ||
+            (!ccl::utils::should_use_sycl_output_event(param.stream) && !is_queue_in_order))
 #endif //CCL_ENABLE_SYCL
     ) {
         if (data.fusion_manager->add(sched)) {
@@ -191,24 +268,29 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
     /* 6. regular schedule execution */
     ccl_request* request = sched->start(data.executor.get());
     if (sched->coll_attr.synchronous) {
-        request->synchronous = true;
         // request->synchronous is true,
         // so ccl_wait_impl should not release the `request`
         auto wait_result = ccl_wait_impl<ccl_sched>(data.executor.get(), request);
         CCL_THROW_IF_NOT(wait_result != ccl_wait_result_completed_released,
                          "internal error, valid request was released");
+#ifdef CCL_ENABLE_SYCL
+        if (ccl::utils::should_use_sycl_output_event(param.stream) || is_queue_in_order) {
+            request->set_native_event(
+                ccl::utils::submit_barrier(param.stream->get_native_stream()));
+        }
+#endif // CCL_ENABLE_SYCL
     }
 #ifdef CCL_ENABLE_SYCL
-    else if (ccl::utils::should_use_sycl_output_event(sched->coll_param.stream)) {
+    else if (ccl::utils::should_use_sycl_output_event(param.stream) || is_queue_in_order) {
         LOG_DEBUG("waiting for sched ", sched, " to be submitted_to_gpu");
         while (!sched->is_submitted_to_gpu() && !request->is_completed()) {
             data.executor.get()->do_work();
         }
         LOG_DEBUG("setting sycl_barrier on sched ", sched);
         if (!request->is_completed()) {
-            if (is_inorder_queue) {
+            if (is_queue_in_order) {
                 request->set_native_event(ccl::utils::submit_barrier(
-                    sched->coll_param.stream->get_native_stream(), request->get_sync_event()));
+                    param.stream->get_native_stream(), request->get_sync_event()));
             }
             else {
                 request->set_native_event(request->get_sync_event());
@@ -216,11 +298,9 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
         }
         else {
             request->set_native_event(
-                ccl::utils::submit_barrier(sched->coll_param.stream->get_native_stream()));
+                ccl::utils::submit_barrier(param.stream->get_native_stream()));
         }
     }
-    ccl::global_data::env().enable_sycl_output_event = enable_sycl_output_event_saved;
-    LOG_DEBUG("CCL_SYCL_OUTPUT_EVENT is restored: ", enable_sycl_output_event_saved);
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_ITT
@@ -235,6 +315,7 @@ ccl::status ccl_coll_build_allgatherv(ccl_sched* sched,
                                       size_t send_count,
                                       ccl_buffer recv_buf,
                                       const size_t* recv_counts,
+                                      const std::vector<ccl_buffer>& recv_device_bufs,
                                       const ccl_datatype& dtype,
                                       ccl_comm* comm,
                                       bool is_scaleout) {
@@ -288,8 +369,15 @@ ccl::status ccl_coll_build_allgatherv(ccl_sched* sched,
                 sched, send_buf, send_count, recv_buf, recv_counts, dtype, comm));
             break;
         case ccl_coll_allgatherv_ring:
-            CCL_CALL(ccl_coll_build_ring_allgatherv(
-                nullptr, part_scheds, send_buf, send_count, recv_buf, recv_counts, dtype, comm));
+            CCL_CALL(ccl_coll_build_ring_allgatherv(nullptr,
+                                                    part_scheds,
+                                                    send_buf,
+                                                    send_count,
+                                                    recv_buf,
+                                                    recv_counts,
+                                                    recv_device_bufs,
+                                                    dtype,
+                                                    comm));
             break;
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
         case ccl_coll_allgatherv_topo:
@@ -308,6 +396,7 @@ ccl::status ccl_coll_build_allreduce(ccl_sched* sched,
                                      ccl_buffer send_buf,
                                      ccl_buffer recv_buf,
                                      size_t count,
+                                     const std::vector<ccl_buffer>& recv_device_bufs,
                                      const ccl_datatype& dtype,
                                      ccl::reduction reduction,
                                      ccl_comm* comm,
@@ -345,7 +434,7 @@ ccl::status ccl_coll_build_allreduce(ccl_sched* sched,
             break;
         case ccl_coll_allreduce_ring:
             CCL_CALL(ccl_coll_build_ring_allreduce(
-                sched, send_buf, recv_buf, count, dtype, reduction, comm));
+                sched, send_buf, recv_buf, count, recv_device_bufs, dtype, reduction, comm));
             break;
         case ccl_coll_allreduce_ring_rma:
             CCL_CALL(ccl_coll_build_ring_rma_allreduce(
diff --git a/src/coll/coll.hpp b/src/coll/coll.hpp
index de9192d36..70514a3d8 100644
--- a/src/coll/coll.hpp
+++ b/src/coll/coll.hpp
@@ -34,6 +34,7 @@ ccl::status ccl_coll_build_allgatherv(ccl_sched* sched,
                                       size_t send_count,
                                       ccl_buffer recv_buf,
                                       const size_t* recv_counts,
+                                      const std::vector<ccl_buffer>& recv_device_bufs,
                                       const ccl_datatype& dtype,
                                       ccl_comm* comm,
                                       bool is_scaleout);
@@ -43,6 +44,7 @@ ccl::status ccl_coll_build_allreduce(ccl_sched* sched,
                                      ccl_buffer send_buf,
                                      ccl_buffer recv_buf,
                                      size_t count,
+                                     const std::vector<ccl_buffer>& recv_device_bufs,
                                      const ccl_datatype& dtype,
                                      ccl::reduction reduction,
                                      ccl_comm* comm,
diff --git a/src/coll/coll_param.cpp b/src/coll/coll_param.cpp
index b0b931790..764da7452 100644
--- a/src/coll/coll_param.cpp
+++ b/src/coll/coll_param.cpp
@@ -105,6 +105,8 @@ void ccl_coll_param::copy(const ccl_coll_param& other) {
     recv_bufs = other.recv_bufs;
     send_dev_bufs = other.send_dev_bufs;
     recv_dev_bufs = other.recv_dev_bufs;
+    send_scale_out_bufs = other.send_scale_out_bufs;
+    recv_scale_out_bufs = other.recv_scale_out_bufs;
     send_counts = other.send_counts;
     recv_counts = other.recv_counts;
     send_count = other.send_count;
@@ -451,8 +453,7 @@ void ccl_coll_param::validate() const {
     }
 }
 
-// Optional extra event(from submit_barrier call) to add to our deps list
-void ccl_coll_param::copy_deps(const std::vector<ccl::event>& d, ccl::event* extra) {
+void ccl_coll_param::copy_deps(const std::vector<ccl::event>& d) {
 #ifdef CCL_ENABLE_SYCL
     deps.clear();
     for (size_t idx = 0; idx < d.size(); idx++) {
@@ -463,15 +464,6 @@ void ccl_coll_param::copy_deps(const std::vector<ccl::event>& d, ccl::event* ext
         catch (ccl::exception&) {
         }
     }
-
-    if (extra) {
-        try {
-            auto sycl_event = extra->get_native();
-            deps.push_back(ccl::create_event(sycl_event));
-        }
-        catch (ccl::exception&) {
-        }
-    }
 #else // CCL_ENABLE_SYCL
     CCL_THROW_IF_NOT(d.size() == 0, "host deps are not supported yet");
 #endif // CCL_ENABLE_SYCL
@@ -484,31 +476,6 @@ void ccl_coll_param::set_common_fields(ccl::datatype d,
     dtype = ccl::global_data::get().dtypes->get(d);
     comm = c;
     stream = (ccl_stream*)s;
-
-    sync_deps(s, ds);
-}
-
-// Submit a barrier if necessary to sync queue. The event from the barrier is added
-// to other deps
-void ccl_coll_param::sync_deps(const ccl_stream* s, const std::vector<ccl::event>& ds) {
-#ifdef CCL_ENABLE_SYCL
-    // The main purpose of the barrier is to sync user's in-order queue with our out-of-order
-    // queue, so we don't execute anything before the user's tasks are completed.
-    // We don't really need anything like this for the case when user has out-of-order queue as
-    // there is no ordering requirement unless dependencies are explicitly provided and which we
-    // handle as well.
-    if (ccl::is_queue_in_order(s)) {
-        // TODO: it would be nice to pass here all the dependencies as parameters to submit_barrier
-        // and get a single event to use later. Note: submit_barrier with empty event vector doesn't
-        // do anything and just return an empty event as opposed to submit_barrier without paramers
-        // which submits a full queue barrier. And there is a bug which leads to a crash if
-        // empty sycl event is passed to the function.
-        auto sycl_ev = ccl::utils::submit_barrier(s->get_native_stream());
-        auto e = ccl::create_event(sycl_ev);
-        copy_deps(ds, &e);
-        return;
-    }
-#endif // CCL_ENABLE_SYCL
     copy_deps(ds);
 }
 
diff --git a/src/coll/coll_param.hpp b/src/coll/coll_param.hpp
index fa1e3bb7d..73232ec5c 100644
--- a/src/coll/coll_param.hpp
+++ b/src/coll/coll_param.hpp
@@ -141,6 +141,9 @@ struct ccl_coll_param {
         }
         return *this;
     }
+    // copy-constructor only adds validation,
+    // no need for custom destructor
+    ~ccl_coll_param() = default;
 
     std::string to_string() const;
 
@@ -159,12 +162,11 @@ struct ccl_coll_param {
 
     void validate() const;
 
-    void copy_deps(const std::vector<ccl::event>& d, ccl::event* extra = nullptr);
+    void copy_deps(const std::vector<ccl::event>& d);
     void set_common_fields(ccl::datatype dtype,
                            ccl_comm* comm,
                            const ccl_stream* stream,
                            const std::vector<ccl::event>& deps);
-    void sync_deps(const ccl_stream* s, const std::vector<ccl::event>& ds);
 
     static ccl_coll_param create_allgatherv_param(const void* send_buf,
                                                   size_t send_count,
diff --git a/src/coll/coll_util.cpp b/src/coll/coll_util.cpp
index 5f62cc0d3..8f09f96b2 100644
--- a/src/coll/coll_util.cpp
+++ b/src/coll/coll_util.cpp
@@ -50,24 +50,6 @@ void add_coll_entry(ccl_sched* sched, const ccl_coll_param& param) {
         selector_param.peer_rank = param.peer_rank;
         selector_param.is_scaleout = param.is_scaleout;
 
-#ifdef CCL_ENABLE_SYCL
-        if (ccl_is_device_side_algo(selector_param) &&
-            (global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::none)) {
-            std::string available_ipc_modes{};
-            for (auto& ipc_exchange_name : ipc_exchange_names) {
-                if (ipc_exchange_name.second == "none") {
-                    continue;
-                }
-                available_ipc_modes += ipc_exchange_name.second + " ";
-            }
-
-            CCL_THROW("ERROR: CCL_ZE_IPC_EXCHANGE is set to none, ",
-                      "CCL_ZE_IPC_EXCHANGE must be set explicitly: ",
-                      available_ipc_modes,
-                      ". Hint: OneCCL build may not have support of drmfd");
-        }
-#endif // CCL_ENABLE_SYCL
-
         if (ccl_is_device_side_algo(selector_param)) {
             sched->strict_order = true;
         }
@@ -436,17 +418,64 @@ ze_event_handle_t fill_scaleout_coll_param(const ccl_coll_param& in_coll_param,
                                    wait_events);
     }
     else {
-        out_event = add_copy_entry(in_coll_param.send_buf,
-                                   out_coll_param.send_buf,
-                                   out_coll_param.count,
-                                   out_coll_param.dtype,
-                                   copy_attr(copy_direction::d2h),
-                                   sched,
-                                   wait_events);
+        out_event =
+            add_copy_entry(in_coll_param.send_buf,
+                           out_coll_param.send_buf,
+                           out_coll_param.count,
+                           out_coll_param.dtype,
+                           copy_attr(copy_direction::d2h
+#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
+                                     ,
+                                     ccl::global_data::env().allreduce_pipe_chunk_count > 1
+                                         ? ccl::ze::queue_group_type::main
+                                         : ccl::ze::queue_group_type::unknown // force_queue_type
+#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
+                                     ),
+                           sched,
+                           wait_events);
     }
     return out_event;
 }
 
+// Some algorithms for particular collectives may support H2D copy and internal operations overlapping
+bool is_copy_overlap_enabled(ccl_sched* sched, const ccl_coll_param& coll_param, bool multi_node) {
+    ccl_selector_param selector_param;
+    selector_param.ctype = coll_param.ctype;
+    selector_param.count = coll_param.count;
+    if (coll_param.ctype == ccl_coll_allgatherv) {
+        selector_param.count = coll_param.send_count;
+    }
+    selector_param.recv_counts =
+        const_cast<size_t*>(reinterpret_cast<const size_t*>(coll_param.recv_counts.data()));
+    selector_param.dtype = coll_param.dtype;
+    selector_param.comm = coll_param.comm;
+    selector_param.stream = coll_param.stream;
+    selector_param.buf =
+        (coll_param.send_buf) ? coll_param.send_buf.get_ptr() : coll_param.recv_buf.get_ptr();
+    selector_param.is_vector_buf = sched->coll_attr.is_vector_buf;
+#ifdef CCL_ENABLE_SYCL
+    selector_param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
+    selector_param.hint_algo = coll_param.hint_algo;
+    selector_param.peer_rank = coll_param.peer_rank;
+    selector_param.is_scaleout = coll_param.is_scaleout;
+
+    if (coll_param.ctype == ccl_coll_allgatherv) {
+        auto algo =
+            ccl::global_data::get().algorithm_selector->get<ccl_coll_allgatherv>(selector_param);
+        return (algo == ccl_coll_allgatherv_ring) && multi_node &&
+               !ccl::global_data::env().ze_multi_workers;
+    }
+    else if (coll_param.ctype == ccl_coll_allreduce) {
+        auto algo =
+            ccl::global_data::get().algorithm_selector->get<ccl_coll_allreduce>(selector_param);
+        // ring allreduce calls ring allgatherv underneath
+        return (algo == ccl_coll_allreduce_ring) && multi_node &&
+               !ccl::global_data::env().ze_multi_workers;
+    }
+    return false;
+}
+
 void add_scaleout(ccl_sched* sched,
                   const ccl_coll_param& in_coll_param,
                   const bool is_single_node,
@@ -491,7 +520,7 @@ void add_scaleout(ccl_sched* sched,
         utils::clear_and_push_back(wait_events, out_event);
     }
 
-    if (!do_h2d_copy)
+    if ((is_copy_overlap_enabled(sched, coll_param, multi_node) && !enable_hmem) || !do_h2d_copy)
         return;
 
     ccl_buffer src_copy_buf = coll_param.recv_buf;
@@ -535,14 +564,6 @@ bool is_queue_in_order(const ccl_stream* s) {
     return s != nullptr && s->is_sycl_device_stream() && s->get_native_stream().is_in_order();
 }
 
-void enable_sycl_output_barrier_in_order_queue(const ccl_stream* s) {
-    LOG_DEBUG("CCL_SYCL_OUTPUT_EVENT: ", ccl::global_data::env().enable_sycl_output_event);
-    if (is_queue_in_order(s)) {
-        ccl::global_data::env().enable_sycl_output_event = 1;
-    }
-    LOG_DEBUG("CCL_SYCL_OUTPUT_EVENT is set to 1");
-}
-
 #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
 
 } // namespace ccl
diff --git a/src/coll/coll_util.hpp b/src/coll/coll_util.hpp
index 86170b4b1..637bb5110 100644
--- a/src/coll/coll_util.hpp
+++ b/src/coll/coll_util.hpp
@@ -64,8 +64,6 @@ void add_scaleout(ccl_sched* sched,
                   int global_root = 0);
 
 bool is_queue_in_order(const ccl_stream* s);
-
-void enable_sycl_output_barrier_in_order_queue(const ccl_stream* s);
 #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
 
 } // namespace ccl
diff --git a/src/coll/selection/selection.cpp b/src/coll/selection/selection.cpp
index f02b7d9d9..9e55fea97 100644
--- a/src/coll/selection/selection.cpp
+++ b/src/coll/selection/selection.cpp
@@ -325,6 +325,9 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) {
     RETURN_FALSE_IF(!param.comm->get_topo_manager().has_p2p_access(),
                     "no p2p access between devices");
 
+    RETURN_FALSE_IF(!param.comm->get_topo_manager().has_all_vertices_connected(),
+                    "no connection between vertices");
+
     RETURN_FALSE_IF(!param.comm->get_topo_manager().has_same_ppn(),
                     "ppn is not the same among the nodes");
 
@@ -374,45 +377,37 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) {
             " is not supported for family1");
     }
 
-    RETURN_FALSE_IF(checkers::is_unknown_device_family(param),
-                    "topo algo is not supported for unknown device family");
+    if (checkers::is_unknown_device_family(param)) {
+        LOG_WARN("Applying topo algorithm, but device family is not recognized");
 #ifndef CCL_BF16_GPU_TRUNCATE
-    RETURN_FALSE_IF(checkers::is_unknown_device_family(param) &&
-                        (param.dtype.idx() == ccl::datatype::bfloat16) &&
-                        (param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_reduce ||
-                         param.ctype == ccl_coll_reduce_scatter),
-                    "bfloat16 reduction is not supported for unknown device family");
+        if (param.dtype.idx() == ccl::datatype::bfloat16 &&
+            (param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_reduce ||
+             param.ctype == ccl_coll_reduce_scatter)) {
+            LOG_WARN("Applying topo algorithm, but bfloat16 reduction may not be "
+                     "supported for unknown device family");
+        }
 #endif // !CCL_BF16_GPU_TRUNCATE
+    }
 #endif // CCL_ENABLE_SYCL
 
-    RETURN_FALSE_IF((((param.ctype == ccl_coll_allreduce) || (param.ctype == ccl_coll_bcast) ||
-                      (param.ctype == ccl_coll_reduce)) &&
-                     ((comm_size < 2) || (local_proc_count == 1))),
-                    "unsupported comm size for ",
-                    ccl_coll_type_to_str(param.ctype));
-
     RETURN_FALSE_IF(param.ctype == ccl_coll_bcast && !checkers::is_single_node(param),
                     "multi-node for ",
                     ccl_coll_type_to_str(param.ctype),
                     " is not supported");
 
-    RETURN_FALSE_IF(((param.ctype == ccl_coll_reduce) && (comm_size % local_proc_count != 0)),
-                    "ppn must be equal");
-
-    RETURN_FALSE_IF(param.ctype == ccl_coll_allgatherv && !checkers::is_single_card(param) &&
-                        comm_size % local_proc_count != 0,
-                    "ppn must be equal");
-
-    RETURN_FALSE_IF((comm_size % 2 != 0), "odd comm_size is not supported");
+    RETURN_FALSE_IF((comm_size % 2 != 0 && comm_size != 1), "odd comm_size is not supported");
 
     const int node_comm_size = param.comm->get_node_comm().get()->size();
-    RETURN_FALSE_IF((node_comm_size % 2 != 0), "odd node_comm_size is not supported");
+
+    RETURN_FALSE_IF((node_comm_size % 2 != 0 && comm_size != 1),
+                    "odd node_comm_size is not supported");
 
     RETURN_FALSE_IF(!checkers::is_single_card(param) && !checkers::is_single_node(param) &&
                         (local_proc_count % 2 != 0),
                     "odd proc count per node is not supported");
 
-    RETURN_FALSE_IF((param.ctype == ccl_coll_reduce) && (param.count < size_t(param.comm->size())),
+    RETURN_FALSE_IF((param.ctype == ccl_coll_reduce) &&
+                        (param.count < size_t(param.comm->size())) && (comm_size != 1),
                     "reduce with count < comm_size not supported");
 
     if (param.ctype == ccl_coll_recv || param.ctype == ccl_coll_send) {
diff --git a/src/comm/comm.cpp b/src/comm/comm.cpp
index d1f737fdb..7579c7bbc 100644
--- a/src/comm/comm.cpp
+++ b/src/comm/comm.cpp
@@ -136,6 +136,11 @@ void ccl_comm::init(int comm_id,
             LOG_INFO("topo_manager:", topo_manager.to_string());
         }
         create_topo_subcomms();
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+        // init of fd manager is based on node comm,
+        // it initializes for every creation of comm in multi comms case
+        init_ipc_exchange_mode(node_comm);
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
     }
     else {
         local2global_map = atl_comm->get_rank2rank_map();
@@ -259,6 +264,32 @@ ccl::comm_interface_ptr ccl_comm::split(const ccl::comm_split_attr& attr) {
     return std::shared_ptr<ccl_comm>(new_comm);
 }
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+void ccl_comm::init_ipc_exchange_mode(std::shared_ptr<ccl_comm> comm) {
+    if (device_ptr && context_ptr) {
+        LOG_DEBUG("initialize ipc_exchange_mode");
+        if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd &&
+            ccl::ze::fd_manager::is_pidfd_supported()) {
+            LOG_DEBUG("pidfd exchange mode is verified successfully");
+        }
+#ifdef CCL_ENABLE_DRM
+        else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) {
+            fd_manager = std::make_shared<ccl::ze::fd_manager>(comm->get_atl_comm());
+            // update physical_idx for each logical device, by default it is invalid
+#ifdef ZE_PCI_PROPERTIES_EXT_NAME
+            auto& devices = ccl::global_data::get().ze_data->devices;
+            for (size_t idx = 0; idx < devices.size(); idx++) {
+                devices[idx].physical_idx = ccl::ze::fd_manager::get_physical_device_idx(
+                    fd_manager->get_physical_devices(), devices[idx].pci);
+            }
+#endif // ZE_PCI_PROPERTIES_EXT_NAME
+            LOG_DEBUG("drmfd exchange mode is verified successfully");
+        }
+#endif // CCL_ENABLE_DRM
+    }
+}
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
 std::string ccl_comm::to_string() const {
     std::stringstream ss;
     ss << "{ rank: " << rank() << ", size: " << size() << ", id: " << id() << " }";
diff --git a/src/comm/comm.hpp b/src/comm/comm.hpp
index a4ba1b512..ad0246293 100644
--- a/src/comm/comm.hpp
+++ b/src/comm/comm.hpp
@@ -38,6 +38,7 @@
 #include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "oneapi/ccl/coll_attr.hpp"
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+#include "common/global/ze/ze_fd_manager.hpp"
 #include "sched/entry/ze/ze_primitives.hpp"
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 #include "types_generator_defines.hpp"
@@ -261,6 +262,17 @@ class alignas(CACHELINE_SIZE) ccl_comm : public ccl::comm_interface {
         }
     }
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    std::shared_ptr<ccl::ze::fd_manager> get_fd_manager() const {
+        if (parent_comm) {
+            return parent_comm->get_fd_manager();
+        }
+        else {
+            return fd_manager;
+        }
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
     std::shared_ptr<ccl_comm_env> get_env() const {
         return env;
     }
@@ -332,8 +344,11 @@ class alignas(CACHELINE_SIZE) ccl_comm : public ccl::comm_interface {
 
     ccl_rank2rank_map local2global_map{};
     ccl::topo_manager topo_manager;
-
     std::shared_ptr<ccl_comm_env> env;
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    std::shared_ptr<ccl::ze::fd_manager> fd_manager;
+    void init_ipc_exchange_mode(std::shared_ptr<ccl_comm> comm);
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
     ccl_sched_id_t next_sched_id_internal;
     ccl_sched_id_t next_sched_id_external;
diff --git a/src/common/api_wrapper/mpi_api_wrapper.hpp b/src/common/api_wrapper/mpi_api_wrapper.hpp
index 6e855f78d..15cb5a546 100644
--- a/src/common/api_wrapper/mpi_api_wrapper.hpp
+++ b/src/common/api_wrapper/mpi_api_wrapper.hpp
@@ -70,6 +70,7 @@ typedef struct mpi_lib_ops {
     decltype(MPI_Op_free) *MPI_Op_free_ptr;
     decltype(MPI_Query_thread) *MPI_Query_thread_ptr;
     decltype(MPI_Reduce) *MPI_Reduce_ptr;
+    decltype(MPI_Reduce_scatter) *MPI_Reduce_scatter_ptr;
     decltype(MPI_Reduce_scatter_block) *MPI_Reduce_scatter_block_ptr;
     decltype(MPI_Test) *MPI_Test_ptr;
     decltype(MPI_Type_commit) *MPI_Type_commit_ptr;
@@ -125,6 +126,7 @@ static std::vector<std::string> mpi_fn_names = {
     "MPI_Op_free",
     "MPI_Query_thread",
     "MPI_Reduce",
+    "MPI_Reduce_scatter",
     "MPI_Reduce_scatter_block",
     "MPI_Test",
     "MPI_Type_commit",
@@ -181,6 +183,7 @@ extern ccl::mpi_lib_ops_t mpi_lib_ops;
 #define MPI_Op_free               ccl::mpi_lib_ops.MPI_Op_free_ptr
 #define MPI_Query_thread          ccl::mpi_lib_ops.MPI_Query_thread_ptr
 #define MPI_Reduce                ccl::mpi_lib_ops.MPI_Reduce_ptr
+#define MPI_Reduce_scatter        ccl::mpi_lib_ops.MPI_Reduce_scatter_ptr
 #define MPI_Reduce_scatter_block  ccl::mpi_lib_ops.MPI_Reduce_scatter_block_ptr
 #define MPI_Test                  ccl::mpi_lib_ops.MPI_Test_ptr
 #define MPI_Type_commit           ccl::mpi_lib_ops.MPI_Type_commit_ptr
diff --git a/src/common/api_wrapper/pmix_api_wrapper.cpp b/src/common/api_wrapper/pmix_api_wrapper.cpp
index 9ce6632f9..f442b8aa2 100644
--- a/src/common/api_wrapper/pmix_api_wrapper.cpp
+++ b/src/common/api_wrapper/pmix_api_wrapper.cpp
@@ -37,7 +37,7 @@ bool get_pmix_local_coord(int *local_proc_idx, int *local_proc_count) {
         return false;
     }
 
-    PMIX_PROC_CONSTRUCT(&proc);
+    PMIx_Proc_construct(&proc);
     memset(proc.nspace, '\0', PMIX_MAX_NSLEN);
     memcpy(proc.nspace, global_proc.nspace, strnlen(global_proc.nspace, PMIX_MAX_NSLEN - 1));
     proc.rank = PMIX_RANK_WILDCARD;
@@ -45,18 +45,23 @@ bool get_pmix_local_coord(int *local_proc_idx, int *local_proc_count) {
     // number of local ranks on node
     if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_LOCAL_SIZE, NULL, 0, &val))) {
         LOG_WARN("PMIx_Get(PMIX_LOCAL_SIZE) failed: ", PMIx_Error_string(rc));
+        PMIx_Proc_destruct(&proc);
         return false;
     }
     *local_proc_count = val->data.uint32;
-    PMIX_VALUE_RELEASE(val);
+    PMIx_Value_destruct(val);
+    pmix_free(val);
 
     // my local rank on node
     if (PMIX_SUCCESS != (rc = PMIx_Get(&global_proc, PMIX_LOCAL_RANK, NULL, 0, &val))) {
         LOG_WARN("PMIx_Get(PMIX_LOCAL_RANK) failed: ", PMIx_Error_string(rc));
+        PMIx_Proc_destruct(&proc);
         return false;
     }
     *local_proc_idx = val->data.uint16;
-    PMIX_VALUE_RELEASE(val);
+    PMIx_Value_destruct(val);
+    pmix_free(val);
+    PMIx_Proc_destruct(&proc);
 
     LOG_DEBUG("get pmix_local_rank/size - local_proc_idx: ",
               *local_proc_idx,
diff --git a/src/common/api_wrapper/pmix_api_wrapper.hpp b/src/common/api_wrapper/pmix_api_wrapper.hpp
index 2a7f9f23c..97f7238d5 100644
--- a/src/common/api_wrapper/pmix_api_wrapper.hpp
+++ b/src/common/api_wrapper/pmix_api_wrapper.hpp
@@ -33,13 +33,14 @@ typedef struct pmix_lib_ops {
     decltype(::PMIx_Get) *PMIx_Get;
     decltype(::PMIx_Finalize) *PMIx_Finalize;
     decltype(::PMIx_Value_destruct) *PMIx_Value_destruct;
+    decltype(::PMIx_Proc_construct) *PMIx_Proc_construct;
+    decltype(::PMIx_Proc_destruct) *PMIx_Proc_destruct;
 } pmix_lib_ops_t;
 
-static std::vector<std::string> pmix_fn_names = { "PMIx_Init",
-                                                  "PMIx_Error_string",
-                                                  "PMIx_Get",
-                                                  "PMIx_Finalize",
-                                                  "PMIx_Value_destruct" };
+static std::vector<std::string> pmix_fn_names = {
+    "PMIx_Init",           "PMIx_Error_string",   "PMIx_Get",          "PMIx_Finalize",
+    "PMIx_Value_destruct", "PMIx_Proc_construct", "PMIx_Proc_destruct"
+};
 
 extern ccl::pmix_lib_ops_t pmix_lib_ops;
 
@@ -48,6 +49,8 @@ extern ccl::pmix_lib_ops_t pmix_lib_ops;
 #define PMIx_Get            ccl::pmix_lib_ops.PMIx_Get
 #define PMIx_Finalize       ccl::pmix_lib_ops.PMIx_Finalize
 #define PMIx_Value_destruct ccl::pmix_lib_ops.PMIx_Value_destruct
+#define PMIx_Proc_construct ccl::pmix_lib_ops.PMIx_Proc_construct
+#define PMIx_Proc_destruct  ccl::pmix_lib_ops.PMIx_Proc_destruct
 
 bool get_pmix_local_coord(int *local_proc_idx, int *local_proc_count);
 #endif // CCL_ENABLE_PMIX
diff --git a/src/common/api_wrapper/ze_api_wrapper.hpp b/src/common/api_wrapper/ze_api_wrapper.hpp
index ccec6bada..881755d2a 100644
--- a/src/common/api_wrapper/ze_api_wrapper.hpp
+++ b/src/common/api_wrapper/ze_api_wrapper.hpp
@@ -89,6 +89,7 @@ typedef struct ze_lib_ops {
     decltype(zeModuleDestroy) *zeModuleDestroy;
     decltype(zeModuleBuildLogGetString) *zeModuleBuildLogGetString;
     decltype(zeModuleBuildLogDestroy) *zeModuleBuildLogDestroy;
+    decltype(zeModuleGetNativeBinary) *zeModuleGetNativeBinary;
     decltype(zeDeviceGetComputeProperties) *zeDeviceGetComputeProperties;
     decltype(zeDeviceGetMemoryAccessProperties) *zeDeviceGetMemoryAccessProperties;
     decltype(zeDeviceGetMemoryProperties) *zeDeviceGetMemoryProperties;
@@ -101,6 +102,11 @@ typedef struct ze_lib_ops {
 #ifdef ZE_PCI_PROPERTIES_EXT_NAME
     decltype(zeDevicePciGetPropertiesExt) *zeDevicePciGetPropertiesExt;
 #endif // ZE_PCI_PROPERTIES_EXT_NAME
+    decltype(zeDriverGetExtensionFunctionAddress) *zeDriverGetExtensionFunctionAddress;
+    decltype(zeFabricVertexGetExp) *zeFabricVertexGetExp;
+    decltype(zeFabricVertexGetSubVerticesExp) *zeFabricVertexGetSubVerticesExp;
+    decltype(zeFabricEdgeGetExp) *zeFabricEdgeGetExp;
+    decltype(zeFabricEdgeGetPropertiesExp) *zeFabricEdgeGetPropertiesExp;
 } ze_lib_ops_t;
 
 static std::vector<std::string> ze_fn_names = {
@@ -165,6 +171,7 @@ static std::vector<std::string> ze_fn_names = {
     "zeModuleDestroy",
     "zeModuleBuildLogGetString",
     "zeModuleBuildLogDestroy",
+    "zeModuleGetNativeBinary",
     "zeDeviceGetComputeProperties",
     "zeDeviceGetMemoryAccessProperties",
     "zeDeviceGetMemoryProperties",
@@ -177,6 +184,11 @@ static std::vector<std::string> ze_fn_names = {
 #ifdef ZE_PCI_PROPERTIES_EXT_NAME
     "zeDevicePciGetPropertiesExt",
 #endif // ZE_PCI_PROPERTIES_EXT_NAME
+    "zeDriverGetExtensionFunctionAddress",
+    "zeFabricVertexGetExp",
+    "zeFabricVertexGetSubVerticesExp",
+    "zeFabricEdgeGetExp",
+    "zeFabricEdgeGetPropertiesExp",
 };
 
 extern ccl::ze_lib_ops_t ze_lib_ops;
@@ -244,6 +256,7 @@ extern ccl::ze_lib_ops_t ze_lib_ops;
 #define zeModuleDestroy                   ccl::ze_lib_ops.zeModuleDestroy
 #define zeModuleBuildLogGetString         ccl::ze_lib_ops.zeModuleBuildLogGetString
 #define zeModuleBuildLogDestroy           ccl::ze_lib_ops.zeModuleBuildLogDestroy
+#define zeModuleGetNativeBinary           ccl::ze_lib_ops.zeModuleGetNativeBinary
 #define zeDeviceGetComputeProperties      ccl::ze_lib_ops.zeDeviceGetComputeProperties
 #define zeDeviceGetMemoryAccessProperties ccl::ze_lib_ops.zeDeviceGetMemoryAccessProperties
 #define zeDeviceGetMemoryProperties       ccl::ze_lib_ops.zeDeviceGetMemoryProperties
@@ -256,6 +269,12 @@ extern ccl::ze_lib_ops_t ze_lib_ops;
 #ifdef ZE_PCI_PROPERTIES_EXT_NAME
 #define zeDevicePciGetPropertiesExt ccl::ze_lib_ops.zeDevicePciGetPropertiesExt
 #endif // ZE_PCI_PROPERTIES_EXT_NAME
+#define zeDriverGetExtensionFunctionAddress ccl::ze_lib_ops.zeDriverGetExtensionFunctionAddress
+#define zeFabricVertexGetExp                ccl::ze_lib_ops.zeFabricVertexGetExp
+#define zeFabricVertexGetSubVerticesExp     ccl::ze_lib_ops.zeFabricVertexGetSubVerticesExp
+#define zeFabricEdgeGetExp                  ccl::ze_lib_ops.zeFabricEdgeGetExp
+#define zeFabricEdgeGetPropertiesExp        ccl::ze_lib_ops.zeFabricEdgeGetPropertiesExp
+
 bool ze_api_init();
 void ze_api_fini();
 
diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp
index dee07cea2..e4cc045e9 100644
--- a/src/common/env/env.cpp
+++ b/src/common/env/env.cpp
@@ -98,6 +98,8 @@ env_data::env_data()
           enable_atl_cache(1),
           enable_sync_coll(0),
           enable_extra_ep(0),
+          enable_run_id_detection(1),
+          enable_run_id_with_ppid(1),
 
           mnic_type(ATL_MNIC_NONE),
           mnic_count(CCL_ENV_SIZET_NOT_SPECIFIED),
@@ -141,12 +143,12 @@ env_data::env_data()
           // monolithic kernel for xelink transfer
           reduce_scatter_monolithic_kernel(0),
           // reduce_scatter pipelined monolithic kernel for xelink + mdfi transfer
-          reduce_scatter_monolithic_pipeline_kernel(0),
+          reduce_scatter_monolithic_pipeline_kernel(1),
           // reduce_scatter fallback implementation using allreduce
           reduce_scatter_fallback_algo(0),
           allgatherv_monolithic_kernel(0), // monolithic kernel for xelink transfer
           allgatherv_monolithic_pipeline_kernel(
-              0), // pipelined monolithic kernel for xelink + mdfi transfer
+              1), // pipelined monolithic kernel for xelink + mdfi transfer
           alltoallv_monolithic_kernel(1),
           alltoallv_monolithic_read_kernel(1),
 
@@ -156,6 +158,27 @@ env_data::env_data()
           allreduce_pipe_chunk_count(0),
           reduce_scatter_pipe_chunk_count(0),
           reduce_pipe_chunk_count(0),
+
+          allreduce_use_tmp_buf(0),
+          allreduce_small_size_threshold(524288),
+          allreduce_medium_size_threshold(16777216),
+
+          reduce_scatter_use_tmp_buf(0),
+          reduce_scatter_small_size_threshold(2097152),
+          reduce_scatter_medium_size_threshold(67108864),
+
+          allgatherv_use_tmp_buf(0),
+          allgatherv_chunk_size(16777216),
+          allgatherv_small_size_threshold(131072),
+          allgatherv_medium_size_threshold(2097152),
+#ifdef ENABLE_DEBUG //TODO: MLSL-2664
+          skip_scheduler(0),
+#else // ENABLE_DEBUG
+          skip_scheduler(0),
+#endif // ENABLE_DEBUG
+
+          use_ccl_barrier(0),
+          use_sycl_barrier(0),
 #endif // CCL_ENABLE_SYCL
 
           allreduce_nreduce_buffering(0),
@@ -181,6 +204,7 @@ env_data::env_data()
           topo_color(topo_color_mode::fixed),
 #endif // CCL_ENABLE_SYCL
           enable_p2p_access(CCL_ENV_INT_NOT_SPECIFIED),
+          enable_fabric_vertex_connection_check(1),
 
 #ifdef CCL_ENABLE_MPI
           mpi_lib_path(),
@@ -190,6 +214,7 @@ env_data::env_data()
 #ifdef CCL_ENABLE_SYCL
           kernel_path(),
           kernel_debug(0),
+          kernel_module_cache(1),
 
           // 32 is more generic constant value
           // for gpus to avoid imbalance issue
@@ -216,6 +241,9 @@ env_data::env_data()
           ze_device_cache_upper_limit(800 * 1024L * 1024L),
           ze_device_cache_num_blocks_in_chunk(1),
           ze_device_cache_policy(ccl::ze::device_cache_policy_mode::chunk),
+          ze_device_mem_disable_clear(1),
+          ze_device_mem_alloc_size(2 * 1024L * 1024L * 1024L),
+          ze_device_mem_enable(1),
 
           // Note: env. vars are required when
           // functionality is completed to support bypass/cache
@@ -233,7 +261,8 @@ env_data::env_data()
           ze_enable_oversubscription_throw(1),
           ze_serialize_mode(0),
           ze_copy_engine(ccl::ze::copy_engine_mode::link),
-          ze_h2d_copy_engine(ccl::ze::h2d_copy_engine_mode::none),
+          ze_h2d_copy_engine(ccl::ze::h2d_copy_engine_mode::main),
+          ze_d2d_copy_engine(ccl::ze::d2d_copy_engine_mode::none),
           ze_max_compute_queues(1),
           ze_max_copy_queues(CCL_ENV_SIZET_NOT_SPECIFIED),
           ze_enable_ccs_fallback_for_copy(1),
@@ -248,7 +277,7 @@ env_data::env_data()
 #ifdef CCL_ENABLE_DRM
           ze_ipc_exchange(ccl::ze::ipc_exchange_mode::drmfd),
 #else // CCL_ENABLE_DRM
-          ze_ipc_exchange(ccl::ze::ipc_exchange_mode::none),
+          ze_ipc_exchange(ccl::ze::ipc_exchange_mode::sockets),
 #endif // CCL_ENABLE_DRM
 #ifdef ZE_PCI_PROPERTIES_EXT_NAME
           ze_drm_bdf_support(1),
@@ -257,6 +286,7 @@ env_data::env_data()
 #endif // ZE_PCI_PROPERTIES_EXT_NAME
           ze_pt2pt_read(1),
           type2_mode(type2_tune_mode::undetected),
+          ze_enable_drmfd_multi_instance(1),
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
@@ -327,6 +357,8 @@ void env_data::parse() {
     env_2_type(CCL_ATL_CACHE, enable_atl_cache);
     env_2_type(CCL_ATL_SYNC_COLL, enable_sync_coll);
     env_2_type(CCL_ATL_EXTRA_EP, enable_extra_ep);
+    env_2_type(CCL_ENABLE_RUN_ID_DETECTION, enable_run_id_detection);
+    env_2_type(CCL_ENABLE_RUN_ID_WITH_PPID, enable_run_id_with_ppid);
 
     env_2_enum(CCL_MNIC, mnic_type_names, mnic_type);
     env_2_type(CCL_MNIC_NAME, mnic_name_raw);
@@ -429,6 +461,23 @@ void env_data::parse() {
     env_2_type(CCL_ALLREDUCE_PIPE_CHUNK_COUNT, allreduce_pipe_chunk_count);
     env_2_type(CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT, reduce_scatter_pipe_chunk_count);
     env_2_type(CCL_REDUCE_PIPE_CHUNK_COUNT, reduce_pipe_chunk_count);
+
+    env_2_type(CCL_ALLREDUCE_USE_TMP_BUF, allreduce_use_tmp_buf);
+    env_2_type(CCL_ALLREDUCE_SMALL_SIZE_THRESHOLD, allreduce_small_size_threshold);
+    env_2_type(CCL_ALLREDUCE_MEDIUM_SIZE_THRESHOLD, allreduce_medium_size_threshold);
+
+    env_2_type(CCL_REDUCE_SCATTER_USE_TMP_BUF, reduce_scatter_use_tmp_buf);
+    env_2_type(CCL_REDUCE_SCATTER_SMALL_SIZE_THRESHOLD, reduce_scatter_small_size_threshold);
+    env_2_type(CCL_REDUCE_SCATTER_MEDIUM_SIZE_THRESHOLD, reduce_scatter_medium_size_threshold);
+
+    env_2_type(CCL_ALLGATHERV_USE_TMP_BUF, allgatherv_use_tmp_buf);
+    env_2_type(CCL_ALLGATHERV_CHUNK_SIZE, allgatherv_chunk_size);
+    env_2_type(CCL_ALLGATHERV_SMALL_SIZE_THRESHOLD, allgatherv_small_size_threshold);
+    env_2_type(CCL_ALLGATHERV_MEDIUM_SIZE_THRESHOLD, allgatherv_medium_size_threshold);
+
+    env_2_type(CCL_SKIP_SCHEDULER, skip_scheduler);
+    env_2_type(CCL_USE_CCL_BARRIER, use_ccl_barrier);
+    env_2_type(CCL_USE_SYCL_BARRIER, use_sycl_barrier);
 #endif // CCL_ENABLE_SYCL
 
     env_2_type(CCL_ALLREDUCE_NREDUCE_BUFFERING, allreduce_nreduce_buffering);
@@ -460,6 +509,7 @@ void env_data::parse() {
     env_2_type(CCL_TOPO_ALGO, enable_topo_algo);
     env_2_topo(CCL_TOPO_COLOR, topo_color_names, topo_color);
     env_2_type(CCL_TOPO_P2P_ACCESS, enable_p2p_access);
+    env_2_type(CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK, enable_fabric_vertex_connection_check);
 
 #ifdef CCL_ENABLE_MPI
     env_2_type(CCL_MPI_LIBRARY_PATH, mpi_lib_path);
@@ -484,6 +534,7 @@ void env_data::parse() {
     }
 
     env_2_type(CCL_KERNEL_DEBUG, kernel_debug);
+    env_2_type(CCL_KERNEL_MODULE_CACHE, kernel_module_cache);
     env_2_type(CCL_KERNEL_GROUP_SIZE, kernel_group_size);
     env_2_type(CCL_KERNEL_GROUP_COUNT, kernel_group_count);
     env_2_type(CCL_KERNEL_MEM_ALIGN, kernel_mem_align);
@@ -506,6 +557,9 @@ void env_data::parse() {
     env_2_type(CCL_ZE_DEVICE_CACHE_UPPER_LIMIT, ze_device_cache_upper_limit);
     env_2_type(CCL_ZE_DEVICE_CACHE_NUM_BLOCKS_IN_CHUNK, ze_device_cache_num_blocks_in_chunk);
     env_2_enum(CCL_ZE_DEVICE_CACHE_POLICY, ccl::ze::device_cache_policy_names, ze_device_cache_policy);
+    env_2_type(CCL_ZE_DEVICE_MEM_DISABLE_CLEAR, ze_device_mem_disable_clear);
+    env_2_type(CCL_ZE_DEVICE_MEM_ALLOC_SIZE, ze_device_mem_alloc_size);
+    env_2_type(CCL_ZE_DEVICE_MEM_ENABLE, ze_device_mem_enable);
     env_2_type(CCL_ZE_CACHE_OPEN_IPC_HANDLES, enable_ze_cache_open_ipc_handles);
     env_2_type(CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD, ze_cache_open_ipc_handles_threshold);
     if (enable_ze_cache == 0) {
@@ -527,6 +581,7 @@ void env_data::parse() {
     env_2_type(CCL_ZE_SERIALIZE, ze_serialize_mode);
     env_2_enum(CCL_ZE_COPY_ENGINE, ccl::ze::copy_engine_names, ze_copy_engine);
     env_2_enum(CCL_ZE_H2D_COPY_ENGINE, ccl::ze::h2d_copy_engine_names, ze_h2d_copy_engine);
+    env_2_enum(CCL_ZE_D2D_COPY_ENGINE, ccl::ze::d2d_copy_engine_names, ze_d2d_copy_engine);
     env_2_type(CCL_ZE_MAX_COMPUTE_QUEUES, ze_max_compute_queues);
     CCL_THROW_IF_NOT(
         ze_max_compute_queues == CCL_ENV_SIZET_NOT_SPECIFIED || ze_max_compute_queues > 0,
@@ -560,6 +615,7 @@ void env_data::parse() {
     env_2_type(CCL_ZE_DRM_BDF_SUPPORT, ze_drm_bdf_support);
     env_2_type(CCL_ZE_PT2PT_READ, ze_pt2pt_read);
     env_2_enum(CCL_ZE_TYPE2_TUNE_PORTS, type2_tune_mode_names, type2_mode);
+    env_2_type(CCL_ZE_ENABLE_DRMFD_MULTI_INSTANCE, ze_enable_drmfd_multi_instance);
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
@@ -663,6 +719,8 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_ATL_CACHE, ": ", enable_atl_cache);
     LOG_DEBUG(CCL_ATL_SYNC_COLL, ": ", enable_sync_coll);
     LOG_DEBUG(CCL_ATL_EXTRA_EP, ": ", enable_extra_ep);
+    LOG_INFO(CCL_ENABLE_RUN_ID_DETECTION, ": ", enable_run_id_detection);
+    LOG_INFO(CCL_ENABLE_RUN_ID_WITH_PPID, ": ", enable_run_id_with_ppid);
 
     LOG_INFO(CCL_MNIC, ": ", str_by_enum(mnic_type_names, mnic_type));
     LOG_INFO(
@@ -775,6 +833,23 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_ALLREDUCE_PIPE_CHUNK_COUNT, ": ", allreduce_pipe_chunk_count);
     LOG_INFO(CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT, ": ", reduce_scatter_pipe_chunk_count);
     LOG_INFO(CCL_REDUCE_PIPE_CHUNK_COUNT, ": ", reduce_pipe_chunk_count);
+
+    LOG_INFO(CCL_ALLREDUCE_USE_TMP_BUF, ": ", allreduce_use_tmp_buf);
+    LOG_INFO(CCL_ALLREDUCE_SMALL_SIZE_THRESHOLD, ": ", allreduce_small_size_threshold);
+    LOG_INFO(CCL_ALLREDUCE_MEDIUM_SIZE_THRESHOLD, ": ", allreduce_medium_size_threshold);
+
+    LOG_INFO(CCL_REDUCE_SCATTER_USE_TMP_BUF, ": ", reduce_scatter_use_tmp_buf);
+    LOG_INFO(CCL_REDUCE_SCATTER_SMALL_SIZE_THRESHOLD, ": ", reduce_scatter_small_size_threshold);
+    LOG_INFO(CCL_REDUCE_SCATTER_MEDIUM_SIZE_THRESHOLD, ": ", reduce_scatter_medium_size_threshold);
+
+    LOG_INFO(CCL_ALLGATHERV_USE_TMP_BUF, ": ", allgatherv_use_tmp_buf);
+    LOG_INFO(CCL_ALLGATHERV_CHUNK_SIZE, ": ", allgatherv_chunk_size);
+    LOG_INFO(CCL_ALLGATHERV_SMALL_SIZE_THRESHOLD, ": ", allgatherv_small_size_threshold);
+    LOG_INFO(CCL_ALLGATHERV_MEDIUM_SIZE_THRESHOLD, ": ", allgatherv_medium_size_threshold);
+
+    LOG_INFO(CCL_SKIP_SCHEDULER, ": ", skip_scheduler);
+    LOG_INFO(CCL_USE_CCL_BARRIER, ": ", use_ccl_barrier);
+    LOG_INFO(CCL_USE_SYCL_BARRIER, ": ", use_sycl_barrier);
 #endif // CCL_ENABLE_SYCL
 
     LOG_INFO(CCL_ALLREDUCE_NREDUCE_BUFFERING, ": ", allreduce_nreduce_buffering);
@@ -824,6 +899,7 @@ void env_data::print(int rank) {
              (enable_p2p_access != CCL_ENV_INT_NOT_SPECIFIED) ? std::to_string(enable_p2p_access)
                                                               : CCL_ENV_STR_NOT_SPECIFIED);
 
+    LOG_INFO(CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK, ": ", enable_fabric_vertex_connection_check);
     LOG_INFO(
         CCL_KERNEL_PATH, ": ", (!kernel_path.empty()) ? kernel_path : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(CCL_KERNEL_DEBUG, ": ", kernel_debug);
@@ -852,6 +928,9 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_ZE_DEVICE_CACHE_UPPER_LIMIT, ": ", ze_device_cache_upper_limit);
     LOG_INFO(CCL_ZE_DEVICE_CACHE_NUM_BLOCKS_IN_CHUNK, ": ", ze_device_cache_num_blocks_in_chunk);
     LOG_INFO(CCL_ZE_DEVICE_CACHE_POLICY, ": ", str_by_enum(ccl::ze::device_cache_policy_names, ze_device_cache_policy));
+    LOG_INFO(CCL_ZE_DEVICE_MEM_DISABLE_CLEAR, ": ", ze_device_mem_disable_clear);
+    LOG_INFO(CCL_ZE_DEVICE_MEM_ALLOC_SIZE, ": ", ze_device_mem_alloc_size);
+    LOG_INFO(CCL_ZE_DEVICE_MEM_ENABLE, ": ", ze_device_mem_enable);
     LOG_INFO(CCL_ZE_CACHE_OPEN_IPC_HANDLES, ": ", enable_ze_cache_open_ipc_handles);
     LOG_INFO(CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD, ": ", ze_cache_open_ipc_handles_threshold);
     LOG_INFO(CCL_ZE_CACHE_GET_IPC_HANDLES, ": ", enable_ze_cache_get_ipc_handles);
@@ -865,6 +944,9 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_ZE_H2D_COPY_ENGINE,
              ": ",
              str_by_enum(ccl::ze::h2d_copy_engine_names, ze_h2d_copy_engine));
+    LOG_INFO(CCL_ZE_D2D_COPY_ENGINE,
+             ": ",
+             str_by_enum(ccl::ze::d2d_copy_engine_names, ze_d2d_copy_engine));
     LOG_INFO(CCL_ZE_MAX_COMPUTE_QUEUES,
              ": ",
              (ze_max_compute_queues != CCL_ENV_SIZET_NOT_SPECIFIED)
@@ -890,6 +972,7 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_ZE_DRM_BDF_SUPPORT, ": ", ze_drm_bdf_support);
     LOG_INFO(CCL_ZE_PT2PT_READ, ": ", ze_pt2pt_read);
     LOG_INFO(CCL_ZE_TYPE2_TUNE_PORTS, ": ", str_by_enum(type2_tune_mode_names, type2_mode));
+    LOG_INFO(CCL_ZE_ENABLE_DRMFD_MULTI_INSTANCE, ": ", ze_enable_drmfd_multi_instance);
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp
index f9d25d07b..1619013e4 100644
--- a/src/common/env/env.hpp
+++ b/src/common/env/env.hpp
@@ -114,6 +114,8 @@ class env_data {
     int enable_atl_cache;
     int enable_sync_coll;
     int enable_extra_ep;
+    int enable_run_id_detection;
+    int enable_run_id_with_ppid;
 
     atl_mnic_t mnic_type;
     std::string mnic_name_raw;
@@ -190,6 +192,23 @@ class env_data {
     size_t allreduce_pipe_chunk_count;
     size_t reduce_scatter_pipe_chunk_count;
     size_t reduce_pipe_chunk_count;
+
+    int allreduce_use_tmp_buf;
+    size_t allreduce_small_size_threshold;
+    size_t allreduce_medium_size_threshold;
+
+    int reduce_scatter_use_tmp_buf;
+    size_t reduce_scatter_small_size_threshold;
+    size_t reduce_scatter_medium_size_threshold;
+
+    size_t allgatherv_use_tmp_buf;
+    size_t allgatherv_chunk_size;
+    size_t allgatherv_small_size_threshold;
+    size_t allgatherv_medium_size_threshold;
+
+    int skip_scheduler;
+    int use_ccl_barrier;
+    int use_sycl_barrier;
 #endif // CCL_ENABLE_SYCL
 
     int allreduce_nreduce_buffering;
@@ -210,6 +229,7 @@ class env_data {
     int enable_topo_algo;
     topo_color_mode topo_color;
     int enable_p2p_access;
+    int enable_fabric_vertex_connection_check;
 
 #ifdef CCL_ENABLE_MPI
     std::string mpi_lib_path;
@@ -219,6 +239,7 @@ class env_data {
 #ifdef CCL_ENABLE_SYCL
     std::string kernel_path;
     int kernel_debug;
+    int kernel_module_cache;
     ssize_t kernel_group_size;
     ssize_t kernel_group_count;
     ssize_t kernel_mem_align;
@@ -241,6 +262,9 @@ class env_data {
     long ze_device_cache_upper_limit;
     int ze_device_cache_num_blocks_in_chunk;
     ccl::ze::device_cache_policy_mode ze_device_cache_policy;
+    int ze_device_mem_disable_clear;
+    long ze_device_mem_alloc_size;
+    size_t ze_device_mem_enable;
     int enable_ze_cache_cmdlists;
     int enable_ze_cache_cmdqueues;
     int enable_ze_cache_event_pools;
@@ -255,6 +279,7 @@ class env_data {
     int ze_serialize_mode;
     ccl::ze::copy_engine_mode ze_copy_engine;
     ccl::ze::h2d_copy_engine_mode ze_h2d_copy_engine;
+    ccl::ze::d2d_copy_engine_mode ze_d2d_copy_engine;
     ssize_t ze_max_compute_queues;
     ssize_t ze_max_copy_queues;
     int ze_enable_ccs_fallback_for_copy;
@@ -270,6 +295,7 @@ class env_data {
     int ze_drm_bdf_support;
     int ze_pt2pt_read;
     type2_tune_mode type2_mode;
+    int ze_enable_drmfd_multi_instance;
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
diff --git a/src/common/env/vars.hpp b/src/common/env/vars.hpp
index d00f19ddf..35d47fcda 100644
--- a/src/common/env/vars.hpp
+++ b/src/common/env/vars.hpp
@@ -151,6 +151,8 @@ constexpr const char* CCL_ATL_SEND_PROXY = "CCL_ATL_SEND_PROXY";
 constexpr const char* CCL_ATL_SYNC_COLL = "CCL_ATL_SYNC_COLL";
 constexpr const char* CCL_ATL_EXTRA_EP = "CCL_ATL_EXTRA_EP";
 constexpr const char* CCL_ATL_CACHE = "CCL_ATL_CACHE";
+constexpr const char* CCL_ENABLE_RUN_ID_DETECTION = "CCL_ENABLE_RUN_ID_DETECTION";
+constexpr const char* CCL_ENABLE_RUN_ID_WITH_PPID = "CCL_ENABLE_RUN_ID_WITH_PPID";
 
 constexpr const char* CCL_MNIC = "CCL_MNIC";
 constexpr const char* CCL_MNIC_NAME = "CCL_MNIC_NAME";
@@ -733,6 +735,7 @@ constexpr const char* CCL_ALLTOALL_SCATTER_MAX_OPS = "CCL_ALLTOALL_SCATTER_MAX_O
 constexpr const char* CCL_BACKEND = "CCL_BACKEND";
 
 constexpr const char* CCL_KERNEL_PATH = "CCL_KERNEL_PATH";
+constexpr const char* CCL_KERNEL_MODULE_CACHE = "CCL_KERNEL_MODULE_CACHE";
 constexpr const char* CCL_KERNEL_DEBUG = "CCL_KERNEL_DEBUG";
 constexpr const char* CCL_KERNEL_GROUP_SIZE = "CCL_KERNEL_GROUP_SIZE";
 constexpr const char* CCL_KERNEL_GROUP_COUNT = "CCL_KERNEL_GROUP_COUNT";
@@ -822,6 +825,7 @@ constexpr const char* CCL_PROCESS_LAUNCHER = "CCL_PROCESS_LAUNCHER";
 constexpr const char* CCL_TOPO_ALGO = "CCL_TOPO_ALGO";
 constexpr const char* CCL_TOPO_COLOR = "CCL_TOPO_COLOR";
 constexpr const char* CCL_TOPO_P2P_ACCESS = "CCL_TOPO_P2P_ACCESS";
+constexpr const char* CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK = "CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK";
 
 #ifdef CCL_ENABLE_MPI
 constexpr const char* CCL_MPI_LIBRARY_PATH = "CCL_MPI_LIBRARY_PATH";
@@ -839,6 +843,9 @@ constexpr const char* CCL_ZE_DEVICE_CACHE_EVICT_SMALLEST = "CCL_ZE_DEVICE_CACHE_
 constexpr const char* CCL_ZE_DEVICE_CACHE_UPPER_LIMIT = "CCL_ZE_DEVICE_CACHE_UPPER_LIMIT";
 constexpr const char* CCL_ZE_DEVICE_CACHE_NUM_BLOCKS_IN_CHUNK = "CCL_ZE_DEVICE_CACHE_NUM_BLOCKS_IN_CHUNK";
 constexpr const char* CCL_ZE_DEVICE_CACHE_POLICY = "CCL_ZE_DEVICE_CACHE_POLICY";
+constexpr const char* CCL_ZE_DEVICE_MEM_DISABLE_CLEAR = "CCL_ZE_DEVICE_MEM_DISABLE_CLEAR";
+constexpr const char* CCL_ZE_DEVICE_MEM_ALLOC_SIZE = "CCL_ZE_DEVICE_MEM_ALLOC_SIZE";
+constexpr const char* CCL_ZE_DEVICE_MEM_ENABLE = "CCL_ZE_DEVICE_MEM_ENABLE";
 constexpr const char* CCL_ZE_CACHE_OPEN_IPC_HANDLES = "CCL_ZE_CACHE_OPEN_IPC_HANDLES";
 constexpr const char* CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD = "CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD";
 constexpr const char* CCL_ZE_CACHE_GET_IPC_HANDLES = "CCL_ZE_CACHE_GET_IPC_HANDLES";
@@ -874,6 +881,7 @@ constexpr const char* CCL_ZE_SERIALIZE = "CCL_ZE_SERIALIZE";
 
 constexpr const char* CCL_ZE_COPY_ENGINE = "CCL_ZE_COPY_ENGINE";
 constexpr const char* CCL_ZE_H2D_COPY_ENGINE = "CCL_ZE_H2D_COPY_ENGINE";
+constexpr const char* CCL_ZE_D2D_COPY_ENGINE = "CCL_ZE_D2D_COPY_ENGINE";
 constexpr const char* CCL_ZE_MAX_COMPUTE_QUEUES = "CCL_ZE_MAX_COMPUTE_QUEUES";
 constexpr const char* CCL_ZE_MAX_COPY_QUEUES = "CCL_ZE_MAX_COPY_QUEUES";
 // use CCS for intra-card copy if main CE is not available
@@ -889,6 +897,7 @@ constexpr const char* CCL_ZE_LIBRARY_PATH = "CCL_ZE_LIBRARY_PATH";
 constexpr const char* CCL_ZE_ENABLE = "CCL_ZE_ENABLE";
 constexpr const char* CCL_ZE_FINI_WA = "CCL_ZE_FINI_WA";
 constexpr const char* CCL_ZE_MULTI_WORKERS = "CCL_ZE_MULTI_WORKERS";
+constexpr const char* CCL_ZE_ENABLE_DRMFD_MULTI_INSTANCE = "CCL_ZE_ENABLE_DRMFD_MULTI_INSTANCE";
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
diff --git a/src/common/env/vars_experimental.hpp b/src/common/env/vars_experimental.hpp
index 811fac8a6..facd69ec4 100644
--- a/src/common/env/vars_experimental.hpp
+++ b/src/common/env/vars_experimental.hpp
@@ -170,4 +170,21 @@ constexpr const char* CCL_BARRIER_SYNC = "CCL_BARRIER_SYNC";
 /** @} */
 /** @} */
 
+constexpr const char* CCL_ALLREDUCE_USE_TMP_BUF = "CCL_ALLREDUCE_USE_TMP_BUF";
+constexpr const char* CCL_ALLREDUCE_SMALL_SIZE_THRESHOLD = "CCL_ALLREDUCE_SMALL_SIZE_THRESHOLD";
+constexpr const char* CCL_ALLREDUCE_MEDIUM_SIZE_THRESHOLD = "CCL_ALLREDUCE_MEDIUM_SIZE_THRESHOLD";
+
+constexpr const char* CCL_REDUCE_SCATTER_USE_TMP_BUF = "CCL_REDUCE_SCATTER_USE_TMP_BUF";
+constexpr const char* CCL_REDUCE_SCATTER_SMALL_SIZE_THRESHOLD = "CCL_REDUCE_SCATTER_SMALL_SIZE_THRESHOLD";
+constexpr const char* CCL_REDUCE_SCATTER_MEDIUM_SIZE_THRESHOLD = "CCL_REDUCE_SCATTER_MEDIUM_SIZE_THRESHOLD";
+
+constexpr const char* CCL_ALLGATHERV_USE_TMP_BUF = "CCL_ALLGATHERV_USE_TMP_BUF";
+constexpr const char* CCL_ALLGATHERV_CHUNK_SIZE = "CCL_ALLGATHERV_CHUNK_SIZE";
+constexpr const char* CCL_ALLGATHERV_SMALL_SIZE_THRESHOLD = "CCL_ALLGATHERV_SMALL_SIZE_THRESHOLD";
+constexpr const char* CCL_ALLGATHERV_MEDIUM_SIZE_THRESHOLD = "CCL_ALLGATHERV_MEDIUM_SIZE_THRESHOLD";
+
+constexpr const char* CCL_SKIP_SCHEDULER = "CCL_SKIP_SCHEDULER";
+constexpr const char* CCL_USE_CCL_BARRIER = "CCL_USE_CCL_BARRIER";
+constexpr const char* CCL_USE_SYCL_BARRIER = "CCL_USE_SYCL_BARRIER";
+
 #endif // CCL_ENABLE_SYCL
diff --git a/src/common/event/impls/host_event.cpp b/src/common/event/impls/host_event.cpp
index aac4780f2..95e7daa95 100644
--- a/src/common/event/impls/host_event.cpp
+++ b/src/common/event/impls/host_event.cpp
@@ -39,7 +39,7 @@ host_event_impl::host_event_impl(ccl_request* r) : req(r) {
     }
 #endif // CCL_ENABLE_ZE
 #endif // CCL_ENABLE_SYCL
-    if (req->synchronous) {
+    if (req->get_sched()->coll_attr.synchronous) {
         if (!ccl::global_data::get().executor.get()->is_locked) {
             ccl_release_request(req);
         }
@@ -56,8 +56,7 @@ host_event_impl::~host_event_impl() {
     // event which always complete, this way LOG_ERROR is never called
     if (!completed
 #ifdef CCL_ENABLE_SYCL
-        && (ccl::global_data::env().enable_sycl_output_event &&
-            !utils::is_sycl_event_completed(get_native()))) {
+        && (native_event && !utils::is_sycl_event_completed(*native_event))) {
         LOG_WARN("not completed event is destroyed");
     }
 #else // CCL_ENABLE_SYCL
@@ -101,8 +100,9 @@ void host_event_impl::wait() {
         }
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
         //TODO call native_event->wait() both for out-of-order and in-order queues (MLSL-2374)
-        if (native_event && ccl::is_queue_in_order(stream))
+        if (native_event && ccl::is_queue_in_order(stream)) {
             native_event->wait();
+        }
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         completed = true;
     }
diff --git a/src/common/global/global.cpp b/src/common/global/global.cpp
index bafd780f2..edd381762 100644
--- a/src/common/global/global.cpp
+++ b/src/common/global/global.cpp
@@ -160,9 +160,10 @@ void global_data::getenv_local_coord(const char* local_proc_idx_env_name,
         LOG_WARN("could not get local_idx/count from environment variables, "
                  "trying to get them from ATL");
 #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
-        CCL_THROW_IF_NOT(
-            global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets,
-            "to get local_idx/count from ATL, set CCL_ZE_IPC_EXCHANGE=sockets explicitly");
+        LOG_WARN("fallback to 'sockets' mode of ze exchange mechanism, to use "
+                 "CCL_ZE_IPC_EXCHANGE=drmfd, set CCL_LOCAL_RANK/SIZE explicitly "
+                 "or use process launcher");
+        global_data::env().ze_ipc_exchange = ccl::ze::ipc_exchange_mode::sockets;
 #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
         local_proc_idx = CCL_ENV_INT_NOT_SPECIFIED;
         local_proc_count = CCL_ENV_INT_NOT_SPECIFIED;
@@ -195,9 +196,10 @@ void global_data::set_local_coord() {
                 LOG_WARN("could not get local_idx/count from environment variables, "
                          "trying to get them from ATL");
 #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
-                CCL_THROW_IF_NOT(
-                    global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets,
-                    "to get local_idx/count from ATL, set CCL_ZE_IPC_EXCHANGE=sockets explicitly");
+                LOG_WARN("fallback to 'sockets' mode of ze exchange mechanism, to use "
+                         "CCL_ZE_IPC_EXCHANGE=drmfd, set CCL_LOCAL_RANK/SIZE explicitly "
+                         "or use process launcher");
+                env.ze_ipc_exchange = ccl::ze::ipc_exchange_mode::sockets;
 #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
             }
             else {
diff --git a/src/common/global/global.hpp b/src/common/global/global.hpp
index c9e9de517..be7300df5 100644
--- a/src/common/global/global.hpp
+++ b/src/common/global/global.hpp
@@ -104,6 +104,10 @@ class global_data {
         local_proc_count = local_count;
     }
 
+    std::string get_local_run_id() const {
+        return local_run_id;
+    }
+
 private:
     global_data();
 
@@ -112,6 +116,7 @@ class global_data {
 
     int local_proc_idx{ ccl_comm::invalid_rank };
     int local_proc_count{ ccl::utils::invalid_err_code };
+    std::string local_run_id{ "" };
     void getenv_local_coord(const char* local_proc_idx_env_name,
                             const char* local_proc_count_env_name);
     void set_local_coord();
diff --git a/src/common/global/ze/ze_data.cpp b/src/common/global/ze/ze_data.cpp
index 20d91fc9d..8e6ae98e5 100644
--- a/src/common/global/ze/ze_data.cpp
+++ b/src/common/global/ze/ze_data.cpp
@@ -85,11 +85,10 @@ global_data_desc::global_data_desc() {
     LOG_DEBUG("found devices: ", devices.size());
 
     cache = std::make_unique<ze::cache>(global_data::env().worker_count);
+    dev_memory_manager = std::make_unique<ze::device_memory_manager>();
 
     topo_manager::detect_tune_port_count(devices);
 
-    init_ipc_exchange_mode();
-
     LOG_INFO("initialized level-zero");
 }
 
@@ -113,41 +112,5 @@ global_data_desc::~global_data_desc() {
     LOG_INFO("finalized level-zero");
 }
 
-void global_data_desc::init_ipc_exchange_mode() {
-    if (global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd &&
-        ze::fd_manager::is_pidfd_supported()) {
-        LOG_DEBUG("pidfd exchange mode is verified successfully");
-    }
-#ifdef CCL_ENABLE_DRM
-    else if (global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) {
-        fd_manager = std::make_unique<ze::fd_manager>();
-        // update physical_idx for each logical device, by default it is invalid
-#ifdef ZE_PCI_PROPERTIES_EXT_NAME
-        for (size_t idx = 0; idx < devices.size(); idx++) {
-            devices[idx].physical_idx = ccl::ze::fd_manager::get_physical_device_idx(
-                fd_manager->get_physical_devices(), devices[idx].pci);
-        }
-#endif // ZE_PCI_PROPERTIES_EXT_NAME
-        LOG_DEBUG("drmfd exchange mode is verified successfully");
-    }
-#endif // CCL_ENABLE_DRM
-    else if (global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::none) {
-        LOG_WARN("CCL_ZE_IPC_EXCHANGE is set to none."
-                 " It will fail with GPU buffers and topo algorithms");
-    }
-    else if (global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets) {
-        // TODO: remove ipc_exchange_mode::none, and warning when MLSL-2078 is done
-        LOG_WARN("sockets exchange mode is set. It may cause"
-                 " potential problem of 'Too many open file descriptors'");
-    }
-    else {
-        // we must use std::cerr to see the error message because
-        // comm_selector.cpp:57 create_comm_impl: EXCEPTION: ze_data was not initialized
-        // has higher priority of printing the its error message
-        std::cerr << "ERROR:  unexpected ipc exchange mode" << std::endl;
-        throw std::runtime_error(std::string(__FUNCTION__));
-    }
-}
-
 } // namespace ze
 } // namespace ccl
diff --git a/src/common/global/ze/ze_data.hpp b/src/common/global/ze/ze_data.hpp
index e8d98e7fe..2e01c6bd2 100644
--- a/src/common/global/ze/ze_data.hpp
+++ b/src/common/global/ze/ze_data.hpp
@@ -45,21 +45,17 @@ class global_data_desc {
     std::vector<ze_context_handle_t> contexts;
     std::vector<device_info> devices;
     std::unique_ptr<ze::cache> cache;
+    std::unique_ptr<ze::device_memory_manager> dev_memory_manager;
     std::unordered_map<ze_context_handle_t, ccl::ze::dynamic_event_pool> dynamic_event_pools;
 
     std::atomic<size_t> kernel_counter{};
 
-    std::unique_ptr<ze::fd_manager> fd_manager;
-
     global_data_desc();
     global_data_desc(const global_data_desc&) = delete;
     global_data_desc(global_data_desc&&) = delete;
     global_data_desc& operator=(const global_data_desc&) = delete;
     global_data_desc& operator=(global_data_desc&&) = delete;
     ~global_data_desc();
-
-private:
-    void init_ipc_exchange_mode();
 };
 
 } // namespace ze
diff --git a/src/common/global/ze/ze_fd_manager.cpp b/src/common/global/ze/ze_fd_manager.cpp
index a6cd24a9c..57dcb6a25 100644
--- a/src/common/global/ze/ze_fd_manager.cpp
+++ b/src/common/global/ze/ze_fd_manager.cpp
@@ -45,7 +45,8 @@
 namespace ccl {
 namespace ze {
 
-fd_manager::fd_manager() {
+fd_manager::fd_manager(std::shared_ptr<atl_base_comm> comm) : comm(comm) {
+    CCL_THROW_IF_NOT(comm, "no comm in fd_manager init");
     device_fds = init_device_fds();
     exchange_device_fds();
     LOG_DEBUG("init completed");
@@ -55,7 +56,7 @@ fd_manager::~fd_manager() {
     all_socks.clear();
     all_pids.clear();
     for (auto fd : device_fds) {
-        close(fd);
+        ccl::utils::close_fd(fd);
     }
     device_fds.clear();
     device_bdfs.clear();
@@ -88,68 +89,12 @@ bool fd_manager::is_pidfd_supported() {
     check_fd(dupfd);
 
     for (auto &fd : fds) {
-        close(fd);
+        ccl::utils::close_fd(fd);
     }
     unlink(filename);
     return result;
 }
 
-void fd_manager::barrier(void *mem) {
-    static int call_count = 1;
-
-    int local_count = ccl::global_data::get().get_local_proc_count();
-    std::atomic<int> *barrier_counter = static_cast<std::atomic<int> *>(mem);
-    CCL_THROW_IF_NOT(barrier_counter == mem,
-                     "barrier_counter: ",
-                     barrier_counter,
-                     " and mem:",
-                     mem,
-                     " must be the same");
-
-    ++(*barrier_counter);
-    LOG_DEBUG("barrier_counter: ", *barrier_counter);
-
-    while ((*barrier_counter) < (call_count * local_count)) {
-        ccl_yield(ccl::global_data::env().yield_type);
-    }
-    call_count++;
-}
-
-std::string fd_manager::get_shm_filename() {
-    std::string filename = "/dev/shm/ccl-shm-file";
-    uid_t uid = getuid();
-    std::stringstream ss;
-    ss << filename << "-" << std::to_string(uid);
-    return ss.str();
-}
-
-void *fd_manager::create_shared_memory() {
-    int local_count = ccl::global_data::get().get_local_proc_count();
-    auto length = size_per_proc * local_count + counter_offset;
-    int prot = PROT_READ | PROT_WRITE;
-    int flags = MAP_SHARED;
-
-    auto shm_filename = get_shm_filename();
-    int fd = open(shm_filename.c_str(), O_CREAT | O_RDWR, 0666);
-    CCL_THROW_IF_NOT(fd > 0, "open failed: fd: ", fd, ", errno: ", strerror(errno));
-    int ret = ftruncate(fd, length);
-    CCL_THROW_IF_NOT(ret != ccl::utils::invalid_err_code,
-                     "ioctl failed: ret: ",
-                     ret,
-                     ", errno: ",
-                     strerror(errno));
-
-    void *mem = mmap(nullptr, length, prot, flags, fd, 0);
-    CCL_THROW_IF_NOT(mem != MAP_FAILED, "mmap failed: mem: ", mem, ", errno: ", strerror(errno));
-
-    LOG_DEBUG("shm_filename: ", shm_filename, ", mem: ", mem, ", fd: ", fd);
-    barrier(mem);
-
-    close(fd);
-    unlink(shm_filename.c_str());
-    return mem;
-}
-
 // get functions impl
 std::vector<int> fd_manager::get_device_fds() {
     return device_fds;
@@ -469,31 +414,31 @@ int fd_manager::mem_handle_to_fd(int convert_from_fd, int handle) {
     return fd;
 }
 
-std::vector<int> fd_manager::setup_device_fds(int local_count,
-                                              int proc_idx,
+std::vector<int> fd_manager::setup_device_fds(int comm_size,
+                                              int rank_idx,
                                               std::vector<bdf_info> &return_bdf) {
     std::vector<int> fds;
     std::vector<bdf_info> bdf_data;
     // bdf_info info;
-    if (proc_idx == 0) {
+    if (rank_idx == 0) {
         fds = device_fds;
         return_bdf = device_bdfs;
         // send the fds to all other local processes
-        for (int p_idx = 1; p_idx < local_count; p_idx++) {
+        for (int p_idx = 1; p_idx < comm_size; p_idx++) {
             for (auto &fd : fds) {
                 ccl::utils::sendmsg_call(
                     all_socks[p_idx],
                     fd,
                     device_bdfs.empty() ? nullptr : device_bdfs.data(),
                     device_bdfs.empty() ? 0 : device_bdfs.size() * sizeof(bdf_info),
-                    proc_idx);
+                    rank_idx);
             }
         }
     }
     else {
         // receive the fds from local process 0
         for (auto fd : device_fds) {
-            close(fd);
+            ccl::utils::close_fd(fd);
         }
         fds.resize(device_fds.size());
         for (auto &fd : fds) {
@@ -502,7 +447,7 @@ std::vector<int> fd_manager::setup_device_fds(int local_count,
                                      &fd,
                                      bdf_data.empty() ? nullptr : bdf_data.data(),
                                      bdf_data.empty() ? 0 : bdf_data.size() * sizeof(bdf_info),
-                                     proc_idx);
+                                     rank_idx);
             return_bdf = bdf_data;
         }
     }
@@ -516,43 +461,37 @@ void fd_manager::exchange_device_fds() {
     memset(&sockaddr, 0, sizeof(sockaddr));
     unsigned int sockaddr_len = sizeof(sockaddr);
     int enable = 1;
+    int ack = 1;
 
-    int local_count = ccl::global_data::get().get_local_proc_count();
-    int local_idx = ccl::global_data::get().get_local_proc_idx();
+    int comm_size = comm->get_size();
+    int rank_idx = comm->get_rank();
 
-    auto length = size_per_proc * local_count + counter_offset;
-
-    all_pids.resize(local_count, ccl::utils::invalid_pid);
-    all_socks.resize(local_count, ccl::utils::invalid_fd);
+    std::vector<int> all_ints(comm_size, ccl::utils::invalid_fd);
+    all_socks.resize(comm_size, ccl::utils::invalid_fd);
+    all_pids.resize(comm_size, ccl::utils::invalid_pid);
 
     pid_t pid = getpid();
+    ccl::utils::allgather(comm, &pid, all_pids.data(), sizeof(pid_t));
 
-    // send own pid to all processes via shm
-    void *mem = create_shared_memory();
-    void *shmem = (char *)mem + counter_offset;
-
-    ((pid_t *)shmem)[local_idx] = pid;
-
-    barrier(mem);
-
-    for (int i = 0; i < local_count; i++) {
-        all_pids[i] = ((pid_t *)shmem)[i];
-    }
     CCL_THROW_IF_NOT(!all_pids.empty(), "all_pids shouldn't be empty");
     LOG_DEBUG("pid exchange is done: ", all_pids.size());
 
-    // create a named socket between local_idx
+    // create a named socket between rank_idx
     // 0 and all other local processes
-    if (local_idx == 0) {
-        barrier(mem);
-        for (int i = 1; i < local_count; ++i) {
+    if (rank_idx == 0) {
+        // TODO: barrier from ofi transport is hanging, remove allgather
+        // after fix, here and everywhere
+        ccl::utils::allgather(comm, &ack, all_ints.data(), sizeof(int));
+        for (int i = 1; i < comm_size; ++i) {
             std::string remote_sock_name;
             struct sockaddr_un remote_sockaddr;
 
             remote_sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(all_pids[i]) + ":" +
-                               std::to_string(i) + "-" + std::to_string(local_idx);
+                               std::to_string(i) + "-" + std::to_string(rank_idx) + "-" +
+                               std::to_string(comm_size);
             sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(pid) + ":" +
-                        std::to_string(local_idx) + "-" + std::to_string(i);
+                        std::to_string(rank_idx) + "-" + std::to_string(i) + "-" +
+                        std::to_string(comm_size);
 
             // create a socket for local proc j
             all_socks[i] = socket(AF_UNIX, SOCK_STREAM, 0);
@@ -600,8 +539,8 @@ void fd_manager::exchange_device_fds() {
     else {
         int sock;
         // create the local socket name
-        sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(pid) + ":" +
-                    std::to_string(local_idx) + "-" + std::to_string(0);
+        sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(pid) + ":" + std::to_string(rank_idx) +
+                    "-" + std::to_string(0) + "-" + std::to_string(comm_size);
         // create a socket for local proc i
         sock = socket(AF_UNIX, SOCK_STREAM, 0);
         CCL_THROW_IF_NOT(sock != ccl::utils::invalid_fd,
@@ -625,7 +564,7 @@ void fd_manager::exchange_device_fds() {
                          strerror(errno));
 
         // listen to the socket to accept a connection to the other process
-        sock_err = listen(sock, local_count);
+        sock_err = listen(sock, comm_size);
         CCL_THROW_IF_NOT(sock_err != ccl::utils::invalid_err_code,
                          "listen failed: sock_err: ",
                          sock_err,
@@ -633,7 +572,7 @@ void fd_manager::exchange_device_fds() {
                          strerror(errno));
 
         // notify the other process that the socket is created and being listened to
-        barrier(mem);
+        ccl::utils::allgather(comm, &ack, all_ints.data(), sizeof(int));
 
         all_socks[0] = accept(sock, (struct sockaddr *)&sockaddr, &sockaddr_len);
         CCL_THROW_IF_NOT(all_socks[0] != ccl::utils::invalid_err_code,
@@ -646,31 +585,28 @@ void fd_manager::exchange_device_fds() {
             ccl::utils::invalid_err_code) {
             CCL_THROW("setsockopt failed: sock: ", all_socks[0], ", errno: ", strerror(errno));
         }
-        close(sock);
+        ccl::utils::close_fd(sock);
     }
 
     LOG_DEBUG("connection is set up");
-    device_fds = setup_device_fds(local_count, local_idx, device_bdfs);
+    device_fds = setup_device_fds(comm_size, rank_idx, device_bdfs);
     physical_devices = fill_physical_devices();
 
     // close sockets
-    if (local_idx == 0) {
-        close_sockets(local_count, local_idx);
-        barrier(mem);
+    if (rank_idx == 0) {
+        close_sockets(comm_size, rank_idx);
+        ccl::utils::allgather(comm, &ack, all_ints.data(), sizeof(int));
     }
     else {
-        barrier(mem);
-        close_sockets(local_count, local_idx);
+        ccl::utils::allgather(comm, &ack, all_ints.data(), sizeof(int));
+        close_sockets(comm_size, rank_idx);
     }
-
-    int ret = munmap(mem, length);
-    CCL_THROW_IF_NOT(ret == 0, "munmap failed: ret: ", ret, ", errno: ", strerror(errno));
 }
 
-void fd_manager::close_sockets(int local_count, int proc_idx) {
+void fd_manager::close_sockets(int comm_size, int rank_idx) {
     int sock_err;
     std::string sock_name;
-    for (int i = 0; i < local_count; ++i) {
+    for (int i = 0; i < comm_size; ++i) {
         if (all_socks[i] != ccl::utils::invalid_fd) {
             sock_err = close(all_socks[i]);
             CCL_THROW_IF_NOT(sock_err != ccl::utils::invalid_err_code,
@@ -680,10 +616,11 @@ void fd_manager::close_sockets(int local_count, int proc_idx) {
                              strerror(errno));
         }
 
-        if (all_pids[proc_idx] != ccl::utils::invalid_pid && proc_idx != i) {
-            sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(all_pids[proc_idx]) + ":" +
-                        std::to_string(proc_idx) + "-" + std::to_string(i);
-            sock_err = unlink(sock_name.c_str());
+        if (all_pids[rank_idx] != ccl::utils::invalid_pid && rank_idx != i) {
+            sock_name = "/tmp/ccl-ipc-fd-sock-" + std::to_string(all_pids[rank_idx]) + ":" +
+                        std::to_string(rank_idx) + "-" + std::to_string(i) + "-" +
+                        std::to_string(comm_size);
+            unlink(sock_name.c_str());
         }
     }
 }
diff --git a/src/common/global/ze/ze_fd_manager.hpp b/src/common/global/ze/ze_fd_manager.hpp
index a5df5430e..e2cc1cf56 100644
--- a/src/common/global/ze/ze_fd_manager.hpp
+++ b/src/common/global/ze/ze_fd_manager.hpp
@@ -32,8 +32,7 @@ static std::map<ipc_exchange_mode, std::string> ipc_exchange_names = {
 #ifdef CCL_ENABLE_DRM
     std::make_pair(ipc_exchange_mode::drmfd, "drmfd"),
 #endif // CCL_ENABLE_DRM
-    std::make_pair(ipc_exchange_mode::pidfd, "pidfd"),
-    std::make_pair(ipc_exchange_mode::none, "none")
+    std::make_pair(ipc_exchange_mode::pidfd, "pidfd")
 };
 
 // RAII
@@ -103,7 +102,7 @@ class fd_manager {
     // permissions for group and others
     static constexpr mode_t rwe_umask = 077;
 
-    fd_manager();
+    fd_manager(std::shared_ptr<atl_base_comm> comm);
     fd_manager(const fd_manager&) = delete;
     fd_manager(fd_manager&&) = delete;
     fd_manager& operator=(const fd_manager&) = delete;
@@ -125,22 +124,15 @@ class fd_manager {
 
 private:
     void exchange_device_fds();
-    std::vector<int> setup_device_fds(int local_count,
-                                      int proc_idx,
+    std::vector<int> setup_device_fds(int comm_size,
+                                      int rank_idx,
                                       std::vector<bdf_info>& return_bdf);
 
-    void close_sockets(int local_count, int proc_idx);
-
-    std::string get_shm_filename();
-    void* create_shared_memory();
-    void barrier(void* mem);
+    void close_sockets(int comm_size, int rank_idx);
 
     static int convert_fd_pidfd(int convert_from_fd, int handle);
     static int convert_fd_drmfd(int convert_from_fd, int handle);
 
-    const int counter_offset = sizeof(int);
-    const int size_per_proc = sizeof(pid_t);
-
     // init
     std::vector<int> init_device_fds();
     std::vector<bdf_info> init_device_bdfs(const size_t size);
@@ -160,6 +152,7 @@ class fd_manager {
 
     std::vector<int> device_fds;
     std::vector<device_bdf_info> physical_devices;
+    std::shared_ptr<atl_base_comm> comm;
 };
 
 } // namespace ze
diff --git a/src/common/request/request.hpp b/src/common/request/request.hpp
index ba82060a9..4a2a6ea46 100644
--- a/src/common/request/request.hpp
+++ b/src/common/request/request.hpp
@@ -31,6 +31,8 @@ class alignas(CACHELINE_SIZE) ccl_request {
     using dump_func = std::function<void(std::ostream&)>;
 
     ccl_request(ccl_sched& sched);
+    ccl_request(const ccl_request& other) = delete;
+    ccl_request& operator=(const ccl_request& other) = delete;
 
     virtual ~ccl_request();
 
@@ -47,8 +49,6 @@ class alignas(CACHELINE_SIZE) ccl_request {
 
     mutable bool urgent = false;
 
-    bool synchronous = false;
-
 #ifdef CCL_ENABLE_SYCL
     void set_native_event(sycl::event new_event) {
         native_event = std::make_shared<sycl::event>(new_event);
diff --git a/src/common/stream/stream.cpp b/src/common/stream/stream.cpp
index 28870f7be..e5b87fc60 100644
--- a/src/common/stream/stream.cpp
+++ b/src/common/stream/stream.cpp
@@ -29,6 +29,7 @@ std::string to_string(device_family family) {
     switch (family) {
         case device_family::family1: return "family1";
         case device_family::family2: return "family2";
+        case device_family::family3: return "family3";
         default: return "unknown";
     }
 }
diff --git a/src/common/utils/buffer.hpp b/src/common/utils/buffer.hpp
index 8c840590a..0e1a6d2c7 100644
--- a/src/common/utils/buffer.hpp
+++ b/src/common/utils/buffer.hpp
@@ -46,7 +46,7 @@ class ccl_buffer {
                       access_size);
         }
 
-        if ((size != -1) && (offset + access_size > (size_t)size)) {
+        if ((size != -1) && (offset + access_size > (size_t)size) && (access_size != 0)) {
             result = false;
             LOG_ERROR("unexpected (offset + access_size): ",
                       "size ",
@@ -62,7 +62,12 @@ class ccl_buffer {
 
 public:
     ccl_buffer(void* src) = delete;
+    // user is responsible for freeing the memory after all buffers
+    // are destroyed
+    ~ccl_buffer() = default;
 
+    // buffer does not take *src* ownership, user is responsible for
+    // freeing all memory after all buffers are destroyed
     ccl_buffer(void* src, ssize_t size, size_t offset, ccl_buffer_type type)
             : src(src),
               size(size),
diff --git a/src/common/utils/profile.cpp b/src/common/utils/profile.cpp
index c0cb6bd0b..7997a51cc 100644
--- a/src/common/utils/profile.cpp
+++ b/src/common/utils/profile.cpp
@@ -16,31 +16,32 @@
 #include "common/utils/utils.hpp"
 #include "common/log/log.hpp"
 
-ccl::profile::metrics_manager::~metrics_manager() {
-    finalize();
+void ccl::profile::metrics_counter::init() {
+    this->nonparallel_calls_per_count.clear();
+    this->parallel_calls_per_count.clear();
 }
 
-void ccl::profile::metrics_manager::init() {
-    allreduce_pipe_nonparallel_calls_per_count.clear();
-    allreduce_pipe_parallel_calls_per_count.clear();
-}
-
-void ccl::profile::metrics_manager::finalize() {
-    std::string allreduce_pipe_metrics;
+ccl::profile::metrics_counter::~metrics_counter() {
+    std::string pipe_metrics;
 
-    for (auto calls_per_count : allreduce_pipe_nonparallel_calls_per_count) {
-        allreduce_pipe_metrics += "nonparallel_calls_per_count[" +
-                                  std::to_string(calls_per_count.first) +
-                                  "]=" + std::to_string(calls_per_count.second) + ",\n";
+    for (auto calls_per_count : this->nonparallel_calls_per_count) {
+        pipe_metrics += "nonparallel_calls_per_count[" + std::to_string(calls_per_count.first) +
+                        "]=" + std::to_string(calls_per_count.second) + ",\n";
     }
 
-    for (auto calls_per_count : allreduce_pipe_parallel_calls_per_count) {
-        allreduce_pipe_metrics += "   parallel_calls_per_count[" +
-                                  std::to_string(calls_per_count.first) +
-                                  "]=" + std::to_string(calls_per_count.second) + ",\n";
+    for (auto calls_per_count : this->parallel_calls_per_count) {
+        pipe_metrics += "   parallel_calls_per_count[" + std::to_string(calls_per_count.first) +
+                        "]=" + std::to_string(calls_per_count.second) + ",\n";
     }
 
-    if (!allreduce_pipe_metrics.empty()) {
-        LOG_INFO("allreduce_pipe_metrics: [\n", allreduce_pipe_metrics, "]");
+    if (!pipe_metrics.empty()) {
+        LOG_INFO(this->collective_name, "_pipe_metrics: [\n", pipe_metrics, "]");
     }
 }
+
+void ccl::profile::metrics_manager::init() {
+    this->allreduce_pipe.init();
+    this->reduce_pipe.init();
+    this->reduce_scatter_pipe.init();
+    this->allgatherv_pipe.init();
+}
diff --git a/src/common/utils/profile.hpp b/src/common/utils/profile.hpp
index 29b375723..72437d1fd 100644
--- a/src/common/utils/profile.hpp
+++ b/src/common/utils/profile.hpp
@@ -16,19 +16,31 @@
 #pragma once
 
 #include <map>
+#include <cstddef>
 
 namespace ccl {
 namespace profile {
 
-class metrics_manager {
-    void finalize();
+class metrics_counter {
+    const char *collective_name;
+
+public:
+    std::map<size_t, size_t> nonparallel_calls_per_count, parallel_calls_per_count;
+    metrics_counter(const char *collective_name) : collective_name(collective_name){};
+    void init();
+    ~metrics_counter();
+    metrics_counter(const metrics_counter &) = delete;
+    metrics_counter &operator=(const metrics_counter &) = delete;
+};
 
+class metrics_manager {
 public:
-    std::map<size_t, size_t> allreduce_pipe_nonparallel_calls_per_count,
-        allreduce_pipe_parallel_calls_per_count;
+    metrics_counter allreduce_pipe{ "allreduce" };
+    metrics_counter reduce_pipe{ "reduce" };
+    metrics_counter reduce_scatter_pipe{ "reduce_scatter" };
+    metrics_counter allgatherv_pipe{ "allgatherv" };
 
     void init();
-    ~metrics_manager();
 };
 
 } // namespace profile
diff --git a/src/common/utils/utils.cpp b/src/common/utils/utils.cpp
index a5fe2be11..122fee40a 100644
--- a/src/common/utils/utils.cpp
+++ b/src/common/utils/utils.cpp
@@ -98,5 +98,10 @@ std::string join_strings(const std::vector<std::string>& tokens, const std::stri
     return ss.str();
 }
 
+void close_fd(int fd) {
+    LOG_DEBUG("closing fd: ", fd);
+    close(fd);
+}
+
 } // namespace utils
 } // namespace ccl
diff --git a/src/common/utils/utils.hpp b/src/common/utils/utils.hpp
index b183d7250..6c6137f7b 100644
--- a/src/common/utils/utils.hpp
+++ b/src/common/utils/utils.hpp
@@ -255,5 +255,7 @@ std::string to_hex(T integer) {
     return ss.str();
 }
 
+void close_fd(int fd);
+
 } // namespace utils
 } // namespace ccl
diff --git a/src/comp/bf16/bf16.cpp b/src/comp/bf16/bf16.cpp
index f1a59d14a..6a46f83b9 100644
--- a/src/comp/bf16/bf16.cpp
+++ b/src/comp/bf16/bf16.cpp
@@ -89,7 +89,7 @@ void ccl_bf16_reduce(const void* in_buf,
                      void* inout_buf,
                      size_t* out_count,
                      ccl::reduction op) {
-    LOG_DEBUG("BF16 reduction for %zu elements", in_count);
+    LOG_DEBUG("BF16 reduction for", in_count, " elements");
 
     if (out_count != nullptr) {
         *out_count = in_count;
diff --git a/src/comp/fp16/fp16.cpp b/src/comp/fp16/fp16.cpp
index 99e33f325..6472e6fdd 100644
--- a/src/comp/fp16/fp16.cpp
+++ b/src/comp/fp16/fp16.cpp
@@ -41,7 +41,7 @@ void ccl_fp16_reduce(const void* in_buf,
                      void* inout_buf,
                      size_t* out_cnt,
                      ccl::reduction op) {
-    LOG_DEBUG("FP16 reduction for %zu elements\n", in_cnt);
+    LOG_DEBUG("FP16 reduction for", in_cnt, " elements");
 
     if (out_cnt != nullptr) {
         *out_cnt = in_cnt;
diff --git a/src/exec/exec.hpp b/src/exec/exec.hpp
index 186a7e41a..6fb7e1a6c 100644
--- a/src/exec/exec.hpp
+++ b/src/exec/exec.hpp
@@ -60,6 +60,12 @@ class alignas(CACHELINE_SIZE) ccl_executor {
             parent = src.parent;
             src.parent = nullptr;
         }
+        worker_guard& operator=(worker_guard&) = delete;
+        worker_guard& operator=(worker_guard&& src) {
+            parent = src.parent;
+            src.parent = nullptr;
+            return *this;
+        }
         ~worker_guard() {
             if (parent)
                 parent->unlock_workers();
@@ -143,7 +149,7 @@ inline ccl_wait_result ccl_wait_impl(ccl_executor* exec, ccl_request* request) {
             request,
             " completed, sched ",
             ccl_coll_type_to_str(static_cast<sched_type*>(request->get_sched())->coll_param.ctype));
-        if (!request->synchronous) {
+        if (!request->get_sched()->coll_attr.synchronous) {
             ccl_release_request(request);
             ret = ccl_wait_result_completed_released;
         }
diff --git a/src/exec/thread/service_worker.hpp b/src/exec/thread/service_worker.hpp
index a9267df29..c02576fd3 100644
--- a/src/exec/thread/service_worker.hpp
+++ b/src/exec/thread/service_worker.hpp
@@ -26,6 +26,9 @@ class ccl_service_worker : public ccl_worker {
                        ccl_fusion_manager& fusion_manager);
     ~ccl_service_worker();
 
+    ccl_service_worker(const ccl_service_worker& other) = delete;
+    ccl_service_worker& operator=(const ccl_service_worker& other) = delete;
+
     ccl::status do_work(size_t& processed_count) override;
 
     bool can_reset() override;
diff --git a/src/hwloc/hwloc_wrapper.hpp b/src/hwloc/hwloc_wrapper.hpp
index 586d796a9..e003444db 100644
--- a/src/hwloc/hwloc_wrapper.hpp
+++ b/src/hwloc/hwloc_wrapper.hpp
@@ -17,6 +17,9 @@
 
 #include "hwloc.h"
 
+#include <vector>
+#include <string>
+
 #define CCL_HWLOC_INVALID_NUMA_NODE (-1)
 
 struct ccl_numa_node {
@@ -39,6 +42,8 @@ struct ccl_numa_node {
 class ccl_hwloc_wrapper {
 public:
     ccl_hwloc_wrapper();
+    ccl_hwloc_wrapper(const ccl_hwloc_wrapper& other) = delete;
+    ccl_hwloc_wrapper& operator=(const ccl_hwloc_wrapper& other) = delete;
     ~ccl_hwloc_wrapper();
 
     bool is_initialized();
diff --git a/src/kernels/bf16.h b/src/kernels/bf16.h
index 82004ed3e..e89cd474c 100644
--- a/src/kernels/bf16.h
+++ b/src/kernels/bf16.h
@@ -32,21 +32,7 @@ ushort __fp32_to_bf16(float V) {
 #ifdef cl_intel_bfloat16_conversions
 #pragma OPENCL EXTENSION cl_intel_bfloat16_conversions : enable
 #else // cl_intel_bfloat16_conversions
-
-// declare SPIR-V intrinsics directly
-ushort __builtin_IB_ftobf_1(float);
-float __builtin_IB_bftof_1(ushort);
-
-// implement built-in functions using these intrinsics
-#define __ovld __attribute__((overloadable))
-ushort __ovld intel_convert_bfloat16_as_ushort(float f) {
-    return __builtin_IB_ftobf_1(f);
-}
-
-float __ovld intel_convert_as_bfloat16_float(ushort u) {
-    return __builtin_IB_bftof_1(u);
-}
-
+#error "cl_intel_bfloat16_conversions are not defined, compilation failed."
 #endif // cl_intel_bfloat16_conversions
 
 float __bf16_to_fp32(ushort V) {
diff --git a/src/kernels/kernels.cl b/src/kernels/kernels.cl
index f21d29314..4e179dc43 100644
--- a/src/kernels/kernels.cl
+++ b/src/kernels/kernels.cl
@@ -64,9 +64,28 @@ __kernel void empty_kernel(int my_rank,
         out_buf##b[idx] = in_buf##b[idx]; \
     }
 
-#define BUFFER_COPY(dst, src, b) \
-    for (size_t idx = thread_id; idx < count##b; idx += work_group_size) { \
-        dst##b[idx] = src##b[idx]; \
+#define CONVERT_half_USHORT(val)   as_ushort((half)val)
+#define CONVERT_ushort_USHORT(val) val
+#define CONVERT_short_USHORT(val)  val
+#define CONVERT_uchar_USHORT(val)  val
+#define CONVERT_char_USHORT(val)   val
+#define CONVERT_uint_USHORT(val)   val
+#define CONVERT_int_USHORT(val)    val
+#define CONVERT_ulong_USHORT(val)  val
+#define CONVERT_long_USHORT(val)   val
+#define CONVERT_float_USHORT(val)  val
+#define CONVERT_double_USHORT(val) val
+
+#define BUFFER_COPY(Dtype, dst, src, b) \
+    { \
+        const long rem_elem_count = count##b - subgroup_idx; \
+        if (rem_elem_count > 0 && rem_elem_count >= subgroup_size && sizeof(Dtype) == 2) { \
+            intel_sub_group_block_write_us((__global ushort*)(&dst##b[idx]), \
+                                           CONVERT_##Dtype##_USHORT(src##b[idx])); \
+        } \
+        else if (idx < count##b) { \
+            dst##b[idx] = src##b[idx]; \
+        } \
     }
 
 // ALLTOALLV_COPY#: 2-16 args number, max case is 16 ranks
@@ -80,13 +99,13 @@ __kernel void empty_kernel(int my_rank,
 #define ALLTOALLV_COPY16 ALLTOALLV_COPY14 ALLTOALLV_COPY(14) ALLTOALLV_COPY(15)
 
 // BUFFER_COPY#: 1-7 args number, max case is 16 ranks
-#define BUFFER_COPY1(dst, src) BUFFER_COPY(dst, src, 1)
-#define BUFFER_COPY2(dst, src) BUFFER_COPY1(dst, src) BUFFER_COPY(dst, src, 2)
-#define BUFFER_COPY3(dst, src) BUFFER_COPY2(dst, src) BUFFER_COPY(dst, src, 3)
-#define BUFFER_COPY4(dst, src) BUFFER_COPY3(dst, src) BUFFER_COPY(dst, src, 4)
-#define BUFFER_COPY5(dst, src) BUFFER_COPY4(dst, src) BUFFER_COPY(dst, src, 5)
-#define BUFFER_COPY6(dst, src) BUFFER_COPY5(dst, src) BUFFER_COPY(dst, src, 6)
-#define BUFFER_COPY7(dst, src) BUFFER_COPY6(dst, src) BUFFER_COPY(dst, src, 7)
+#define BUFFER_COPY1(Dtype, dst, src) BUFFER_COPY(Dtype, dst, src, 1)
+#define BUFFER_COPY2(Dtype, dst, src) BUFFER_COPY1(Dtype, dst, src) BUFFER_COPY(Dtype, dst, src, 2)
+#define BUFFER_COPY3(Dtype, dst, src) BUFFER_COPY2(Dtype, dst, src) BUFFER_COPY(Dtype, dst, src, 3)
+#define BUFFER_COPY4(Dtype, dst, src) BUFFER_COPY3(Dtype, dst, src) BUFFER_COPY(Dtype, dst, src, 4)
+#define BUFFER_COPY5(Dtype, dst, src) BUFFER_COPY4(Dtype, dst, src) BUFFER_COPY(Dtype, dst, src, 5)
+#define BUFFER_COPY6(Dtype, dst, src) BUFFER_COPY5(Dtype, dst, src) BUFFER_COPY(Dtype, dst, src, 6)
+#define BUFFER_COPY7(Dtype, dst, src) BUFFER_COPY6(Dtype, dst, src) BUFFER_COPY(Dtype, dst, src, 7)
 
 #define DEFINE_ALLTOALLV_KERNEL(DtypeName, Dtype, OpName, OpFunc, N) \
     __kernel void alltoallv_kernel_##N##_##DtypeName##_##OpName( \
@@ -97,17 +116,27 @@ __kernel void empty_kernel(int my_rank,
     }
 
 // reduction for local_reduce
-#define REDUCTION(OpFunc, b) \
-    xelink_tmp_buf##b[idx] = OpFunc(local_send_buf##b[idx], mdfi_buf##b[idx]);
+#define REDUCTION(Dtype, OpFunc, b) \
+    { \
+        Dtype reduction = OpFunc(mdfi_buf##b[idx], local_send_buf##b[idx]); \
+        if (can_use_block == 1 && rem_elem_count > 0 && rem_elem_count >= subgroup_size && \
+            sizeof(Dtype) == 2) { \
+            intel_sub_group_block_write_us((__global ushort*)(&xelink_tmp_buf##b[idx]), \
+                                           CONVERT_##Dtype##_USHORT(reduction)); \
+        } \
+        else { \
+            xelink_tmp_buf##b[idx] = reduction; \
+        } \
+    }
 
 // REDUCTION#: 1-7 args number, max case is 16 ranks
-#define REDUCTION1(OpFunc) REDUCTION(OpFunc, 0)
-#define REDUCTION2(OpFunc) REDUCTION1(OpFunc) REDUCTION(OpFunc, 1)
-#define REDUCTION3(OpFunc) REDUCTION2(OpFunc) REDUCTION(OpFunc, 2)
-#define REDUCTION4(OpFunc) REDUCTION3(OpFunc) REDUCTION(OpFunc, 3)
-#define REDUCTION5(OpFunc) REDUCTION4(OpFunc) REDUCTION(OpFunc, 4)
-#define REDUCTION6(OpFunc) REDUCTION5(OpFunc) REDUCTION(OpFunc, 5)
-#define REDUCTION7(OpFunc) REDUCTION6(OpFunc) REDUCTION(OpFunc, 6)
+#define REDUCTION1(Dtype, OpFunc) REDUCTION(Dtype, OpFunc, 0)
+#define REDUCTION2(Dtype, OpFunc) REDUCTION1(Dtype, OpFunc) REDUCTION(Dtype, OpFunc, 1)
+#define REDUCTION3(Dtype, OpFunc) REDUCTION2(Dtype, OpFunc) REDUCTION(Dtype, OpFunc, 2)
+#define REDUCTION4(Dtype, OpFunc) REDUCTION3(Dtype, OpFunc) REDUCTION(Dtype, OpFunc, 3)
+#define REDUCTION5(Dtype, OpFunc) REDUCTION4(Dtype, OpFunc) REDUCTION(Dtype, OpFunc, 4)
+#define REDUCTION6(Dtype, OpFunc) REDUCTION5(Dtype, OpFunc) REDUCTION(Dtype, OpFunc, 5)
+#define REDUCTION7(Dtype, OpFunc) REDUCTION6(Dtype, OpFunc) REDUCTION(Dtype, OpFunc, 6)
 
 // reduction for local_reduce
 #define FIRST_REDUCE(OpFunc, b0, b1) \
@@ -130,15 +159,20 @@ __kernel void empty_kernel(int my_rank,
         ALL_PTR_ARGS(Dtype, mdfi_buf, N), \
         ALL_PTR_ARGS(Dtype, xelink_tmp_buf, N), \
         ulong count, \
-        ulong last_count) { \
+        ulong last_count, \
+        int can_use_block) { \
         DEBUG_BLOCK(printf("in reduce_read_write_kernel count %ld\n", count)); \
         size_t work_group_size = get_global_size(0); \
         size_t thread_id = get_global_id(0); \
+        const size_t subgroup_size = get_sub_group_size(); \
+        const size_t subgroup_idx = thread_id / subgroup_size * subgroup_size; \
         for (size_t idx = thread_id; idx < count; idx += work_group_size) { \
-            REDUCTION##N(OpFunc) \
+            const long rem_elem_count = count - subgroup_idx; \
+            REDUCTION##N(Dtype, OpFunc) \
         } \
         for (size_t idx = thread_id; idx < last_count; idx += work_group_size) { \
-            REDUCTION(OpFunc, N) \
+            const long rem_elem_count = last_count - subgroup_idx; \
+            REDUCTION(Dtype, OpFunc, N) \
         } \
     }
 
@@ -157,16 +191,26 @@ __kernel void empty_kernel(int my_rank,
     __kernel void allreduce_kernel_##DtypeName##_##OpName(int my_rank, \
                                                           int comm_size, \
                                                           ulong count, \
+                                                          int can_use_block, \
                                                           const __global Dtype* input_buffer, \
                                                           __global Dtype* output_buffer, \
                                                           const __global Dtype* peer_input_buffer, \
                                                           __global Dtype* peer_output_buffer) { \
         DEBUG_BLOCK(printf("rank: %d, comm size: %d, count: %zu\n", my_rank, comm_size, count)); \
         size_t work_group_size = get_global_size(0); \
-        size_t thread_id = get_global_id(0); \
-        for (size_t i = 0; thread_id + i < count; i += work_group_size) { \
-            const size_t idx = thread_id + i; \
-            Dtype ret = OpFunc(input_buffer[idx], peer_input_buffer[idx]); \
+        size_t idx = get_global_id(0); \
+        const size_t subgroup_size = get_sub_group_size(); \
+        const size_t subgroup_idx = idx / subgroup_size * subgroup_size; \
+        const long rem_elem_count = count - subgroup_idx; \
+        Dtype ret = OpFunc(input_buffer[idx], peer_input_buffer[idx]); \
+        if (can_use_block == 1 && rem_elem_count > 0 && rem_elem_count >= subgroup_size && \
+            sizeof(Dtype) == 2) { \
+            intel_sub_group_block_write_us((__global ushort*)&output_buffer[subgroup_idx], \
+                                           CONVERT_##Dtype##_USHORT(ret)); \
+            intel_sub_group_block_write_us((__global ushort*)&peer_output_buffer[subgroup_idx], \
+                                           CONVERT_##Dtype##_USHORT(ret)); \
+        } \
+        else if (idx < count) { \
             output_buffer[idx] = ret; \
             peer_output_buffer[idx] = ret; \
         } \
@@ -205,18 +249,19 @@ __kernel void empty_kernel(int my_rank,
     __kernel void reduce_single_local_inplace_kernel_##DtypeName##_##OpName( \
         ulong count, \
         int peer_count, \
-        const __global Dtype* input_buffer, \
-        __global Dtype* inoutput_buffer) { \
+        const __global Dtype* input_buffer1, \
+        const __global Dtype* input_buffer2, \
+        __global Dtype* output_buffer) { \
         DEBUG_BLOCK(printf("in reduce_single_local_inplace_kernel\n")); \
         size_t work_group_size = get_global_size(0); \
         size_t thread_id = get_global_id(0); \
         for (size_t i = 0; thread_id + i < count; i += work_group_size) { \
             const size_t idx = thread_id + i; \
-            Dtype ret = OpFunc(input_buffer[idx], inoutput_buffer[idx]); \
+            Dtype ret = OpFunc(input_buffer1[idx], input_buffer2[idx]); \
             for (int j = 1; j < peer_count; j++) { \
-                ret = OpFunc(inoutput_buffer[j * count + idx], ret); \
+                ret = OpFunc(input_buffer2[j * count + idx], ret); \
             } \
-            inoutput_buffer[idx] = ret; \
+            output_buffer[idx] = ret; \
         } \
     }
 
@@ -523,10 +568,12 @@ __kernel void empty_kernel(int my_rank,
         PTR_ARGS##N(Dtype, peer_output_buffer), \
         CONST_ARGS##N(ulong, count)) { \
         DEBUG_BLOCK(printf("in read_write_monolithic_kernel_%d\n", N)); \
-        size_t work_group_size = get_global_size(0); \
-        size_t thread_id = get_global_id(0); \
-        BUFFER_COPY##N(output_buffer, peer_buffer) if (pipeline_count > 1) { \
-            BUFFER_COPY##N(peer_output_buffer, output_buffer) \
+        const size_t work_group_size = get_global_size(0); \
+        const size_t idx = get_global_id(0); \
+        const size_t subgroup_size = get_sub_group_size(); \
+        const size_t subgroup_idx = idx / subgroup_size * subgroup_size; \
+        BUFFER_COPY##N(Dtype, output_buffer, peer_buffer) if (pipeline_count > 1) { \
+            BUFFER_COPY##N(Dtype, peer_output_buffer, output_buffer) \
         } \
     }
 
diff --git a/src/kernels/kernels.spv b/src/kernels/kernels.spv
index 2aaf5c03e..5c7640d29 100644
Binary files a/src/kernels/kernels.spv and b/src/kernels/kernels.spv differ
diff --git a/src/parallelizer/parallelizer.cpp b/src/parallelizer/parallelizer.cpp
index 985d8cdc8..20ed37a7f 100644
--- a/src/parallelizer/parallelizer.cpp
+++ b/src/parallelizer/parallelizer.cpp
@@ -158,7 +158,8 @@ ccl::status ccl_parallelizer::process_pre_post_copies(ccl_sched* sched) {
 }
 
 ccl::status ccl_parallelizer::process_output_event(ccl_sched* sched) {
-    if (!ccl::utils::should_use_sycl_output_event(sched->coll_param.stream)) {
+    if (!ccl::utils::should_use_sycl_output_event(sched->coll_param.stream) &&
+        !ccl::is_queue_in_order(sched->coll_param.stream)) {
         return ccl::status::success;
     }
 
@@ -558,6 +559,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i
                                                    coll_param.get_send_count(),
                                                    recv_buf,
                                                    coll_param.recv_counts.data(),
+                                                   std::vector<ccl_buffer>{},
                                                    dtype,
                                                    comm);
                 }
diff --git a/src/sched/buffer/buffer_cache.cpp b/src/sched/buffer/buffer_cache.cpp
index c917e61bd..f3f81b0da 100644
--- a/src/sched/buffer/buffer_cache.cpp
+++ b/src/sched/buffer/buffer_cache.cpp
@@ -18,6 +18,53 @@
 
 namespace ccl {
 
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+typedef ze_result_t (*pFnzexDriverImportExternalPointer)(ze_driver_handle_t, void*, size_t);
+typedef ze_result_t (*pFnzexDriverReleaseImportedPointer)(ze_driver_handle_t, void*);
+pFnzexDriverImportExternalPointer zexDriverImportExternalPointer = nullptr;
+pFnzexDriverReleaseImportedPointer zexDriverReleaseImportedPointer = nullptr;
+ze_driver_handle_t ze_driver_handle;
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+
+buffer_cache::buffer_cache(size_t instance_count)
+        : reg_buffers(instance_count)
+#ifdef CCL_ENABLE_SYCL
+          ,
+          sycl_buffers(instance_count)
+#endif // CCL_ENABLE_SYCL
+{
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    // Instruct the UMD to create the internal graphics allocation for each system memory allocation
+    // against a driver handle, instead of a command list handle.
+    // By doing this, the UMD is able to reuse the internal graphics allocation for any new or reset list,
+    // until the application decides to release the imported pointer. Any GPU driver handle fits.
+    // This API is a part of exported extensions, therefore have to check for availability first.
+    // Note: ze_data may be not initialized in some cases like stub backend mode or CCL_ZE_ENABLE=0
+    if (global_data::env().enable_buffer_cache && global_data::get().ze_data &&
+        !global_data::get().ze_data->drivers.empty()) {
+        ze_driver_handle = global_data::get().ze_data->drivers.front();
+        ze_result_t res_import = zeDriverGetExtensionFunctionAddress(
+            ze_driver_handle,
+            "zexDriverImportExternalPointer",
+            reinterpret_cast<void**>(&zexDriverImportExternalPointer));
+        ze_result_t res_release = zeDriverGetExtensionFunctionAddress(
+            ze_driver_handle,
+            "zexDriverReleaseImportedPointer",
+            reinterpret_cast<void**>(&zexDriverReleaseImportedPointer));
+        if ((res_import != ZE_RESULT_SUCCESS) || (res_release != ZE_RESULT_SUCCESS)) {
+            // Reset function pointers for safety
+            zexDriverImportExternalPointer = nullptr;
+            zexDriverReleaseImportedPointer = nullptr;
+            LOG_INFO("Can not initialize Import Extension API ",
+                     "(zexDriverReleaseImportedPointer/zexDriverImportExternalPointer: ",
+                     std::to_string(res_import),
+                     " ",
+                     std::to_string(res_release));
+        }
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+}
+
 buffer_cache::~buffer_cache() {
     for (auto& instance : reg_buffers) {
         instance.clear();
@@ -77,6 +124,15 @@ void regular_buffer_cache::get(size_t bytes, void** pptr) {
         }
     }
     *pptr = CCL_MALLOC(bytes, "buffer");
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    if (zexDriverImportExternalPointer) {
+        ze_result_t res = zexDriverImportExternalPointer(ze_driver_handle, *pptr, bytes);
+        if (res != ZE_RESULT_SUCCESS) {
+            LOG_INFO("zexDriverImportExternalPointer can not register the pointer with error: ",
+                     std::to_string(res));
+        }
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 }
 
 void regular_buffer_cache::push(size_t bytes, void* ptr) {
@@ -87,6 +143,15 @@ void regular_buffer_cache::push(size_t bytes, void* ptr) {
         LOG_DEBUG("inserted to buffer cache: bytes: ", bytes, ", ptr: ", ptr);
         return;
     }
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    if (zexDriverReleaseImportedPointer) {
+        ze_result_t res = zexDriverReleaseImportedPointer(ze_driver_handle, ptr);
+        if (res != ZE_RESULT_SUCCESS) {
+            LOG_INFO("zexDriverReleaseImportPointer can not release the pointer with error: ",
+                     std::to_string(res));
+        }
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
     CCL_FREE(ptr);
 }
 
diff --git a/src/sched/buffer/buffer_cache.hpp b/src/sched/buffer/buffer_cache.hpp
index 50b28e034..033e4fa1e 100644
--- a/src/sched/buffer/buffer_cache.hpp
+++ b/src/sched/buffer/buffer_cache.hpp
@@ -34,14 +34,7 @@ class sycl_buffer_cache;
 
 class buffer_cache {
 public:
-    buffer_cache(size_t instance_count)
-            : reg_buffers(instance_count)
-#ifdef CCL_ENABLE_SYCL
-              ,
-              sycl_buffers(instance_count)
-#endif // CCL_ENABLE_SYCL
-    {
-    }
+    buffer_cache(size_t instance_count);
     buffer_cache(const buffer_cache&) = delete;
     buffer_cache& operator=(const buffer_cache&) = delete;
     ~buffer_cache();
@@ -69,6 +62,8 @@ class regular_buffer_cache {
 public:
     regular_buffer_cache() = default;
     ~regular_buffer_cache();
+    regular_buffer_cache(const regular_buffer_cache& other) = delete;
+    regular_buffer_cache& operator=(const regular_buffer_cache& other) = delete;
 
     void clear();
     void get(size_t bytes, void** pptr);
@@ -86,6 +81,8 @@ class regular_buffer_cache {
 class sycl_buffer_cache {
 public:
     sycl_buffer_cache() = default;
+    sycl_buffer_cache(const sycl_buffer_cache& other) = delete;
+    sycl_buffer_cache& operator=(const sycl_buffer_cache& other) = delete;
     ~sycl_buffer_cache();
 
     void clear();
diff --git a/src/sched/entry/coll/coll_entry.cpp b/src/sched/entry/coll/coll_entry.cpp
index 6de4c8973..82f11ed6b 100644
--- a/src/sched/entry/coll/coll_entry.cpp
+++ b/src/sched/entry/coll/coll_entry.cpp
@@ -33,6 +33,7 @@ ccl::status coll_entry::build_sched(ccl_sched* sched, const ccl_coll_param& para
                                             param.send_count,
                                             param.recv_buf,
                                             param.recv_counts.data(),
+                                            param.recv_scale_out_bufs,
                                             param.dtype,
                                             param.comm,
                                             param.is_scaleout);
@@ -43,6 +44,7 @@ ccl::status coll_entry::build_sched(ccl_sched* sched, const ccl_coll_param& para
                                            param.send_buf,
                                            param.recv_buf,
                                            param.count,
+                                           param.recv_scale_out_bufs,
                                            param.dtype,
                                            param.reduction,
                                            param.comm,
diff --git a/src/sched/entry/copy/copy_helper.cpp b/src/sched/entry/copy/copy_helper.cpp
index 5704879e2..501fc4df8 100644
--- a/src/sched/entry/copy/copy_helper.cpp
+++ b/src/sched/entry/copy/copy_helper.cpp
@@ -27,7 +27,8 @@ copy_attr::copy_attr()
           use_nontemporal(false)
 #ifdef CCL_ENABLE_ZE
           ,
-          hint_queue_index(0)
+          hint_queue_index(0),
+          force_queue_type(ccl::ze::queue_group_type::unknown)
 #endif // CCL_ENABLE_ZE
 {
 }
@@ -42,7 +43,8 @@ copy_attr::copy_attr(int peer_rank,
                      bool use_nontemporal
 #ifdef CCL_ENABLE_ZE
                      ,
-                     int hint_queue_index
+                     int hint_queue_index,
+                     ccl::ze::queue_group_type force_queue_type
 #endif // CCL_ENABLE_ZE
                      )
         : peer_rank(peer_rank),
@@ -55,7 +57,8 @@ copy_attr::copy_attr(int peer_rank,
           use_nontemporal(use_nontemporal)
 #ifdef CCL_ENABLE_ZE
           ,
-          hint_queue_index(hint_queue_index)
+          hint_queue_index(hint_queue_index),
+          force_queue_type(force_queue_type)
 #endif // CCL_ENABLE_ZE
 {
 }
@@ -65,6 +68,12 @@ copy_attr::copy_attr(copy_direction direction, size_t in_buf_offset, size_t out_
           in_buf_offset(in_buf_offset),
           out_buf_offset(out_buf_offset) {}
 
+#ifdef CCL_ENABLE_ZE
+copy_attr::copy_attr(copy_direction direction, ccl::ze::queue_group_type force_queue_type)
+        : direction(direction),
+          force_queue_type(force_queue_type) {}
+#endif // CCL_ENABLE_ZE
+
 using copy_direction_str_enum =
     ccl::utils::enum_to_str<ccl::utils::enum_to_underlying(copy_direction::c2c) + 1>;
 std::string to_string(copy_direction val) {
diff --git a/src/sched/entry/copy/copy_helper.hpp b/src/sched/entry/copy/copy_helper.hpp
index 8179d03c5..db8a3dbaf 100644
--- a/src/sched/entry/copy/copy_helper.hpp
+++ b/src/sched/entry/copy/copy_helper.hpp
@@ -22,6 +22,7 @@
 
 #ifdef CCL_ENABLE_SYCL
 #include "common/utils/sycl_utils.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
 #endif // CCL_ENABLE_SYCL
 
 enum class copy_direction { undefined, h2h, d2h, h2d, d2d, t2t, c2c };
@@ -41,6 +42,7 @@ struct copy_attr {
 
 #ifdef CCL_ENABLE_ZE
     int hint_queue_index = 0;
+    ccl::ze::queue_group_type force_queue_type = ccl::ze::queue_group_type::unknown;
 #endif // CCL_ENABLE_ZE
 
     copy_attr();
@@ -54,10 +56,15 @@ struct copy_attr {
               bool use_nontemporal = false
 #ifdef CCL_ENABLE_ZE
               ,
-              int hint_queue_index = 0
+              int hint_queue_index = 0,
+              ccl::ze::queue_group_type force_queue_type = ccl::ze::queue_group_type::unknown
 #endif // CCL_ENABLE_ZE
     );
     copy_attr(copy_direction direction, size_t in_buf_offset = 0, size_t out_buf_offset = 0);
+
+#ifdef CCL_ENABLE_ZE
+    copy_attr(copy_direction direction, ccl::ze::queue_group_type force_queue_type);
+#endif // CCL_ENABLE_ZE
 };
 
 #ifdef CCL_ENABLE_SYCL
diff --git a/src/sched/entry/recv_entry.hpp b/src/sched/entry/recv_entry.hpp
index 11c5b25b8..23b4bf7ef 100644
--- a/src/sched/entry/recv_entry.hpp
+++ b/src/sched/entry/recv_entry.hpp
@@ -28,6 +28,8 @@ class recv_entry : public sched_entry,
         return "RECV";
     }
 
+    recv_entry(const recv_entry& other) = delete;
+    recv_entry& operator=(const recv_entry& other) = delete;
     recv_entry() = delete;
     recv_entry(ccl_sched* sched,
                ccl_buffer buf,
diff --git a/src/sched/entry/recv_reduce_entry.hpp b/src/sched/entry/recv_reduce_entry.hpp
index 828b39baf..47447fbdb 100644
--- a/src/sched/entry/recv_reduce_entry.hpp
+++ b/src/sched/entry/recv_reduce_entry.hpp
@@ -30,7 +30,8 @@ class recv_reduce_entry final : public sched_entry {
         return "RECV_REDUCE";
     }
 
-    recv_reduce_entry() = delete;
+    recv_reduce_entry(const recv_reduce_entry& other) = delete;
+    recv_reduce_entry& operator=(const recv_reduce_entry& other) = delete;
     recv_reduce_entry(ccl_sched* sched,
                       ccl_buffer inout_buf,
                       size_t cnt,
diff --git a/src/sched/entry/write_entry.hpp b/src/sched/entry/write_entry.hpp
index e9e94942b..c4a2f2b02 100644
--- a/src/sched/entry/write_entry.hpp
+++ b/src/sched/entry/write_entry.hpp
@@ -28,6 +28,8 @@ class write_entry : public sched_entry,
     }
 
     write_entry() = delete;
+    write_entry(const write_entry& other) = delete;
+    write_entry& operator=(const write_entry& other) = delete;
     write_entry(ccl_sched* sched,
                 ccl_buffer src_buf,
                 atl_mr_t* src_mr,
diff --git a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp
index 124770703..9a8c66e4e 100644
--- a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp
+++ b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp
@@ -142,6 +142,7 @@ void ze_a2a_allreduce_entry::init_ze_hook() {
     ze_a2a_reduce_scatter_entry::fill_list(this,
                                            send_buf.get_ptr(),
                                            tmp_buf.get_ptr(),
+                                           tmp_buf.get_ptr(),
                                            peer_send_bufs,
                                            peer_count,
                                            comm_rank,
diff --git a/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp
index 63176c3fa..0ed87d1e3 100644
--- a/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp
+++ b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp
@@ -177,6 +177,13 @@ void ze_onesided_allreduce_entry::init_ze_hook() {
         // otherwise run two kernels, one for unaligned and one for aligned data
         CCL_ASSERT(kernel_count == 2);
         for (size_t i = start_kernel_idx; i < end_kernel_idx; i++) {
+            // the block stores can not be used for unaligned data that's why
+            // it's skipped taking into account that the first kernel, if they
+            // are two ones, is always unaligned
+            int can_use_block = 1;
+            if (i == 0 && end_kernel_idx == 2) {
+                can_use_block = 0;
+            }
             void* send_buf_ptr_tmp = static_cast<char*>(send_buf_ptr) + offsets[i];
             void* recv_buf_ptr_tmp = static_cast<char*>(recv_buf_ptr) + offsets[i];
             void* right_send_buf_ptr_tmp = static_cast<char*>(right_send_buf_ptr) + offsets[i];
@@ -184,6 +191,7 @@ void ze_onesided_allreduce_entry::init_ze_hook() {
             ze_kernel_args_t main_kernel_args{ &comm_rank,
                                                &comm_size,
                                                &counts[i],
+                                               &can_use_block,
                                                &send_buf_ptr_tmp,
                                                &recv_buf_ptr_tmp,
                                                &right_send_buf_ptr_tmp,
diff --git a/src/sched/entry/ze/cache/ze_cache.cpp b/src/sched/entry/ze/cache/ze_cache.cpp
index 4132e0149..a9ea4bc5e 100644
--- a/src/sched/entry/ze/cache/ze_cache.cpp
+++ b/src/sched/entry/ze/cache/ze_cache.cpp
@@ -441,7 +441,7 @@ void mem_handle_cache::handle_desc::close_handle() const {
     auto fd = get_fd_from_handle(handle);
     auto res = zeMemCloseIpcHandle(remote_context, ptr);
     if (res == ZE_RESULT_ERROR_INVALID_ARGUMENT) {
-        close(fd);
+        ccl::utils::close_fd(fd);
     }
     else if (res != ZE_RESULT_SUCCESS) {
         CCL_THROW("error at zeMemCloseIpcHandle, code: ", to_string(res));
diff --git a/src/sched/entry/ze/cache/ze_cache.hpp b/src/sched/entry/ze/cache/ze_cache.hpp
index 9ff4103c6..7be124b37 100644
--- a/src/sched/entry/ze/cache/ze_cache.hpp
+++ b/src/sched/entry/ze/cache/ze_cache.hpp
@@ -29,6 +29,8 @@ namespace ze {
 class kernel_cache {
 public:
     kernel_cache() = default;
+    kernel_cache(const kernel_cache&) = delete;
+    kernel_cache& operator=(const kernel_cache&) = delete;
     ~kernel_cache();
 
     void clear();
@@ -47,6 +49,8 @@ class kernel_cache {
 class list_cache {
 public:
     list_cache() = default;
+    list_cache(const list_cache&) = delete;
+    list_cache& operator=(const list_cache&) = delete;
     ~list_cache();
 
     void clear();
@@ -71,6 +75,8 @@ class list_cache {
 class queue_cache {
 public:
     queue_cache() = default;
+    queue_cache(const queue_cache&) = delete;
+    queue_cache& operator=(const queue_cache&) = delete;
     ~queue_cache();
 
     void clear();
@@ -100,6 +106,8 @@ class queue_cache {
 class event_pool_cache {
 public:
     event_pool_cache() = default;
+    event_pool_cache(const event_pool_cache&) = delete;
+    event_pool_cache& operator=(const event_pool_cache&) = delete;
     ~event_pool_cache();
 
     void clear();
@@ -124,6 +132,8 @@ struct ipc_handle_desc;
 class module_cache {
 public:
     module_cache() = default;
+    module_cache(const module_cache&) = delete;
+    module_cache& operator=(const module_cache&) = delete;
     ~module_cache();
 
     void clear();
@@ -171,6 +181,8 @@ class mem_handle_cache {
     using value_t = typename std::shared_ptr<const handle_desc>;
 
     mem_handle_cache();
+    mem_handle_cache(const mem_handle_cache& other) = delete;
+    mem_handle_cache& operator=(const mem_handle_cache& other) = delete;
     ~mem_handle_cache();
 
     void clear();
@@ -214,6 +226,8 @@ class ipc_handle_cache {
     using value_t = ze_ipc_mem_handle_t;
 
     ipc_handle_cache() = default;
+    ipc_handle_cache(const ipc_handle_cache&) = delete;
+    ipc_handle_cache& operator=(const ipc_handle_cache&) = delete;
     ~ipc_handle_cache();
 
     void clear();
diff --git a/src/sched/entry/ze/cache/ze_device_cache.cpp b/src/sched/entry/ze/cache/ze_device_cache.cpp
index 84fd57d8c..db56958c1 100644
--- a/src/sched/entry/ze/cache/ze_device_cache.cpp
+++ b/src/sched/entry/ze/cache/ze_device_cache.cpp
@@ -14,12 +14,62 @@
  limitations under the License.
 */
 #include "common/global/global.hpp"
-#include "sched/entry/ze/cache/ze_device_cache.hpp"
 #include "sched/entry/ze/cache/ze_cache.hpp"
+#include "sched/entry/ze/cache/ze_device_cache.hpp"
 
 namespace ccl {
 namespace ze {
 
+static size_t current_allocated_memory = 0;
+static std::unordered_map<void*, size_t> recorded_allocations;
+
+void device_allocate(ze_context_handle_t context,
+                     const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                     size_t bytes,
+                     size_t alignment,
+                     ze_device_handle_t device,
+                     void** pptr) {
+    current_allocated_memory += bytes;
+    LOG_DEBUG("|MEMLOG| Allocating: ",
+              bytes / 1024,
+              "KB. Current memory footprint: ",
+              current_allocated_memory / 1024,
+              "KB");
+
+    ZE_CALL(zeMemAllocDevice, (context, &device_mem_alloc_desc, bytes, alignment, device, pptr));
+    auto [_, inserted] = recorded_allocations.try_emplace(*pptr, bytes);
+
+    if (!inserted) {
+        LOG_WARN(
+            "Could not record device allocation. Memory footprint might not be representing real consumption!");
+    }
+}
+
+void device_free(ze_context_handle_t context, void* ptr) {
+    auto recorded_allocation = recorded_allocations.find(ptr);
+
+    // bytes = ccl::utils::invalid_bytes_value indicates an error in our recorded_allocations map
+    // this could be caused by improper usage of the device memory wrapper
+    size_t bytes = ccl::utils::invalid_bytes_value;
+
+    if (recorded_allocation != recorded_allocations.end()) {
+        bytes = recorded_allocation->second;
+        current_allocated_memory -= bytes;
+        recorded_allocations.erase(recorded_allocation);
+    }
+    else {
+        LOG_WARN(
+            "Could not record device allocation. Memory footprint might not be representing real consumption!");
+    }
+
+    LOG_DEBUG("|MEMLOG| Freeing: ",
+              bytes / 1024,
+              "KB. Current memory footprint: ",
+              current_allocated_memory / 1024,
+              "KB");
+    ZE_CALL(zeMemFree, (context, ptr));
+}
+
 template <class map_t, class... keys_t>
 bool get_from_cache(map_t& cache, typename map_t::mapped_type& object, keys_t... keys) {
     bool success{};
@@ -97,8 +147,7 @@ void plain_device_mem_cache::get(ze_context_handle_t context,
                         bytes,
                         device_mem_alloc_desc.flags,
                         device_mem_alloc_desc.ordinal)) {
-        ZE_CALL(zeMemAllocDevice,
-                (context, &device_mem_alloc_desc, bytes, alignment, device, pptr));
+        device_allocate(context, device_mem_alloc_desc, bytes, alignment, device, pptr);
     }
 }
 
@@ -119,7 +168,7 @@ void plain_device_mem_cache::push(ze_context_handle_t context,
                        bytes,
                        device_mem_alloc_desc.flags,
                        device_mem_alloc_desc.ordinal)) {
-        ZE_CALL(zeMemFree, (context, ptr));
+        device_free(context, ptr);
     }
 }
 
@@ -196,8 +245,7 @@ void chunk_device_mem_cache::get(ze_context_handle_t context,
         }
     }
     else {
-        ZE_CALL(zeMemAllocDevice,
-                (context, &device_mem_alloc_desc, bytes, alignment, device, pptr));
+        device_allocate(context, device_mem_alloc_desc, bytes, alignment, device, pptr);
         LOG_DEBUG("allocated directly: object: ", *pptr);
     }
 }
@@ -218,8 +266,7 @@ void chunk_device_mem_cache::push(ze_context_handle_t context,
     if (global_data::env().enable_ze_cache) {
         // find the corresponding memory chunk and mark the block as free.
         for (auto& chunk : memory_chunks) {
-            if (reinterpret_cast<uintptr_t>(ptr) - reinterpret_cast<uintptr_t>(chunk.base_ptr) <=
-                chunk.size) {
+            if (ptr >= chunk.base_ptr && ptr < (static_cast<char*>(chunk.base_ptr) + chunk.size)) {
                 size_t offset =
                     reinterpret_cast<uintptr_t>(ptr) - reinterpret_cast<uintptr_t>(chunk.base_ptr);
                 size_t block_index = offset / chunk.block_size;
@@ -240,7 +287,7 @@ void chunk_device_mem_cache::push(ze_context_handle_t context,
     }
 
     // if the pointer does not belong to any existing chunk, free it directly.
-    ZE_CALL(zeMemFree, (context, ptr));
+    device_free(context, ptr);
     LOG_DEBUG("freed directly: object: ", ptr);
 }
 
@@ -266,7 +313,7 @@ void chunk_device_mem_cache::evict_chunk(ze_context_handle_t context, Comparison
                          });
 
     if (chunk_it != memory_chunks.end() && !is_chunk_used(*chunk_it)) {
-        ZE_CALL(zeMemFree, (context, chunk_it->base_ptr));
+        device_free(context, chunk_it->base_ptr);
         memory_chunks.erase(chunk_it);
     }
 }
@@ -305,13 +352,108 @@ void chunk_device_mem_cache::allocate_new_chunk(
 
     // allocate the memory chunk and create the memory_chunk structure.
     void* base_ptr;
-    ZE_CALL(zeMemAllocDevice,
-            (context, &device_mem_alloc_desc, chunk_size, alignment, device, &base_ptr));
+    device_allocate(context, device_mem_alloc_desc, chunk_size, alignment, device, &base_ptr);
     memory_chunks.emplace_back(chunk_size, block_size);
     memory_chunks.back().base_ptr = base_ptr;
     memory_chunks.back().used_blocks[0] = true; // mark the first block as used
 }
 
+template <class map_t, class... keys_t>
+bool get_from_mem_dev_cache(size_t requested_size,
+                            map_t& cache,
+                            typename map_t::mapped_type& object,
+                            keys_t... keys) {
+    bool success{};
+
+    for (auto it : cache) {
+        if (std::get<2>(it.first) >= requested_size) {
+            object = it.second;
+            success = true;
+            break;
+        }
+    }
+
+    return success;
+}
+
+template <class map_t, class... keys_t>
+bool push_to_dev_mem_cache(map_t& cache,
+                           const typename map_t::mapped_type& object,
+                           keys_t... keys) {
+    bool success{};
+
+    typename map_t::key_type key(keys...);
+    auto it = cache.find(key);
+    if (it != cache.end()) {
+        LOG_DEBUG("cache already contains object with the same key");
+        CCL_THROW_IF_NOT(it->second != object, "trying to push an object that already exists");
+    }
+    cache[key] = object;
+    success = true;
+    return success;
+}
+
+void device_memory_manager::get_global_ptr(ze_context_handle_t context,
+                                           ze_device_handle_t device,
+                                           const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                                           size_t size_need,
+                                           size_t alignment,
+                                           void** pptr) {
+    CCL_THROW_IF_NOT(context);
+    CCL_THROW_IF_NOT(device);
+
+    if (!get_from_mem_dev_cache(size_need,
+                                cache,
+                                *pptr,
+                                context,
+                                device,
+                                size_need,
+                                device_mem_alloc_desc.flags,
+                                device_mem_alloc_desc.ordinal)) {
+        std::lock_guard<std::mutex> lock(mutex);
+        device_allocate(context, device_mem_alloc_desc, size_need, alignment, device, pptr);
+
+        push_to_dev_mem_cache(cache,
+                              *pptr,
+                              context,
+                              device,
+                              size_need,
+                              device_mem_alloc_desc.flags,
+                              device_mem_alloc_desc.ordinal);
+    }
+}
+
+void device_memory_manager::clear() {
+    // TODO: hangs if memfree is enabled MLSL-2641
+    if (!global_data::env().ze_device_mem_disable_clear) {
+        std::lock_guard<std::mutex> lock(mutex);
+
+        auto compare_size = [](const key_t& a, const key_t& b) {
+            return std::get<2>(a) < std::get<2>(b);
+        };
+
+        // Find the largest element in the cache
+        auto largest_it =
+            std::max_element(cache.begin(), cache.end(), [&](const auto& a, const auto& b) {
+                return compare_size(a.first, b.first);
+            });
+
+        if (largest_it != cache.end()) {
+            // Iterate through the cache, removing all elements except the largest one
+            size_t largest_size = std::get<2>(largest_it->first);
+            for (auto it = cache.begin(); it != cache.end();) {
+                if (std::get<2>(it->first) < largest_size) {
+                    device_free(std::get<0>(it->first), it->second);
+                    it = cache.erase(it);
+                }
+                else {
+                    ++it;
+                }
+            }
+        }
+    }
+}
+
 // cache
 void cache::get(size_t instance_idx,
                 ze_context_handle_t context,
diff --git a/src/sched/entry/ze/cache/ze_device_cache.hpp b/src/sched/entry/ze/cache/ze_device_cache.hpp
index 9a487e5fc..4f01ca22d 100644
--- a/src/sched/entry/ze/cache/ze_device_cache.hpp
+++ b/src/sched/entry/ze/cache/ze_device_cache.hpp
@@ -24,6 +24,15 @@
 namespace ccl {
 namespace ze {
 
+void device_allocate(ze_context_handle_t context,
+                     const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                     size_t bytes,
+                     size_t alignment,
+                     ze_device_handle_t device,
+                     void** pptr);
+
+void device_free(ze_context_handle_t context, void* ptr);
+
 enum class device_cache_policy_mode : int { plain, chunk, none };
 static std::map<device_cache_policy_mode, std::string> device_cache_policy_names = {
     std::make_pair(device_cache_policy_mode::plain, "plain"),
@@ -56,6 +65,8 @@ class device_mem_cache {
 class plain_device_mem_cache : public device_mem_cache {
 public:
     plain_device_mem_cache() = default;
+    plain_device_mem_cache(const plain_device_mem_cache&) = delete;
+    plain_device_mem_cache& operator=(const plain_device_mem_cache&) = delete;
     ~plain_device_mem_cache();
 
     void clear();
@@ -89,6 +100,8 @@ class plain_device_mem_cache : public device_mem_cache {
 class chunk_device_mem_cache : public device_mem_cache {
 public:
     chunk_device_mem_cache() = default;
+    chunk_device_mem_cache(const chunk_device_mem_cache&) = delete;
+    chunk_device_mem_cache& operator=(const chunk_device_mem_cache&) = delete;
     ~chunk_device_mem_cache();
     void clear();
 
@@ -139,5 +152,34 @@ class chunk_device_mem_cache : public device_mem_cache {
     std::mutex mutex;
 };
 
+class device_memory_manager {
+public:
+    device_memory_manager() = default;
+    device_memory_manager(const device_memory_manager&) = delete;
+    device_memory_manager& operator=(const device_memory_manager&) = delete;
+    ~device_memory_manager() {
+        cache.clear();
+    }
+
+    void get_global_ptr(ze_context_handle_t context,
+                        ze_device_handle_t device,
+                        const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                        size_t size_need,
+                        size_t alignment,
+                        void** pptr);
+
+    void clear();
+
+private:
+    using key_t = std::tuple<ze_context_handle_t,
+                             ze_device_handle_t,
+                             size_t,
+                             ze_device_mem_alloc_flags_t,
+                             uint32_t>;
+    using value_t = void*;
+    std::unordered_map<key_t, value_t, utils::tuple_hash> cache;
+    std::mutex mutex;
+};
+
 } // namespace ze
 } // namespace ccl
diff --git a/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp b/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp
index e23e24e78..5f83593aa 100644
--- a/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp
+++ b/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp
@@ -58,6 +58,11 @@ void ze_a2a_allgatherv_entry::init_ze_hook() {
     std::vector<ccl_buffer> peer_recv_bufs(comm->size());
     std::vector<ccl_buffer> pair_peer_recv_bufs(comm->size());
 
+    if (is_monolithic_pipeline &&
+        (dtype == ccl::datatype::bfloat16 || dtype == ccl::datatype::float16)) {
+        ccl::global_data::env().kernel_mem_align = 64;
+    }
+
     for (int i = 0; i < peer_count; ++i) {
         const int peer_rank = (comm_rank + i + 1) % comm->size();
         int peer_global_rank = comm->get_global_rank(peer_rank);
@@ -280,7 +285,9 @@ void ze_a2a_allgatherv_op::select(ze_a2a_allgatherv_op& args, std::vector<ze_ker
         // copy send_buf to my buffer
         void* dst = args.recv_bufs.at(args.comm->rank()).get_ptr();
         if (args.is_monolithic_pipeline) {
-            // TODO: how is this going to work in all cases? what if comm is !world_comm? Then my_global_rank will point to an incorrect rank. Which, at the very least, can get us referencing "recv_bufs[much_larger_than_size]".
+            // TODO: how is this going to work in all cases? what if comm is !world_comm?
+            // Then my_global_rank will point to an incorrect rank.
+            // Which, at the very least, can get us referencing "recv_bufs[much_larger_than_size]".
             const int my_global_rank = args.comm->get_global_rank(args.comm->rank());
             dst = args.recv_bufs.at(my_global_rank).get_ptr();
         }
diff --git a/src/sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.cpp b/src/sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.cpp
index 6ce19c2fe..aee0a7890 100644
--- a/src/sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.cpp
+++ b/src/sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.cpp
@@ -66,6 +66,14 @@ void ze_a2a_pipeline_read_write_entry::init_ze_hook() {
     size_t block_count_last_rank = block_count;
     block_count_last_rank += base_count % even_comm->size();
 
+    // the block stores can't be used for unaligned data.
+    // Address must be aligned to a 128-bit (16-byte) boundary.
+    // When using remote target buffers, each of them is allocated and aligned separately.
+    // However, for the local tmp buffer, one buffer cut into several blocks of contiguous data is required.
+    // Therefore it is hard to align each block inside the buffer
+    // and ensure correctness for the subsequent entries.
+    int can_use_block = attrs.use_remote_target;
+
     if (!attrs.use_continous_data) {
         CCL_THROW_IF_NOT(block_count_last_rank == block_count,
                          "block_count : ",
@@ -122,9 +130,8 @@ void ze_a2a_pipeline_read_write_entry::init_ze_hook() {
         }
     }
 
-    ze_kernel_args_t kernel_args{
-        local_send_bufs, mdfi_bufs, tmp_buf_ptrs, &block_count, &block_count_last_rank
-    };
+    ze_kernel_args_t kernel_args{ local_send_bufs,        mdfi_bufs,     tmp_buf_ptrs, &block_count,
+                                  &block_count_last_rank, &can_use_block };
 
     ze_kernel kernel(module, kernel_name, kernel_args, block_count, worker_idx);
     ZE_APPEND_CALL(ze_cmd_launch_kernel,
@@ -244,10 +251,68 @@ void alloc_tmp_bufs(ccl_sched* sched,
     if (even_comm->rank() == even_comm->size() - 1) {
         block_count += base_count % even_comm->size();
     }
-    for (int idx = 0; idx < even_comm->size(); idx++) {
-        ccl::alloc_param alloc_tmp_param(
-            block_count * dtype.size(), ccl::buffer_type::ze, ccl::buffer_place::device);
-        tmp_bufs[idx] = sched->alloc_buffer(alloc_tmp_param);
+
+    //TODO: handle when pipe_chunk_count is added for other collectives
+    //TODO: refactor it to only go through this path when necessary
+    if (global_data::env().ze_device_mem_enable &&
+        ccl::global_data::env().allreduce_pipe_chunk_count <= 0 &&
+        ccl::global_data::env().reduce_pipe_chunk_count <= 0 &&
+        ccl::global_data::env().reduce_scatter_pipe_chunk_count <= 0 &&
+        ccl::global_data::env().allgatherv_pipe_chunk_count <= 0 &&
+        ccl::is_queue_in_order(sched->coll_param.stream)) {
+        tmp_bufs.resize(even_comm->size());
+        tmp_buf_idx_start = in_buffers.size();
+        in_buffers.reserve(tmp_buf_idx_start + tmp_bufs.size());
+
+        size_t sub_buffer_size_in_bytes = block_count * dtype.size();
+
+        size_t alignment_size = 128;
+        size_t sub_buffer_aligned_size =
+            alignment_size * ((sub_buffer_size_in_bytes + alignment_size - 1) / alignment_size);
+
+        size_t precalculated_size_in_bytes = even_comm->size() * sub_buffer_aligned_size;
+        size_t total_required_size_in_bytes = even_comm->size() * sub_buffer_aligned_size;
+
+        if (global_data::env().ze_device_mem_alloc_size != 0 &&
+            static_cast<long>(total_required_size_in_bytes) <=
+                global_data::env().ze_device_mem_alloc_size) {
+            LOG_DEBUG("precalculated_size: ",
+                      global_data::env().ze_device_mem_alloc_size,
+                      "(bytes), total_required_size: ",
+                      total_required_size_in_bytes,
+                      "(bytes) can be fullfilled");
+            precalculated_size_in_bytes = global_data::env().ze_device_mem_alloc_size;
+        }
+        else if (global_data::env().ze_device_mem_alloc_size != 0) {
+            LOG_WARN("precalculated_size: ",
+                     global_data::env().ze_device_mem_alloc_size,
+                     "(bytes), total_required_size: ",
+                     total_required_size_in_bytes,
+                     "(bytes) can not be fullfilled");
+        }
+
+        void* global_ptr = nullptr;
+        ccl::global_data::get().ze_data->dev_memory_manager->get_global_ptr(
+            sched->coll_param.stream->get_ze_context(),
+            sched->coll_param.stream->get_ze_device(),
+            ze::default_device_mem_alloc_desc,
+            precalculated_size_in_bytes,
+            0,
+            &global_ptr);
+        CCL_THROW_IF_NOT(global_ptr, "main ptr for temp buffers is invalid");
+
+        for (int idx = 0; idx < even_comm->size(); idx++) {
+            void* sub_buffer_ptr = (char*)global_ptr + (idx * sub_buffer_aligned_size);
+            tmp_bufs[idx] = ccl_buffer(sub_buffer_ptr, sub_buffer_aligned_size);
+            in_buffers.push_back({ tmp_bufs[idx].get_ptr(), ccl::ze::ipc_mem_type::memory });
+        }
+    }
+    else {
+        for (int idx = 0; idx < even_comm->size(); idx++) {
+            ccl::alloc_param alloc_tmp_param(
+                block_count * dtype.size(), ccl::buffer_type::ze, ccl::buffer_place::device);
+            tmp_bufs[idx] = sched->alloc_buffer(alloc_tmp_param);
+        }
     }
     tmp_buf_idx_start = in_buffers.size();
     in_buffers.reserve(tmp_buf_idx_start + tmp_bufs.size());
diff --git a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp
index c8ca8cdd9..12c434622 100644
--- a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp
+++ b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp
@@ -48,6 +48,7 @@ ze_a2a_reduce_scatter_entry::ze_a2a_reduce_scatter_entry(
 void ze_a2a_reduce_scatter_entry::kernel_init(size_t rank_buf_offset,
                                               size_t block_count,
                                               void* send_buf,
+                                              void* output_buf,
                                               void* base_ptr,
                                               const std::vector<ccl_buffer>& peer_send_bufs,
                                               int peer_count,
@@ -104,12 +105,12 @@ void ze_a2a_reduce_scatter_entry::kernel_init(size_t rank_buf_offset,
                                  (rank_buf_offset + peer_buf_offset) * dtype.size() + offsets[i];
                 peer_bufs.push_back(peer_buf);
             }
-            void* output_buf = static_cast<char*>(base_ptr) + offsets[i];
+            void* output_ptr = static_cast<char*>(output_buf) + offsets[i];
 
             ze_kernel_args_t kernel_args{ &count_local,
                                           &input_buf,
                                           peer_bufs, //peer_bufs_ze_arg,
-                                          &output_buf };
+                                          &output_ptr };
 
             kernels.emplace_back(
                 module, monolithic_kernel_name, kernel_args, count_local, worker_idx);
@@ -129,9 +130,9 @@ void ze_a2a_reduce_scatter_entry::kernel_init(size_t rank_buf_offset,
         LOG_DEBUG("get kernel_name: ", kernel_name);
         // reduce peer values in tmp_buf and own values in send_buf into tmp_buf
         kernels.reserve(1);
-        void* input_buf = static_cast<char*>(send_buf) + rank_buf_offset * dtype.size();
-        void* inoutput_buf = base_ptr;
-        ze_kernel_args_t kernel_args{ &count, &peer_count, &input_buf, &inoutput_buf };
+        void* input_buf1 = static_cast<char*>(send_buf) + rank_buf_offset * dtype.size();
+        void* input_buf2 = base_ptr;
+        ze_kernel_args_t kernel_args{ &count, &peer_count, &input_buf1, &input_buf2, &output_buf };
         kernels.emplace_back(module, kernel_name, kernel_args, count, worker_idx);
     }
     else {
@@ -159,6 +160,7 @@ void ze_a2a_reduce_scatter_entry::kernel_init(size_t rank_buf_offset,
 void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry,
                                             void* send_buf,
                                             void* output_buf,
+                                            void* tmp_buf,
                                             const std::vector<ccl_buffer>& peer_send_bufs,
                                             int peer_count,
                                             int comm_rank,
@@ -184,6 +186,7 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry,
                 block_count,
                 send_buf,
                 output_buf,
+                tmp_buf,
                 peer_send_bufs,
                 peer_count,
                 dtype,
@@ -230,7 +233,7 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry,
         for (int i = 0; i < peer_count; i++) {
             void* src = static_cast<char*>(peer_send_bufs[i].get_ptr()) +
                         (rank_buf_offset + peer_buf_offset) * dtype.size();
-            void* dst = static_cast<char*>(output_buf) + i * copy_bytes;
+            void* dst = static_cast<char*>(tmp_buf) + i * copy_bytes;
             // TODO: if we are on the same device, then use t2t direction
             auto list = entry->get_copy_list(copy_direction::c2c, i);
             ZE_APPEND_CALL_TO_ENTRY(entry,
@@ -246,6 +249,11 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry,
         ZE_APPEND_CALL_TO_ENTRY(
             entry, ze_cmd_barrier, entry->get_comp_list(), barrier_event, copy_events);
 
+        // when output_buf == tmp_buf, then fill_list is invoked from allreduce_entry
+        // and in that case we cannot signal the entry since allreduce_entry has an
+        // allgatherv to finish after this reduce_scatter
+        const bool is_signal_entry = (output_buf != tmp_buf) && is_single_kernel;
+
         /* reduce stage */
         for (size_t i = 0; i < kernels.size(); ++i) {
             ZE_APPEND_CALL_TO_ENTRY(
@@ -253,7 +261,7 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry,
                 ze_cmd_launch_kernel,
                 entry->get_comp_list(),
                 std::move(kernels[i]),
-                kernel_events.at(i),
+                is_signal_entry ? entry->entry_event : kernel_events.at(i),
                 ze_events_t({ (i == 0) ? barrier_event : kernel_events.at(i - 1) }));
             // TODO: Can we parallelize by only waiting on barrier_event?
         }
@@ -263,6 +271,7 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry,
 void ze_a2a_reduce_scatter_entry::init_ze_hook() {
     /* get peer buffers */
     bool is_monolithic = ccl::global_data::env().reduce_scatter_monolithic_kernel;
+    bool is_single_kernel = ccl::global_data::env().enable_kernel_single_reduce_peers;
     std::vector<ccl_buffer> peer_send_bufs(peer_count);
 
     for (int i = 0; i < peer_count; ++i) {
@@ -283,13 +292,14 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() {
         }
         return;
     }
+    void* output_buf = recv_buf.get_ptr();
+    void* tmp_buf;
     ccl::alloc_param alloc_param(tmp_buf_bytes, buffer_type::ze, buffer_place::device);
-    void* output_buf;
     if (is_monolithic && peer_count <= (int)ccl::ze::max_peer_count) {
-        output_buf = recv_buf.get_ptr();
+        tmp_buf = nullptr;
     }
     else {
-        output_buf = sched->alloc_buffer(alloc_param).get_ptr();
+        tmp_buf = sched->alloc_buffer(alloc_param).get_ptr();
     }
 
     LOG_DEBUG("rank ",
@@ -313,7 +323,7 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() {
         // leftover kernel and aligned kernel
         kernel_events.resize((int)ccl::utils::align_kernels::count);
     }
-    else if (ccl::global_data::env().enable_kernel_single_reduce_peers) {
+    else if (is_single_kernel) {
         // when kernel merge is used only one kernel is required
         kernel_events.resize(1);
     }
@@ -332,6 +342,7 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() {
     fill_list(this,
               send_buf.get_ptr(),
               output_buf,
+              tmp_buf,
               peer_send_bufs,
               peer_count,
               comm_rank,
@@ -350,11 +361,11 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() {
               worker_idx,
               peer_buf_offset,
               is_monolithic,
-              ccl::global_data::env().enable_kernel_single_reduce_peers);
+              is_single_kernel);
 
-    if (!(is_monolithic && peer_count <= (int)ccl::ze::max_peer_count)) {
-        // in case of non-monolithic use case, we do the copy
-        // from tmp buf to recv buf
+    if (!(is_monolithic && peer_count <= (int)ccl::ze::max_peer_count) && !is_single_kernel) {
+        // in case of non-monolithic and non-single kernel use case,
+        // we do the copy from tmp buf to recv buf
         ZE_APPEND_CALL(ze_cmd_memory_copy,
                        ze_base_entry::get_copy_list(),
                        recv_buf.get_ptr(),
@@ -363,7 +374,9 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() {
                        ze_base_entry::entry_event,
                        kernel_events);
     }
-    else {
+    else if (is_monolithic && peer_count <= (int)ccl::ze::max_peer_count) {
+        // in case of monolithic kernel, use a barrier to combine the
+        // events from unaligned and aligned kernels
         CCL_THROW_IF_NOT(kernel_events.size() == (int)ccl::utils::align_kernels::count,
                          "unexpected kernel events size: ",
                          kernel_events.size());
@@ -372,6 +385,13 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() {
                        ze_base_entry::entry_event,
                        kernel_events);
     }
+    else {
+        CCL_THROW_IF_NOT(kernel_events.size() == 1 && is_single_kernel,
+                         "single kernel event expected, size: ",
+                         kernel_events.size(),
+                         " single kernel mode expected, mode: ",
+                         is_single_kernel);
+    }
 }
 
 void ze_a2a_reduce_scatter_entry::update() {
@@ -541,11 +561,12 @@ void ze_a2a_reduce_scatter_write_kernel_entry::kernel_init(size_t rank_buf_offse
         LOG_DEBUG("get kernel name: ", kernel_name);
         // reduce peer values in tmp_buf and own values in send_buf into tmp_buf
         kernels.reserve(1);
-        void* input_buf = static_cast<char*>(rs_bufs.send_buf.get_ptr()) +
-                          (rank_buf_offset + rs_bufs.send_buf_offset) * rs_args.dtype.size();
-        void* inoutput_buf = rs_bufs.tmp_write_buf.get_ptr();
+        void* input_buf1 = static_cast<char*>(rs_bufs.send_buf.get_ptr()) +
+                           (rank_buf_offset + rs_bufs.send_buf_offset) * rs_args.dtype.size();
+        void* input_buf2 = rs_bufs.tmp_write_buf.get_ptr();
+        void* output_buf = rs_bufs.recv_buf.get_ptr();
 
-        ze_kernel_args_t kernel_args{ &count, &peer_count, &input_buf, &inoutput_buf };
+        ze_kernel_args_t kernel_args{ &count, &peer_count, &input_buf1, &input_buf2, &output_buf };
         kernels.emplace_back(module, kernel_name, kernel_args, count, worker_idx);
     }
     else {
@@ -605,13 +626,14 @@ void ze_a2a_reduce_scatter_write_kernel_entry::fill_list_kernel(
                                 ze_cmd_launch_kernel,
                                 entry->get_comp_list(),
                                 std::move(kernels[i]),
-                                kernel_events.at(i),
+                                is_single_kernel ? entry->entry_event : kernel_events.at(i),
                                 (i == 0) ? wait_events : ze_events_t({ kernel_events.at(i - 1) }));
     }
 }
 
 void ze_a2a_reduce_scatter_write_kernel_entry::init_ze_hook() {
     size_t buf_bytes = rs_args.dtype.size() * rs_args.recv_counts[comm_rank];
+    bool is_single_kernel = ccl::global_data::env().enable_kernel_single_reduce_peers;
 
     if (!buf_bytes) {
         ZE_APPEND_CALL(ze_cmd_barrier,
@@ -621,7 +643,7 @@ void ze_a2a_reduce_scatter_write_kernel_entry::init_ze_hook() {
         return;
     }
 
-    if (ccl::global_data::env().enable_kernel_single_reduce_peers) {
+    if (is_single_kernel) {
         kernel_events.resize(1);
     }
     else {
@@ -650,16 +672,21 @@ void ze_a2a_reduce_scatter_write_kernel_entry::init_ze_hook() {
                      device,
                      context,
                      worker_idx,
-                     ccl::global_data::env().enable_kernel_single_reduce_peers,
+                     is_single_kernel,
                      wait_events);
 
-    ZE_APPEND_CALL(ze_cmd_memory_copy,
-                   ze_base_entry::get_copy_list(),
-                   rs_bufs.recv_buf.get_ptr(),
-                   rs_bufs.tmp_write_buf.get_ptr(),
-                   buf_bytes,
-                   ze_base_entry::entry_event,
-                   kernel_events);
+    // single_kernel mode directly signals the entry,
+    // otherwise use a barrier that depends on
+    // all the kernels to signal the entry
+    if (!is_single_kernel) {
+        ZE_APPEND_CALL(ze_cmd_memory_copy,
+                       ze_base_entry::get_copy_list(),
+                       rs_bufs.recv_buf.get_ptr(),
+                       rs_bufs.tmp_write_buf.get_ptr(),
+                       buf_bytes,
+                       ze_base_entry::entry_event,
+                       kernel_events);
+    }
 }
 
 void ze_a2a_reduce_scatter_write_kernel_entry::update() {
diff --git a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp
index 997c4e4da..050cf3bc4 100644
--- a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp
+++ b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp
@@ -21,7 +21,7 @@
 typedef struct {
     ccl_comm* comm;
     const std::vector<size_t> recv_counts;
-    const ccl_datatype& dtype;
+    const ccl_datatype dtype;
     ccl::reduction op;
 } reduce_scatter_args;
 
@@ -64,6 +64,7 @@ class ze_a2a_reduce_scatter_entry : public ze_base_entry {
     static void fill_list(const ze_base_entry* entry,
                           void* send_buf,
                           void* recv_buf,
+                          void* tmp_buf,
                           const std::vector<ccl_buffer>& peer_send_bufs,
                           int peer_count,
                           int comm_rank,
@@ -105,6 +106,7 @@ class ze_a2a_reduce_scatter_entry : public ze_base_entry {
     static void kernel_init(size_t rank_buf_offset,
                             size_t block_count,
                             void* send_buf,
+                            void* recv_buf,
                             void* base_ptr,
                             const std::vector<ccl_buffer>& peer_send_bufs,
                             int peer_count,
diff --git a/src/sched/entry/ze/ze_barrier_entry.hpp b/src/sched/entry/ze/ze_barrier_entry.hpp
index f9cac1a6a..ed3213a72 100644
--- a/src/sched/entry/ze/ze_barrier_entry.hpp
+++ b/src/sched/entry/ze/ze_barrier_entry.hpp
@@ -27,6 +27,8 @@ class ze_barrier_entry : public sched_entry {
         return class_name();
     }
 
+    ze_barrier_entry(const ze_barrier_entry&) = delete;
+    ze_barrier_entry& operator=(const ze_barrier_entry&) = delete;
     ze_barrier_entry() = delete;
     explicit ze_barrier_entry(ccl_sched* sched,
                               ccl_comm* comm,
diff --git a/src/sched/entry/ze/ze_base_entry.cpp b/src/sched/entry/ze/ze_base_entry.cpp
index c9d43253f..31b2a86e3 100644
--- a/src/sched/entry/ze/ze_base_entry.cpp
+++ b/src/sched/entry/ze/ze_base_entry.cpp
@@ -224,8 +224,10 @@ ze_command_list_handle_t ze_base_entry::get_comp_list(uint32_t index) const {
 }
 
 ze_command_list_handle_t ze_base_entry::get_copy_list(copy_direction direction,
-                                                      uint32_t index) const {
-    return sched->get_memory().list_manager->get_copy_list(this, wait_events, direction, index);
+                                                      uint32_t index,
+                                                      queue_group_type force_queue_type) const {
+    return sched->get_memory().list_manager->get_copy_list(
+        this, wait_events, direction, index, force_queue_type);
 }
 
 ze_event_handle_t ze_base_entry::create_event(ze_event_pool_handle_t event_pool,
diff --git a/src/sched/entry/ze/ze_base_entry.hpp b/src/sched/entry/ze/ze_base_entry.hpp
index 2b4255b8d..d545aa0d1 100644
--- a/src/sched/entry/ze/ze_base_entry.hpp
+++ b/src/sched/entry/ze/ze_base_entry.hpp
@@ -29,6 +29,7 @@ class ze_base_entry : public sched_entry {
 public:
     ze_base_entry() = delete;
     ze_base_entry(const ze_base_entry &) = delete;
+    ze_base_entry &operator=(const ze_base_entry &) = delete;
     virtual ~ze_base_entry();
 
     static ze_event_handle_t create_event(ze_event_pool_handle_t event_pool,
@@ -39,8 +40,10 @@ class ze_base_entry : public sched_entry {
     virtual void update() override;
 
     ze_command_list_handle_t get_comp_list(uint32_t index = 0) const;
-    ze_command_list_handle_t get_copy_list(copy_direction direction = copy_direction::d2d,
-                                           uint32_t index = 0) const;
+    ze_command_list_handle_t get_copy_list(
+        copy_direction direction = copy_direction::d2d,
+        uint32_t index = 0,
+        queue_group_type force_queue_type = queue_group_type::unknown) const;
 
     ze_event_handle_t entry_event{};
 
diff --git a/src/sched/entry/ze/ze_call.hpp b/src/sched/entry/ze/ze_call.hpp
index e324fcfa6..bc82da184 100644
--- a/src/sched/entry/ze/ze_call.hpp
+++ b/src/sched/entry/ze/ze_call.hpp
@@ -31,6 +31,8 @@ class ze_call {
     };
 
     ze_call();
+    ze_call(const ze_call&) = delete;
+    ze_call& operator=(const ze_call&) = delete;
     ~ze_call();
     ze_result_t do_call(ze_result_t ze_result, const char* ze_name) const;
 
diff --git a/src/sched/entry/ze/ze_copy_entry.cpp b/src/sched/entry/ze/ze_copy_entry.cpp
index 4d18c8d26..faba19243 100644
--- a/src/sched/entry/ze/ze_copy_entry.cpp
+++ b/src/sched/entry/ze/ze_copy_entry.cpp
@@ -61,7 +61,7 @@ void ze_copy_entry::init_ze_hook() {
     void* src = static_cast<char*>(in_buf.get_ptr()) + attr.in_buf_offset * dtype.size();
 
     ze_command_list_handle_t list =
-        ze_base_entry::get_copy_list(attr.direction, attr.hint_queue_index);
+        ze_base_entry::get_copy_list(attr.direction, attr.hint_queue_index, attr.force_queue_type);
 
     ZE_APPEND_CALL(ze_cmd_memory_copy,
                    list,
diff --git a/src/sched/entry/ze/ze_handle_exchange_entry.cpp b/src/sched/entry/ze/ze_handle_exchange_entry.cpp
index 67c13c259..76535edb1 100644
--- a/src/sched/entry/ze/ze_handle_exchange_entry.cpp
+++ b/src/sched/entry/ze/ze_handle_exchange_entry.cpp
@@ -68,13 +68,10 @@ ze_handle_exchange_entry::~ze_handle_exchange_entry() {
     if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets) {
         close_sockets();
         unlink_sockets();
-    }
-
-    if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) {
-        for (auto& fd : opened_pidfds) {
-            close(fd);
+        for (int fd : opened_sockets_fds) {
+            ccl::utils::close_fd(fd);
         }
-        opened_pidfds.clear();
+        opened_sockets_fds.clear();
     }
 }
 
@@ -138,8 +135,7 @@ void ze_handle_exchange_entry::create_local_ipc_handles(const std::vector<mem_de
                 sched->get_memory().handle_manager.get_handle(mem_info.first, &ipc_handle);
 
                 if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) {
-                    physical_devices =
-                        ccl::global_data::get().ze_data->fd_manager->get_physical_devices();
+                    physical_devices = comm->get_fd_manager()->get_physical_devices();
                     mem_handle = ipc_to_mem_handle(
                         ipc_handle,
                         ccl::ze::get_device_id(sched->coll_param.stream->get_ze_device()));
@@ -250,7 +246,6 @@ void ze_handle_exchange_entry::fill_remote_handle(const payload_t& payload,
     handles[idx][buf_idx].remote_context_id = payload.remote_context_id;
     handles[idx][buf_idx].remote_pid = payload.remote_pid;
     handles[idx][buf_idx].remote_device_id = payload.remote_device_id;
-    handles[idx][buf_idx].pidfd_fd = payload.pidfd_fd;
     handles[idx][buf_idx].device_fd = payload.device_fd;
     LOG_DEBUG("get IPC handle: { peer: ",
               idx,
@@ -286,36 +281,37 @@ int ze_handle_exchange_entry::get_remote_physical_device_fd(const ssize_t remote
 }
 
 void ze_handle_exchange_entry::common_fd_mode_exchange(const std::vector<mem_desc_t>& bufs) {
+    std::vector<payload_t> all_payloads(comm_size * bufs.size());
+    std::vector<payload_t> local_payloads(bufs.size());
+
     for (size_t buf_idx = 0; buf_idx < bufs.size(); buf_idx++) {
-        std::vector<payload_t> payloads(comm_size);
         payload_t payload{};
         fill_payload(payload, bufs, buf_idx);
+        local_payloads[buf_idx] = payload;
+    }
 
-        if (!(ccl::utils::allgather(
-                comm->get_atl_comm(), &payload, payloads.data(), sizeof(payload_t)))) {
-            CCL_THROW("allgather exchange is failed");
-        }
-
-        for (size_t idx = 0; idx < payloads.size(); idx++) {
-            if (comm->rank() == (int)idx) {
-                continue;
-            }
+    if (!(ccl::utils::allgather(comm->get_atl_comm(),
+                                local_payloads.data(),
+                                all_payloads.data(),
+                                sizeof(payload_t) * bufs.size()))) {
+        CCL_THROW("allgather exchange failed");
+    }
 
+    for (int rank_idx = 0; rank_idx < comm_size; rank_idx++) {
+        if (comm->rank() == rank_idx) {
+            continue;
+        }
+        for (size_t buf_idx = 0; buf_idx < bufs.size(); buf_idx++) {
+            int payload_idx = rank_idx * bufs.size() + buf_idx;
             if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) {
-                payloads[idx].device_fd =
-                    get_remote_physical_device_fd(payloads[idx].remote_device_id);
-            }
-            else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) {
-                opened_pidfds.push_back(ccl::ze::fd_manager::pidfd_open(payloads[idx].remote_pid));
-                payloads[idx].pidfd_fd = opened_pidfds.back();
-            }
-            else {
-                CCL_THROW("unexpected ipc_exchange_mode");
+                all_payloads[payload_idx].device_fd =
+                    get_remote_physical_device_fd(all_payloads[payload_idx].remote_device_id);
             }
+
             fill_remote_handle(
-                payloads[idx],
-                {}, // ipc_handle is empty, it's initialized immeadiately before calling zeMemOpenIpcHandle
-                idx,
+                all_payloads[payload_idx],
+                {}, // ipc_handle is empty, it's initialized immediately before calling zeMemOpenIpcHandle
+                rank_idx,
                 buf_idx);
         }
     }
@@ -375,15 +371,7 @@ void ze_handle_exchange_entry::pt2pt_fd_mode_exchange(const std::vector<mem_desc
                     payloads[idx].device_fd =
                         get_remote_physical_device_fd(payloads[idx].remote_device_id);
                 }
-                else if (ccl::global_data::env().ze_ipc_exchange ==
-                         ccl::ze::ipc_exchange_mode::pidfd) {
-                    opened_pidfds.push_back(
-                        ccl::ze::fd_manager::pidfd_open(payloads[idx].remote_pid));
-                    payloads[idx].pidfd_fd = opened_pidfds.back();
-                }
-                else {
-                    CCL_THROW("unexpected ipc_exchange_mode for pt2pt");
-                }
+
                 fill_remote_handle(
                     payloads[idx],
                     {}, // ipc_handle is empty, it's initialized immeadiately before calling zeMemOpenIpcHandle
@@ -490,6 +478,8 @@ int ze_handle_exchange_entry::sockets_mode_exchange(const std::vector<mem_desc_t
                     ccl::utils::sendmsg_call(
                         right_peer_socket, recv_fd, &payload, sizeof(payload), rank);
                 }
+                opened_sockets_fds.push_back(recv_fd);
+
                 start_peer_idx++;
             }
             else if (poll_fds[0].revents & POLLERR) {
@@ -683,9 +673,9 @@ void ze_handle_exchange_entry::close_sockets() {
     // the socket during entry's destruction, to make sure we don't have any
     // open sockets left.
     if (!sockets_closed) {
-        close(left_peer_connect_socket);
-        close(left_peer_socket);
-        close(right_peer_socket);
+        ccl::utils::close_fd(left_peer_connect_socket);
+        ccl::utils::close_fd(left_peer_socket);
+        ccl::utils::close_fd(right_peer_socket);
         sockets_closed = true;
     }
 }
diff --git a/src/sched/entry/ze/ze_handle_exchange_entry.hpp b/src/sched/entry/ze/ze_handle_exchange_entry.hpp
index 2512d1c90..6e0b638d1 100644
--- a/src/sched/entry/ze/ze_handle_exchange_entry.hpp
+++ b/src/sched/entry/ze/ze_handle_exchange_entry.hpp
@@ -98,7 +98,6 @@ class ze_handle_exchange_entry : public sched_entry {
         uint64_t remote_mem_alloc_id{};
         ssize_t remote_context_id{ ccl::utils::invalid_context_id };
         ssize_t remote_device_id{ ccl::utils::invalid_device_id };
-        int pidfd_fd{ ccl::utils::invalid_fd };
         int device_fd{ ccl::utils::invalid_fd };
     };
 
@@ -142,7 +141,7 @@ class ze_handle_exchange_entry : public sched_entry {
     mem_info_t get_mem_info(const void* ptr);
 
     bool sockets_closed = false;
-    std::vector<int> opened_pidfds;
+    std::vector<int> opened_sockets_fds;
 
     void unlink_sockets();
     void close_sockets();
diff --git a/src/sched/entry/ze/ze_kernel.hpp b/src/sched/entry/ze/ze_kernel.hpp
index 4c0a91dbd..5489548bc 100644
--- a/src/sched/entry/ze/ze_kernel.hpp
+++ b/src/sched/entry/ze/ze_kernel.hpp
@@ -39,6 +39,8 @@ class ze_kernel {
 
     ze_kernel(const ze_kernel &) = delete;
     ze_kernel(ze_kernel &&other) noexcept;
+    ze_kernel &operator=(const ze_kernel &) = delete;
+    ze_kernel &operator=(ze_kernel &&other) = delete;
     ~ze_kernel() noexcept;
 
 private:
diff --git a/src/sched/entry/ze/ze_primitives.cpp b/src/sched/entry/ze/ze_primitives.cpp
index 2a4275b5d..6b4b47fa1 100644
--- a/src/sched/entry/ze/ze_primitives.cpp
+++ b/src/sched/entry/ze/ze_primitives.cpp
@@ -15,10 +15,12 @@
 */
 #include <algorithm>
 #include <fstream>
+#include <unordered_map>
 
 #include "common/global/global.hpp"
 #include "common/log/log.hpp"
 #include "common/utils/utils.hpp"
+#include "common/utils/version.hpp"
 #include "sched/entry/ze/ze_primitives.hpp"
 
 namespace ccl {
@@ -38,6 +40,11 @@ std::map<h2d_copy_engine_mode, std::string> h2d_copy_engine_names = {
     std::make_pair(h2d_copy_engine_mode::auto_mode, "auto")
 };
 
+std::map<d2d_copy_engine_mode, std::string> d2d_copy_engine_names = {
+    std::make_pair(d2d_copy_engine_mode::none, "none"),
+    std::make_pair(d2d_copy_engine_mode::main, "main"),
+};
+
 std::string get_build_log_string(ze_module_build_log_handle_t build_log) {
     size_t log_size{};
     ZE_CALL(zeModuleBuildLogGetString, (build_log, &log_size, nullptr));
@@ -52,38 +59,91 @@ std::string get_build_log_string(ze_module_build_log_handle_t build_log) {
     return log;
 }
 
+static void load_file(std::ifstream& file, std::vector<uint8_t>& bytes) {
+    file.seekg(0, file.end);
+    size_t file_size = file.tellg();
+    file.seekg(0, file.beg);
+
+    bytes.resize(file_size);
+    file.read(reinterpret_cast<char*>(bytes.data()), file_size);
+}
+
 void load_module(const std::string& file_path,
                  ze_device_handle_t device,
                  ze_context_handle_t context,
                  ze_module_handle_t* module) {
-    LOG_DEBUG("module loading started: file: ", file_path);
-    CCL_THROW_IF_NOT(!file_path.empty(), "no file");
+    bool compiling_spirv_module = false;
 
-    std::ifstream file(file_path, std::ios_base::in | std::ios_base::binary);
-    CCL_THROW_IF_NOT(file.good(), "failed to load module: file: ", file_path);
+    ze_module_build_log_handle_t build_log{};
+    ze_module_format_t format{};
+    std::vector<uint8_t> module_data{};
 
-    file.seekg(0, file.end);
-    size_t filesize = file.tellg();
-    file.seekg(0, file.beg);
+    // Prepare name for cached module in format:
+    // /tmp/ccl-module-cache-{UID}-{CCL version hash}
+    size_t version_hash = std::hash<std::string>{}(utils::get_library_version().full);
 
-    std::vector<uint8_t> module_data(filesize);
-    file.read(reinterpret_cast<char*>(module_data.data()), filesize);
-    file.close();
+    std::stringstream ss;
+    ss << "/tmp/ccl-module-cache-" << getuid() << "-" << std::hex << version_hash;
+    const std::string cached_module_path = ss.str();
 
-    ze_module_build_log_handle_t build_log{};
-    ze_module_desc_t desc{};
-    ze_module_format_t format = ZE_MODULE_FORMAT_IL_SPIRV;
+    std::ifstream cached_module_file(cached_module_path, std::ios_base::in | std::ios_base::binary);
+
+    if (cached_module_file.good() && ccl::global_data::env().kernel_module_cache) {
+        LOG_DEBUG("|MODULE CACHE| Using cached module at: ", cached_module_path);
+
+        load_file(cached_module_file, module_data);
+        cached_module_file.close();
+
+        format = ZE_MODULE_FORMAT_NATIVE;
+        compiling_spirv_module = false;
+    }
+    else {
+        LOG_DEBUG("|MODULE CACHE| SPIR-V module loading started, file: ", file_path);
+        CCL_THROW_IF_NOT(!file_path.empty(), "incorrect path to SPIR-V module.");
+
+        std::ifstream spirv_module_file(file_path, std::ios_base::in | std::ios_base::binary);
+        CCL_THROW_IF_NOT(spirv_module_file.good(),
+                         "failed to load file containing oneCCL SPIR-V kernels, file: ",
+                         file_path);
+
+        load_file(spirv_module_file, module_data);
+        spirv_module_file.close();
+
+        format = ZE_MODULE_FORMAT_IL_SPIRV;
+        compiling_spirv_module = true;
+    }
+
+    ze_module_desc_t desc{ default_module_desc };
     desc.format = format;
-    desc.pInputModule = reinterpret_cast<const uint8_t*>(module_data.data());
     desc.inputSize = module_data.size();
+    desc.pInputModule = reinterpret_cast<const uint8_t*>(module_data.data());
 
-    if (ZE_CALL(zeModuleCreate, (context, device, &desc, module, &build_log)) !=
-        ZE_RESULT_SUCCESS) {
-        CCL_THROW(
-            "failed to create module: ", file_path, ", log: ", get_build_log_string(build_log));
+    try {
+        ZE_CALL(zeModuleCreate, (context, device, &desc, module, &build_log));
     }
-    else {
-        LOG_DEBUG("module loading completed: directory: file: ", file_path);
+    catch (std::string& error_message) {
+        CCL_THROW("failed to create module: ",
+                  file_path,
+                  "error message: ",
+                  error_message,
+                  ", log: ",
+                  get_build_log_string(build_log));
+    }
+
+    if (compiling_spirv_module && ccl::global_data::env().kernel_module_cache) {
+        // Compiled SPIR-V to device-specific ISA; cache the native binary so later runs skip compilation. NOTE(review): /tmp is world-writable — confirm the UID suffix gives sufficient isolation.
+        LOG_DEBUG("|MODULE CACHE| Caching compiled module to: ", cached_module_path);
+        std::ofstream cached_module_file_new(cached_module_path,
+                                             std::ios_base::out | std::ios_base::binary);
+
+        size_t binary_size = 0;
+        ZE_CALL(zeModuleGetNativeBinary, (*module, &binary_size, nullptr));
+
+        std::vector<uint8_t> compiled_module_data(binary_size);
+        ZE_CALL(zeModuleGetNativeBinary, (*module, &binary_size, compiled_module_data.data()));
+
+        cached_module_file_new.write(reinterpret_cast<char*>(compiled_module_data.data()),
+                                     binary_size);
     }
 
     ZE_CALL(zeModuleBuildLogDestroy, (build_log));
@@ -273,7 +333,7 @@ int get_fd_from_handle(const ze_ipc_mem_handle_t& handle) {
 
 void close_handle_fd(const ze_ipc_mem_handle_t& handle) {
     int fd = get_fd_from_handle(handle);
-    close(fd);
+    ccl::utils::close_fd(fd);
 }
 
 ze_ipc_mem_handle_t get_handle_from_fd(int fd) {
@@ -374,11 +434,11 @@ bool is_same_fabric_port(const zes_fabric_port_id_t& port1, const zes_fabric_por
     if (!(port1.fabricId == port2.fabricId && port1.attachId == port2.attachId &&
           port1.portNumber == port2.portNumber)) {
         result = false;
-        LOG_DEBUG("fabric ports are not the same:"
-                  " port1: ",
-                  ccl::ze::to_string(port1),
-                  " port2: ",
-                  ccl::ze::to_string(port2));
+        // LOG_DEBUG("fabric ports are not the same:"
+        //           " port1: ",
+        //           ccl::ze::to_string(port1),
+        //           " port2: ",
+        //           ccl::ze::to_string(port2));
     }
     return result;
 }
diff --git a/src/sched/entry/ze/ze_primitives.hpp b/src/sched/entry/ze/ze_primitives.hpp
index 60d7de9f7..2094be636 100644
--- a/src/sched/entry/ze/ze_primitives.hpp
+++ b/src/sched/entry/ze/ze_primitives.hpp
@@ -33,9 +33,11 @@ enum class device_id : uint32_t { unknown = 0x0, id1 = 0x200, id2 = 0xbd0, id3 =
 
 enum class copy_engine_mode { none, main, link, auto_mode };
 enum class h2d_copy_engine_mode { none, main, auto_mode };
+enum class d2d_copy_engine_mode { none, main };
 
 extern std::map<copy_engine_mode, std::string> copy_engine_names;
 extern std::map<h2d_copy_engine_mode, std::string> h2d_copy_engine_names;
+extern std::map<d2d_copy_engine_mode, std::string> d2d_copy_engine_names;
 
 constexpr ze_context_desc_t default_context_desc = { .stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC,
                                                      .pNext = nullptr,
@@ -82,6 +84,14 @@ constexpr ze_memory_allocation_properties_t default_alloc_props = {
     .pageSize = 0
 };
 
+constexpr ze_module_desc_t default_module_desc = { .stype = ZE_STRUCTURE_TYPE_MODULE_DESC,
+                                                   .pNext = nullptr,
+                                                   .format = ZE_MODULE_FORMAT_IL_SPIRV,
+                                                   .inputSize = 0,
+                                                   .pInputModule = nullptr,
+                                                   .pBuildFlags = nullptr,
+                                                   .pConstants = nullptr };
+
 constexpr ze_device_properties_t default_device_props = { .stype =
                                                               ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES,
                                                           .pNext = nullptr,
diff --git a/src/sched/queue/flow_control.hpp b/src/sched/queue/flow_control.hpp
index 2aced09c1..641cc9849 100644
--- a/src/sched/queue/flow_control.hpp
+++ b/src/sched/queue/flow_control.hpp
@@ -14,6 +14,7 @@
  limitations under the License.
 */
 #pragma once
+#include <cstddef>
 
 namespace ccl {
 
@@ -22,6 +23,8 @@ namespace ccl {
 class flow_control {
 public:
     flow_control();
+    flow_control(const flow_control &other) = delete;
+    flow_control &operator=(const flow_control &other) = delete;
     ~flow_control();
 
     void set_max_credits(size_t value);
diff --git a/src/sched/queue/queue.hpp b/src/sched/queue/queue.hpp
index b6f44f251..2e036531e 100644
--- a/src/sched/queue/queue.hpp
+++ b/src/sched/queue/queue.hpp
@@ -64,6 +64,8 @@ class ccl_sched_list {
 
     ccl_sched_list& operator=(const ccl_sched_list& other) = delete;
 
+    ccl_sched_list(const ccl_sched_list&) = delete;
+
     ccl_sched_list(ccl_sched_list&& src) {
         {
             std::lock_guard<sched_queue_lock_t> lock(src.elem_guard);
@@ -168,6 +170,7 @@ class ccl_sched_bin {
     }
 
     ccl_sched_bin() = delete;
+    ccl_sched_bin(const ccl_sched_bin& other) = delete;
     ccl_sched_bin& operator=(const ccl_sched_bin& other) = delete;
 
     ccl_sched_bin(ccl_sched_bin&& src) = default;
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index a0498493e..82e8ce06d 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -14,6 +14,7 @@
  limitations under the License.
 */
 #include "coll/coll_check.hpp"
+#include "coll/coll_util.hpp"
 #include "coll/selection/selection.hpp"
 #include "common/global/global.hpp"
 #include "common/log/log.hpp"
@@ -59,7 +60,8 @@ ccl_sched::ccl_sched(const ccl_sched_create_param& param, bool top_level_sched)
           req(new ccl_request(*this)),
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
           use_output_event(top_level_sched &&
-                           ccl::utils::should_use_sycl_output_event(coll_param.stream)),
+                           (ccl::utils::should_use_sycl_output_event(coll_param.stream) ||
+                            ccl::is_queue_in_order(coll_param.stream))),
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
           top_level_sched(top_level_sched),
           subsched_entry_parent_sched(nullptr),
@@ -568,7 +570,7 @@ size_t ccl_sched::entries_count() const {
 }
 
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
-void ccl_sched::set_output_event(ccl_request* request) {
+void ccl_sched::create_sync_event(ccl_request* request) {
     if (!use_output_event) {
         return;
     }
@@ -594,10 +596,7 @@ void ccl_sched::set_output_event(ccl_request* request) {
     LOG_DEBUG("convert L0 event: ", ev, " into a SYCL event and submit a barrier");
 
     auto sync_event = ccl::utils::make_event(context, ev);
-    request->set_sync_event(sync_event);
-    if (coll_attr.synchronous) {
-        request->set_native_event(ccl::utils::submit_barrier(q));
-    }
+    request->set_sync_event(std::move(sync_event));
 
 #else // CCL_ENABLE_SYCL_INTEROP_EVENT
     CCL_THROW("interop event functionality is not available with current configuration, "
diff --git a/src/sched/sched.hpp b/src/sched/sched.hpp
index f8ff32050..9785d34bb 100644
--- a/src/sched/sched.hpp
+++ b/src/sched/sched.hpp
@@ -270,7 +270,7 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base {
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
 private:
-    void set_output_event(ccl_request* request);
+    void create_sync_event(ccl_request* request);
     void update_active_request(bool use_delayed);
     static void complete_itt(const ccl_stream* stream);
 
diff --git a/src/sched/sched_base.cpp b/src/sched/sched_base.cpp
index 61360cb4f..a75942fc8 100644
--- a/src/sched/sched_base.cpp
+++ b/src/sched/sched_base.cpp
@@ -179,6 +179,7 @@ void ccl_sched_base::try_enable_ze_single_list() {
     use_single_list = ccl::global_data::env().enable_ze_single_list &&
                       ccl::global_data::env().kernel_debug == 0 &&
                       !ccl::global_data::env().enable_fusion;
+    LOG_DEBUG("ze_single_list set to: ", use_single_list);
 }
 
 void ccl_sched_base::append_to_ze_entries_list(sched_entry* entry) {
@@ -266,6 +267,8 @@ void ccl_sched_base::clear_memory() {
         memory.handle_manager.clear();
         memory.ipc_event_pool_manager.clear();
 
+        ccl::global_data::get().ze_data->dev_memory_manager->clear();
+
         // Since list_manager is a shared_ptr, call clear only for the last
         //  reference (when use_count() is 1).
         // In all other cases, it is correct to simply skip calling clear since
diff --git a/src/sched/sched_base.hpp b/src/sched/sched_base.hpp
index 4201273c8..137d05179 100644
--- a/src/sched/sched_base.hpp
+++ b/src/sched/sched_base.hpp
@@ -160,9 +160,10 @@ struct ccl_sched_base {
 protected:
     ~ccl_sched_base();
 
-    ccl_sched_base() {
-        CCL_THROW("unsupported");
-    }
+    ccl_sched_base() = delete;
+
+    ccl_sched_base(const ccl_sched_base& other) = delete;
+    ccl_sched_base& operator=(const ccl_sched_base& other) = delete;
 
     ccl_sched_base(const ccl_sched_create_param& param);
 
diff --git a/src/sched/sched_restart_manager.cpp b/src/sched/sched_restart_manager.cpp
index 6b766d8da..2586e6055 100644
--- a/src/sched/sched_restart_manager.cpp
+++ b/src/sched/sched_restart_manager.cpp
@@ -111,7 +111,7 @@ ccl_request* sched_restart_manager::preprocess(bool restart) {
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
         // we need to set output event and submit barrier immediately, otherwise q.wait()
         // called by a user can return earlier than we process all the delayed requests
-        sched->set_output_event(new_req);
+        sched->create_sync_event(new_req);
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
         return new_req;
@@ -119,7 +119,7 @@ ccl_request* sched_restart_manager::preprocess(bool restart) {
 
 #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
     if (!sched->get_request()->has_output_event()) {
-        sched->set_output_event(sched->get_request());
+        sched->create_sync_event(sched->get_request());
     }
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 
diff --git a/src/sched/sched_timer.cpp b/src/sched/sched_timer.cpp
index cff2d8af6..f6d3dd11f 100644
--- a/src/sched/sched_timer.cpp
+++ b/src/sched/sched_timer.cpp
@@ -149,6 +149,8 @@ void event_end(__itt_event event) {
 
     auto event_ref_count = inflight_event_ref_counts.find(event);
 
+    CCL_THROW_IF_NOT(event_ref_count != inflight_event_ref_counts.end(), "itt event not found");
+
     event_ref_count->second--;
     if (event_ref_count->second == 0) {
         // No more references to the event are currently used
diff --git a/src/sched/ze/ze_event_manager.cpp b/src/sched/ze/ze_event_manager.cpp
index 620f95ad8..e734f593a 100644
--- a/src/sched/ze/ze_event_manager.cpp
+++ b/src/sched/ze/ze_event_manager.cpp
@@ -209,8 +209,8 @@ ze_event_handle_t dynamic_event_pool::get_event() {
     pool_info.num_alloc_events = 0;
 
     slot.pool = event_pools.insert(event_pools.end(), pool_info);
-    slot.pool_idx = event_pool_request_idx;
-    event_pool_request_idx = ++event_pool_request_idx % event_pool_size;
+    slot.pool_idx = event_pool_request_idx++;
+    event_pool_request_idx %= event_pool_size;
 
     return create_event(slot);
 }
diff --git a/src/sched/ze/ze_handle_manager.cpp b/src/sched/ze/ze_handle_manager.cpp
index df057a7bf..42c9d0891 100644
--- a/src/sched/ze/ze_handle_manager.cpp
+++ b/src/sched/ze/ze_handle_manager.cpp
@@ -64,9 +64,13 @@ ze_ipc_mem_handle_t ipc_handle_desc::mem_to_ipc_handle() const {
         LOG_DEBUG("device_fd: ", device_fd, " gotten fd from mem_handle_to_fd: ", fd);
     }
     else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) {
-        CCL_THROW_IF_NOT(pidfd_fd != ccl::utils::invalid_fd, "pidfd_fd is invalid value");
-        fd = ccl::ze::fd_manager::mem_handle_to_fd(pidfd_fd, mem_handle);
-        LOG_DEBUG("pidfd_fd: ", pidfd_fd, ", gotten fd from mem_handle_to_fd: ", fd);
+        CCL_THROW_IF_NOT(remote_pid != ccl::utils::invalid_pid, "remote_pid is invalid value");
+        int opened_fd = ccl::ze::fd_manager::pidfd_open(remote_pid);
+        fd = ccl::ze::fd_manager::mem_handle_to_fd(opened_fd, mem_handle);
+        if (!global_data::env().enable_ze_cache) {
+            ccl::utils::close_fd(opened_fd);
+        }
+        LOG_DEBUG("remote_pid: ", remote_pid, ", gotten fd from mem_handle_to_fd: ", fd);
     }
     else {
         CCL_THROW("unexpected ipc_exchange_mode");
@@ -197,7 +201,8 @@ void ipc_handle_manager::set(const mem_handle_map_t& handles_arg, bool pt2pt_op)
 void* ipc_handle_manager::get_ptr(int rank,
                                   size_t buf_idx,
                                   const ccl_comm* map_comm,
-                                  bool pt2pt_op) {
+                                  bool pt2pt_op,
+                                  bool to_cache) {
     check_rank(rank, (map_comm) ? map_comm : comm, pt2pt_op);
     if (map_comm && (map_comm->id() != comm->id())) {
         int old_rank = rank;
@@ -233,7 +238,7 @@ void* ipc_handle_manager::get_ptr(int rank,
 
     if (mem_ptr == nullptr) {
         if (mem_type == ipc_mem_type::memory) {
-            open_handle(handle_info, &mem_ptr);
+            open_handle(handle_info, &mem_ptr, to_cache);
         }
         else if (mem_type == ipc_mem_type::pool) {
             ze_ipc_event_pool_handle_t pool_handle;
@@ -266,8 +271,9 @@ void ipc_handle_manager::get(int rank,
                              size_t buf_idx,
                              ccl_buffer& buf,
                              const ccl_comm* map_comm,
-                             bool pt2pt_op) {
-    buf.set(get_ptr(rank, buf_idx, map_comm, pt2pt_op));
+                             bool pt2pt_op,
+                             bool to_cache) {
+    buf.set(get_ptr(rank, buf_idx, map_comm, pt2pt_op, to_cache));
 }
 
 void ipc_handle_manager::get(int rank,
@@ -298,8 +304,13 @@ void ipc_handle_manager::get_handle(ze_event_pool_handle_t pool,
     ZE_CALL(zeEventPoolGetIpcHandle, (pool, ipc_handle));
 }
 
-void ipc_handle_manager::open_handle(ipc_handle_desc& info, void** ptr) {
-    if (global_data::env().enable_ze_cache && global_data::env().enable_ze_cache_open_ipc_handles) {
+void ipc_handle_manager::open_handle(ipc_handle_desc& info, void** ptr, bool to_cache) {
+    // The opened IPC handle is cached only when ALL of the following hold:
+    // - ZE caching is enabled (enable_ze_cache)
+    // - caching of opened IPC handles is enabled (enable_ze_cache_open_ipc_handles)
+    // - the caller requested caching for this buffer (to_cache)
+    if (global_data::env().enable_ze_cache && global_data::env().enable_ze_cache_open_ipc_handles &&
+        to_cache) {
         mem_handle_cache::value_t value{};
         global_data::get().ze_data->cache->get(context, device, info, &value);
         CCL_THROW_IF_NOT(value != nullptr, "unable to open ipc_handle");
@@ -308,8 +319,15 @@ void ipc_handle_manager::open_handle(ipc_handle_desc& info, void** ptr) {
         info.is_cached = true;
     }
     else {
-        ZE_CALL(zeMemOpenIpcHandle,
-                (context, device, info.mem_to_ipc_handle(), 0 /* cache allocation */, ptr));
+        auto handle = info.mem_to_ipc_handle();
+        ZE_CALL(zeMemOpenIpcHandle, (context, device, handle, 0 /* cache allocation */, ptr));
+        if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) {
+            close_handle_fd(handle);
+        }
+        if (!to_cache) {
+            // used by SYCL kernels; the IPC handle must stay open, so mark it
+            info.is_cached = true;
+        }
     }
 }
 
diff --git a/src/sched/ze/ze_handle_manager.hpp b/src/sched/ze/ze_handle_manager.hpp
index 2136b965e..cdba207e0 100644
--- a/src/sched/ze/ze_handle_manager.hpp
+++ b/src/sched/ze/ze_handle_manager.hpp
@@ -50,7 +50,6 @@ struct ipc_handle_desc {
     ssize_t remote_context_id{ ccl::utils::invalid_context_id };
     uint64_t remote_mem_alloc_id{};
     ssize_t remote_device_id{ ccl::utils::invalid_device_id };
-    int pidfd_fd{ ccl::utils::invalid_fd };
     int device_fd{ ccl::utils::invalid_fd };
 
     bool is_cached = false;
@@ -86,12 +85,17 @@ class ipc_handle_manager {
 
     void set(const mem_handle_map_t& handles_arg, bool pt2pt_op = false);
 
-    void* get_ptr(int rank, size_t buf_idx, const ccl_comm* map_comm, bool pt2pt_op = false);
+    void* get_ptr(int rank,
+                  size_t buf_idx,
+                  const ccl_comm* map_comm,
+                  bool pt2pt_op = false,
+                  bool to_cache = true);
     void get(int rank,
              size_t buf_idx,
              ccl_buffer& buf,
              const ccl_comm* map_comm = nullptr,
-             bool pt2pt_op = false);
+             bool pt2pt_op = false,
+             bool to_cache = true /*can control unique buf to cache*/);
     void get(int rank,
              size_t buf_idx,
              ze_event_pool_handle_t& buf,
@@ -100,7 +104,7 @@ class ipc_handle_manager {
 
     void get_handle(void* ptr, ze_ipc_mem_handle_t* ipc_handle);
     void get_handle(ze_event_pool_handle_t pool, ze_ipc_event_pool_handle_t* ipc_handle);
-    void open_handle(ipc_handle_desc& info, void** ptr);
+    void open_handle(ipc_handle_desc& info, void** ptr, bool to_cache);
     void open_handle(const ze_ipc_event_pool_handle_t& ipc_handle, ze_event_pool_handle_t* pool);
 
     void get_address_range(const void* ptr, void** base_ptr, size_t* size);
diff --git a/src/sched/ze/ze_list_manager.cpp b/src/sched/ze/ze_list_manager.cpp
index a2e2f8bc3..b35bcf092 100644
--- a/src/sched/ze/ze_list_manager.cpp
+++ b/src/sched/ze/ze_list_manager.cpp
@@ -179,6 +179,16 @@ uint32_t queue_factory::get_ordinal() const {
     return queue_ordinal;
 }
 
+bool queue_factory::queue_group_usable(ze_device_handle_t device, queue_group_type type) {
+    ze_queue_properties_t queue_props;
+    get_queues_properties(device, &queue_props);
+    uint32_t ordinal = get_queue_group_ordinal(queue_props, type);
+    if (ordinal >= queue_props.size()) {
+        return false;
+    }
+    return true;
+}
+
 bool queue_factory::can_use_queue_group(ze_device_handle_t device,
                                         queue_group_type type,
                                         copy_engine_mode mode) {
@@ -204,14 +214,7 @@ bool queue_factory::can_use_queue_group(ze_device_handle_t device,
         return false;
     }
 
-    ze_queue_properties_t queue_props;
-    get_queues_properties(device, &queue_props);
-    uint32_t ordinal = get_queue_group_ordinal(queue_props, type);
-    if (ordinal >= queue_props.size()) {
-        return false;
-    }
-
-    return true;
+    return queue_group_usable(device, type);
 }
 
 list_factory::list_factory(ze_device_handle_t device, ze_context_handle_t context, bool is_copy)
@@ -270,12 +273,14 @@ list_manager::list_manager(const ccl_sched_base* sched, const ccl_stream* stream
 
     auto copy_engine_mode = sched->coll_param.comm->get_env()->get_ze_copy_engine();
 
+    main_queue_usable = queue_factory::queue_group_usable(device, queue_group_type::main);
+
     main_queue_available =
         queue_factory::can_use_queue_group(device, queue_group_type::main, copy_engine_mode);
 
     main_queue_available = main_queue_available || (h2d_copy_mode == h2d_copy_engine_mode::main);
 
-    if (main_queue_available) {
+    if (main_queue_available || main_queue_usable) {
         main_queue_factory =
             std::make_unique<queue_factory>(device, context, queue_group_type::main);
     }
@@ -287,7 +292,7 @@ list_manager::list_manager(const ccl_sched_base* sched, const ccl_stream* stream
             std::make_unique<queue_factory>(device, context, queue_group_type::link);
     }
 
-    use_copy_queue = main_queue_available || link_queue_available;
+    bool use_copy_queue = main_queue_available || link_queue_available || main_queue_usable;
     if (use_copy_queue) {
         copy_list_factory = std::make_unique<list_factory>(device, context, true);
     }
@@ -299,7 +304,8 @@ list_manager::~list_manager() {
 
 std::pair<queue_factory*, list_manager::queue_map_t*> list_manager::get_factory_and_map(
     bool is_copy,
-    copy_direction direction) const {
+    copy_direction direction,
+    queue_group_type force_queue_type) const {
     CCL_THROW_IF_NOT((!is_copy && direction == copy_direction::undefined) ||
                          (is_copy && direction != copy_direction::undefined),
                      "wrong direction");
@@ -307,26 +313,55 @@ std::pair<queue_factory*, list_manager::queue_map_t*> list_manager::get_factory_
     queue_factory* factory = nullptr;
     queue_map_t* queue_map = nullptr;
 
-    if (direction == copy_direction::c2c) {
-        if (link_queue_available) {
-            factory = link_queue_factory.get();
-            queue_map = const_cast<queue_map_t*>(&link_queue_map);
+    if ((force_queue_type == queue_group_type::unknown) ||
+        (force_queue_type == queue_group_type::main && !main_queue_usable)) {
+        if (direction == copy_direction::c2c) {
+            if (link_queue_available) {
+                factory = link_queue_factory.get();
+                queue_map = const_cast<queue_map_t*>(&link_queue_map);
+            }
+            else if (main_queue_available) {
+                factory = main_queue_factory.get();
+                queue_map = const_cast<queue_map_t*>(&main_queue_map);
+            }
         }
-        else if (main_queue_available) {
-            factory = main_queue_factory.get();
-            queue_map = const_cast<queue_map_t*>(&main_queue_map);
+        // h2d, d2h, d2d, t2t
+        else if (direction != copy_direction::undefined) {
+            if (direction == copy_direction::t2t) {
+                // TODO: t2t copy direction may produce wrong results for float16/bfloat16
+                // types and main/link copy engines - see MLSL-2739.
+                // Always fallback to compute kernels path.
+            }
+            else if (direction == copy_direction::d2d) {
+                auto d2d_copy_mode = global_data::env().ze_d2d_copy_engine;
+                if (main_queue_usable && d2d_copy_mode == d2d_copy_engine_mode::main) {
+                    factory = main_queue_factory.get();
+                    queue_map = const_cast<queue_map_t*>(&main_queue_map);
+                }
+            }
+            else {
+                const bool use_compute_fallback =
+                    ccl::global_data::env().ze_enable_ccs_fallback_for_copy &&
+                    !main_queue_available;
+
+                if (main_queue_available) {
+                    factory = main_queue_factory.get();
+                    queue_map = const_cast<queue_map_t*>(&main_queue_map);
+                }
+                else if (link_queue_available && !use_compute_fallback) {
+                    factory = link_queue_factory.get();
+                    queue_map = const_cast<queue_map_t*>(&link_queue_map);
+                }
+            }
         }
     }
-    // h2d, d2h, d2d, t2t
-    else if (direction != copy_direction::undefined) {
-        const bool use_compute_fallback =
-            ccl::global_data::env().ze_enable_ccs_fallback_for_copy && !main_queue_available;
-
-        if (main_queue_available) {
+    else {
+        // Force the queue group type
+        if (force_queue_type == queue_group_type::main) {
             factory = main_queue_factory.get();
             queue_map = const_cast<queue_map_t*>(&main_queue_map);
         }
-        else if (link_queue_available && !use_compute_fallback) {
+        else if (force_queue_type == queue_group_type::link) {
             factory = link_queue_factory.get();
             queue_map = const_cast<queue_map_t*>(&link_queue_map);
         }
@@ -346,9 +381,10 @@ list_info_t list_manager::get_list(const sched_entry* entry,
                                    uint32_t index,
                                    bool is_copy,
                                    const std::vector<ze_event_handle_t>& wait_events,
-                                   copy_direction direction) {
+                                   copy_direction direction,
+                                   queue_group_type force_type) {
     // get comp or copy primitives
-    auto factory_map_pair = get_factory_and_map(is_copy, direction);
+    auto factory_map_pair = get_factory_and_map(is_copy, direction, force_type);
     queue_factory* factory = factory_map_pair.first;
     queue_map_t* queue_map = factory_map_pair.second;
     auto queue = factory->get(index);
@@ -437,9 +473,11 @@ ze_command_list_handle_t list_manager::get_copy_list(
     const sched_entry* entry,
     const std::vector<ze_event_handle_t>& wait_events,
     copy_direction direction,
-    uint32_t index) {
-    if (link_queue_available || main_queue_available) {
-        auto list = get_list(entry, index, true, wait_events, direction);
+    uint32_t index,
+    queue_group_type force_queue_type) {
+    if (link_queue_available || main_queue_available ||
+        (force_queue_type == queue_group_type::main && main_queue_usable)) {
+        auto list = get_list(entry, index, true, wait_events, direction, force_queue_type);
         return list->get_native();
     }
     return get_comp_list(entry, wait_events, index);
diff --git a/src/sched/ze/ze_list_manager.hpp b/src/sched/ze/ze_list_manager.hpp
index a7e8baa6b..977d16466 100644
--- a/src/sched/ze/ze_list_manager.hpp
+++ b/src/sched/ze/ze_list_manager.hpp
@@ -80,6 +80,7 @@ class queue_factory {
 public:
     queue_factory(ze_device_handle_t device, ze_context_handle_t context, queue_group_type type);
     queue_factory& operator=(const queue_factory&) = delete;
+    queue_factory(const queue_factory&) = delete;
     queue_factory& operator=(queue_factory&&) = delete;
     ~queue_factory();
     queue_info_t get(uint32_t index);
@@ -87,6 +88,7 @@ class queue_factory {
 
     uint32_t get_ordinal() const;
 
+    static bool queue_group_usable(ze_device_handle_t device, queue_group_type type);
     static bool can_use_queue_group(ze_device_handle_t device,
                                     queue_group_type type,
                                     copy_engine_mode mode);
@@ -132,6 +134,7 @@ class list_manager {
     list_manager() = delete;
     explicit list_manager(const ccl_sched_base* sched, const ccl_stream* stream);
     list_manager(const list_manager&) = delete;
+    list_manager& operator=(const list_manager&) = delete;
     explicit list_manager(list_manager&&) = default;
     ~list_manager();
 
@@ -140,10 +143,12 @@ class list_manager {
     ze_command_list_handle_t get_comp_list(const sched_entry* entry = nullptr,
                                            const std::vector<ze_event_handle_t>& wait_events = {},
                                            uint32_t index = 0);
-    ze_command_list_handle_t get_copy_list(const sched_entry* entry = nullptr,
-                                           const std::vector<ze_event_handle_t>& wait_events = {},
-                                           copy_direction direction = copy_direction::d2d,
-                                           uint32_t index = 0);
+    ze_command_list_handle_t get_copy_list(
+        const sched_entry* entry = nullptr,
+        const std::vector<ze_event_handle_t>& wait_events = {},
+        copy_direction direction = copy_direction::d2d,
+        uint32_t index = 0,
+        queue_group_type force_queue_type = queue_group_type::unknown);
 
     void clear();
     void reset_execution_state();
@@ -184,17 +189,20 @@ class list_manager {
 
     std::list<std::pair<queue_info_t, list_info_t>> access_list;
     bool executed = false;
-    bool use_copy_queue = false;
     bool main_queue_available = false;
     bool link_queue_available = false;
+    bool main_queue_usable = false;
 
-    std::pair<queue_factory*, queue_map_t*> get_factory_and_map(bool is_copy,
-                                                                copy_direction direction) const;
+    std::pair<queue_factory*, queue_map_t*> get_factory_and_map(
+        bool is_copy,
+        copy_direction direction,
+        queue_group_type force_type = queue_group_type::unknown) const;
     list_info_t get_list(const sched_entry* entry,
                          uint32_t index,
                          bool is_copy,
                          const std::vector<ze_event_handle_t>& wait_events,
-                         copy_direction direction);
+                         copy_direction direction,
+                         queue_group_type force_type = queue_group_type::unknown);
 
     void execute_list(queue_info_t& queue, list_info_t& list);
 
diff --git a/src/topology/topo_manager.cpp b/src/topology/topo_manager.cpp
index 0f2bdb0b2..b83cf6a1f 100644
--- a/src/topology/topo_manager.cpp
+++ b/src/topology/topo_manager.cpp
@@ -241,6 +241,10 @@ bool topo_manager::has_p2p_access() const {
     return is_p2p_access_enabled;
 }
 
+bool topo_manager::has_all_vertices_connected() const {
+    return are_all_vertices_connected;
+}
+
 std::vector<ze_device_uuid_t> topo_manager::copy_dev_uuids(const rank_info_vec_t& info_vec) const {
     CCL_THROW_IF_NOT(!ze_rank_info_vec.empty());
     CCL_THROW_IF_NOT(!info_vec.empty());
@@ -344,6 +348,117 @@ p2p_matrix_t topo_manager::build_p2p_matrix(const std::vector<ze_device_handle_t
     return matrix;
 }
 
+bool topo_manager::build_fabric_connectivity_matrix(std::shared_ptr<atl_base_comm> comm) {
+    if (!ccl::global_data::env().enable_fabric_vertex_connection_check) {
+        return true;
+    }
+
+    auto driver = global_data::get().ze_data->drivers[0];
+    bool is_include_devices = true;
+    bool is_include_sub_devices_enabled = false;
+    bool is_matrix_connected = true;
+
+    uint32_t total_vertex_count = 0;
+    std::vector<std::pair<ze_fabric_vertex_handle_t, char>> all_vertices;
+    // get the total number of fabric vertices
+    ZE_CALL(zeFabricVertexGetExp, (driver, &total_vertex_count, nullptr));
+    std::vector<ze_fabric_vertex_handle_t> vertices(total_vertex_count);
+    // retrieve all fabric vertices
+    ZE_CALL(zeFabricVertexGetExp, (driver, &total_vertex_count, vertices.data()));
+
+    // iterate through each vertex
+    for (auto& vertex : vertices) {
+        if (is_include_devices) {
+            all_vertices.push_back(std::make_pair(vertex, 'R'));
+        }
+        // check whether sub-devices should be included; false by default since our
+        // model is FLAT, i.e. none of the devices expose sub-devices
+        if (is_include_sub_devices_enabled) {
+            uint32_t subdev_vertex_count = 0;
+            // get the number of sub-vertices
+            ZE_CALL(zeFabricVertexGetSubVerticesExp, (vertex, &subdev_vertex_count, nullptr));
+            // retrieve sub-vertices
+            std::vector<ze_fabric_vertex_handle_t> subVertices(subdev_vertex_count);
+            ZE_CALL(zeFabricVertexGetSubVerticesExp,
+                    (vertex, &subdev_vertex_count, subVertices.data()));
+            // add sub-vertices to the list
+            for (auto& subVertex : subVertices) {
+                all_vertices.push_back(std::make_pair(subVertex, 'S'));
+            }
+        }
+    }
+
+    // check if there are no fabric vertices
+    if (all_vertices.size() == 0) {
+        is_matrix_connected = false;
+        LOG_INFO("No fabric vertices: ",
+                 all_vertices.size(),
+                 ", is_matrix_connected: ",
+                 is_matrix_connected);
+        return is_matrix_connected;
+    }
+
+    // create the fabric connectivity matrix
+    fabric_conn_matrix_t matrix(all_vertices.size(), std::vector<bool>(all_vertices.size(), false));
+
+    // populate the fabric connectivity matrix based on edges between vertices
+    for (uint32_t vertex_a_idx = 0; vertex_a_idx < all_vertices.size(); vertex_a_idx++) {
+        for (uint32_t vertex_b_idx = 0; vertex_b_idx < all_vertices.size(); vertex_b_idx++) {
+            // diagonal elements are set to true (self-connections)
+            if (vertex_a_idx == vertex_b_idx) {
+                matrix[vertex_a_idx][vertex_b_idx] = true;
+                continue;
+            }
+            // Get the number of edges between two vertices
+            uint32_t edge_count = 0;
+            ZE_CALL(zeFabricEdgeGetExp,
+                    (all_vertices[vertex_a_idx].first,
+                     all_vertices[vertex_b_idx].first,
+                     &edge_count,
+                     nullptr));
+            // set matrix element based on whether there are edges between vertices
+            matrix[vertex_a_idx][vertex_b_idx] = (edge_count > 0);
+            if (!matrix[vertex_a_idx][vertex_b_idx]) {
+                is_matrix_connected = false;
+                LOG_WARN(
+                    "topology recognition shows PCIe connection between devices."
+                    " If this is not correct, you can disable topology recognition,"
+                    " with CCL_TOPO_FABRIC_VERTEX_CONNECTION_CHECK=0. This will assume XeLinks across devices");
+            }
+        }
+    }
+
+    // print the final matrix
+    if (comm->get_rank() == 0) {
+        const uint32_t elementWidth = 15;
+        std::stringstream ss;
+        for (uint32_t i = 0; i < matrix.size(); i++) {
+            ss << std::setw(elementWidth) << "[" << all_vertices[i].second << "]"
+               << all_vertices[i].first;
+        }
+        ss << "\n";
+
+        for (uint32_t vertex_a_idx = 0; vertex_a_idx < matrix.size(); vertex_a_idx++) {
+            ss << "[" << all_vertices[vertex_a_idx].second << "]"
+               << all_vertices[vertex_a_idx].first;
+            for (uint32_t vertex_b_idx = 0; vertex_b_idx < matrix.size(); vertex_b_idx++) {
+                if (vertex_a_idx == vertex_b_idx) {
+                    ss << std::setw(elementWidth) << "X";
+                    continue;
+                }
+                ss << std::setw(elementWidth) << matrix[vertex_a_idx][vertex_b_idx];
+            }
+            ss << "\n";
+        }
+        LOG_INFO("all vertices connected: is_matrix_connected: ",
+                 is_matrix_connected,
+                 "\n fabric connectivity matrix: \n",
+                 ss.str());
+    }
+
+    return is_matrix_connected;
+}
+
 bool topo_manager::is_sub_vector(const std::vector<ze_device_uuid_t>& vec,
                                  const std::vector<ze_device_uuid_t>& sub_vec) {
     CCL_THROW_IF_NOT(!vec.empty());
@@ -831,12 +946,19 @@ fabric_ports_t topo_manager::get_fabric_ports() {
     std::vector<zes_fabric_port_handle_t> ports(port_count);
     ZE_CALL(zesDeviceEnumFabricPorts, ((zes_device_handle_t)ze_device, &port_count, ports.data()));
 
-    bool use_all_ports = (ccl::ze::get_device_family(ze_device) == ccl::device_family::family2 ||
-                          ccl::ze::get_device_family(ze_device) == ccl::device_family::family3);
+    bool use_all_ports = true;
+    if (ccl::ze::get_device_family(ze_device) == ccl::device_family::unknown) {
+        LOG_WARN("device_family is unknown, topology discovery could be incorrect,"
+                 " it might result in suboptimal performance");
+    }
+
     char* use_all_ports_env = getenv("CCL_TOPO_ALL_PORTS");
     if (use_all_ports_env) {
         use_all_ports = atoi(use_all_ports_env);
     }
+    if (ccl::global_data::env().atl_transport == ccl_atl_ofi) {
+        use_all_ports = true;
+    }
     LOG_DEBUG("use all fabric ports: ", use_all_ports);
 
     std::vector<topo_ze_port_info> my_ports;
@@ -893,7 +1015,7 @@ fabric_ports_t topo_manager::get_fabric_ports() {
         std::accumulate(all_port_counts.begin(), all_port_counts.end(), size_t(0));
 
     if (total_port_count == 0) {
-        LOG_DEBUG("no ports detected");
+        LOG_INFO("no ports detected");
         return {};
     }
     else {
@@ -1401,6 +1523,8 @@ void topo_manager::ze_base_init(std::shared_ptr<ccl::device> device,
     // build p2p connectivity info
     const auto& node_devices = global_data::get().ze_data->devices;
     p2p_matrix = build_p2p_matrix(get_filtered_devices(node_devices));
+    are_all_vertices_connected = build_fabric_connectivity_matrix(comm);
+
     is_p2p_access_enabled = check_p2p_access();
     LOG_DEBUG("p2p matrix: \n",
               ccl::to_string(p2p_matrix),
diff --git a/src/topology/topo_manager.hpp b/src/topology/topo_manager.hpp
index 9344f9981..0bd3aa8fd 100644
--- a/src/topology/topo_manager.hpp
+++ b/src/topology/topo_manager.hpp
@@ -100,6 +100,7 @@ std::string to_string(const ze_rank_info_vec_t& ze_rank_info_vec,
                       const host_info_vec_t& host_info_vec);
 
 using p2p_matrix_t = typename std::vector<std::vector<bool>>;
+using fabric_conn_matrix_t = typename std::vector<std::vector<bool>>;
 using fabric_ports_t = typename std::vector<std::vector<topo_ze_port_info>>;
 using plane_t = typename std::set<int>;
 
@@ -154,10 +155,13 @@ class topo_manager {
     enum class port_health_status { unknown, ok, fail };
     bool has_failed_ports() const;
     bool has_p2p_access() const;
+    bool has_all_vertices_connected() const;
     std::vector<ze_device_uuid_t> copy_dev_uuids(const rank_info_vec_t& info_vec) const;
     std::vector<ze_device_handle_t> get_filtered_devices(
         const std::vector<ze::device_info>& node_devices) const;
     static p2p_matrix_t build_p2p_matrix(const std::vector<ze_device_handle_t>& devices);
+    static bool build_fabric_connectivity_matrix(std::shared_ptr<atl_base_comm> comm);
+
     static bool is_sub_vector(const std::vector<ze_device_uuid_t>& vec,
                               const std::vector<ze_device_uuid_t>& sub_vec);
     static void detect_tune_port_count(const std::vector<ze::device_info>& devices);
@@ -242,6 +246,7 @@ class topo_manager {
     ze_rank_info_vec_t ze_rank_info_vec;
 
     bool is_p2p_access_enabled = false;
+    bool are_all_vertices_connected = false;
     port_health_status port_status = port_health_status::unknown;
     size_t unique_device_uuids_count = topo_manager::invalid_device_uuids_count;
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
diff --git a/tests/functional/lp_impl.hpp b/tests/functional/lp_impl.hpp
index fbfb2c307..adc488b7a 100644
--- a/tests/functional/lp_impl.hpp
+++ b/tests/functional/lp_impl.hpp
@@ -18,11 +18,11 @@
 template <typename T>
 void convert_fp32_to_lp_arrays(T* buf, short* lp_buf, size_t count, ccl_data_type dtype) {
     size_t floats_in_reg = (dtype == DATATYPE_BFLOAT16) ? FLOATS_IN_M512 : FLOATS_IN_M256;
-    short tail[floats_in_reg];
+    std::vector<short> tail(floats_in_reg);
 
     for (size_t i = 0; i < count; i += floats_in_reg) {
         if (i / floats_in_reg == count / floats_in_reg) {
-            convert_fp32_to_lp(buf + i, tail, dtype);
+            convert_fp32_to_lp(buf + i, tail.data(), dtype);
             for (size_t j = 0; j < (count - i); j++) {
                 lp_buf[i + j] = tail[j];
             }
@@ -36,11 +36,11 @@ void convert_fp32_to_lp_arrays(T* buf, short* lp_buf, size_t count, ccl_data_typ
 template <typename T>
 void convert_lp_to_fp32_arrays(short* lp_buf, T* buf, size_t count, ccl_data_type dtype) {
     size_t floats_in_reg = (dtype == DATATYPE_BFLOAT16) ? FLOATS_IN_M512 : FLOATS_IN_M256;
-    T tail[floats_in_reg];
+    std::vector<T> tail(floats_in_reg);
 
     for (size_t i = 0; i < count; i += floats_in_reg) {
         if (i / floats_in_reg == count / floats_in_reg) {
-            convert_lp_to_fp32(lp_buf + i, tail, dtype);
+            convert_lp_to_fp32(lp_buf + i, tail.data(), dtype);
             for (size_t j = 0; j < (count - i); j++) {
                 buf[i + j] = tail[j];
             }
diff --git a/tests/functional/transport.hpp b/tests/functional/transport.hpp
index 73303b44e..1234e8d10 100644
--- a/tests/functional/transport.hpp
+++ b/tests/functional/transport.hpp
@@ -46,6 +46,8 @@ class transport_data {
 
 private:
     transport_data();
+    transport_data(const transport_data& other) = delete;
+    transport_data& operator=(const transport_data& other) = delete;
     ~transport_data();
 
     void init_by_mpi();
diff --git a/third-party-programs.txt b/third-party-programs.txt
index b46edad08..15699512b 100644
--- a/third-party-programs.txt
+++ b/third-party-programs.txt
@@ -1,9 +1,9 @@
-Intel(R) oneAPI Collective Communications Library (oneCCL)
-2021.11.2 Third Party Programs File
+Intel(R) oneAPI Collective Communications Library (oneCCL) 
+2021.12.0 Third Party Programs File
 
-This file is the "third-party-programs.txt" file specified in the associated
+This file is the "third-party-programs.txt" file specified in the associated 
 Intel end user license agreement for the Intel software you are licensing.
-Third party programs and their corresponding required notices and/or license
+Third party programs and their corresponding required notices and/or license 
 terms are listed below.
 
 -------------------------------------------------------------------------------
@@ -22,8 +22,8 @@ the following conditions are met:
 * Redistributions must reproduce the above copyright notice and these terms
   of use in the Software and in the documentation and/or other materials
   provided with the distribution.
-* Neither the name of Intel nor the names of its suppliers may be used to
-  endorse or promote products derived from this Software without specific
+* Neither the name of Intel nor the names of its suppliers may be used to 
+  endorse or promote products derived from this Software without specific  
   prior written permission.
 * No reverse engineering, decompilation, or disassembly of the Software is
   permitted, nor any modification or alteration of the Software or its
@@ -128,7 +128,7 @@ SOFTWARE.
 
 3. Libfabric and OpenFabrics Interfaces (OFI)
 
-   Copyright (c) 2015-2019 Intel Corporation.  All rights reserved.
+   Copyright (c) 2015-2019 Intel Corporation.  All rights reserved. 
    Copyright (c) 2015-2019 Cisco Systems, Inc.  All rights reserved.
 
 
@@ -159,15 +159,15 @@ SOFTWARE.
 
 -------------------------------------------------------------------------------
 
-4. OpenSHMEM
+4. OpenSHMEM 
 
  Copyright 2011 Sandia Corporation. Under the terms of Contract
  DE-AC04-94AL85000 with Sandia Corporation, the U.S.  Government
  retains certain rights in this software.
-
+ 
  Copyright (c) 2017 Intel Corporation. All rights reserved.
  This software is available to you under the BSD license.
-
+ 
  COPYRIGHT
 
   Redistribution and use in source and binary forms, with or without
@@ -394,7 +394,7 @@ privately owned rights.
 
 ======================================================================
 
-The backtracing options used at src/backtrace.h are followed from GASNet and
+The backtracing options used at src/backtrace.h are followed from GASNet and 
 has the following license:
 
 "Copyright (c) 2000-2018 The Regents of the University of California.
@@ -513,12 +513,12 @@ Redistribution and use in source and binary forms, with or without modification,
 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
-IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
-AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
-OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 
+AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 -------------------------------------------------------------------------------
@@ -649,9 +649,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 -------------------------------------------------------------------------------
 
-  The following third party programs have their own third party programs. These
+  The following third party programs have their own third party programs. These 
   additional third party program files are as follows:
-  1. Intel(R) Intel(R) MPI Library
+  1. Intel(R) Intel(R) MPI Library 
   <install_dir>/mpi/latest/share/doc/mpi/third-party-programs.txt
 
 -------------------------------------------------------------------------------