diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..5f2670a45
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,145 @@
+---
+Language: Cpp
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: true
+AlignEscapedNewlines: DontAlign
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Empty
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: false
+  AfterControlStatement: false
+  AfterEnum: false
+  AfterFunction: false
+  AfterNamespace: false
+  AfterObjCDeclaration: false
+  AfterStruct: false
+  AfterUnion: false
+  AfterExternBlock: false
+  BeforeCatch: true
+  BeforeElse: true
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakInheritanceList: BeforeColon
+BreakStringLiterals: false
+ColumnLimit: 100
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: true
+DisableFormat: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language: Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+  - Language: TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+ReflowComments: false
+SortIncludes: false
+SortUsingDeclarations: false
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth: 1
+UseTab: Never
+...
diff --git a/CMakeLists.txt b/CMakeLists.txt
index da4c9517e..e4dce0040 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,9 +34,6 @@ check_compiler_version()
 if (NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "Release")
 endif()
-if (NOT CMAKE_DISABLE_SYCL)
-    set(CMAKE_DISABLE_SYCL 0)
-endif()
 
 #make build variable case insensitive
 string( TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_CASE_INSENSITIVE)
@@ -47,6 +44,7 @@ if (${CMAKE_BUILD_TYPE_CASE_INSENSITIVE} STREQUAL "debug")
     set(USE_SECURITY_FLAGS FALSE)
 endif()
 
+option(BUILD_UT "Build unit tests" TRUE)
 option(USE_CODECOV_FLAGS "Calculate code coverage" FALSE)
 option(WITH_ASAN "Use address sanitizer, can only be used in Debug build" FALSE)
 
@@ -58,6 +56,7 @@ if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
 endif()
 
 #show build info
+message(STATUS "Build unit tests: ${BUILD_UT}")
 message(STATUS "Installation directory: ${CMAKE_INSTALL_PREFIX}")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE_CASE_INSENSITIVE}")
 message(STATUS "C compiler : ${CMAKE_C_COMPILER}")
@@ -93,8 +92,6 @@ include_directories(${LIBFABRIC_INCLUDE_DIR})
 link_directories(${MPI_LIB_DIR})
 link_directories(${LIBFABRIC_LIB_DIR})
 
-
-
 set(CCL_INSTALL_UNIT_TESTS "${CMAKE_INSTALL_PREFIX}/tests/unit")
 
 set(CMAKE_SKIP_INSTALL_RPATH TRUE)
@@ -144,14 +141,40 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMP
 set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
+set(TRY_ENABLE_SYCL_L0 OFF)
+
 if (COMPUTE_RUNTIME)
     activate_compute_runtime("${CMAKE_CURRENT_LIST_DIR}/cmake" ${COMPUTE_RUNTIME})
-	set(PARENT_COMPUTE_RUNTIME_TARGET_NAME ${COMPUTE_RUNTIME_TARGET_NAME})
+    if (NOT COMPUTE_RUNTIME_TARGET_NAME)
+        message(FATAL_ERROR "Failed to find requested compute runtime: ${COMPUTE_RUNTIME}")
+    endif()
+    message(STATUS "COMPUTE_RUNTIME_TARGET_NAME: ${COMPUTE_RUNTIME_TARGET_NAME}")
     if (${CCL_ENABLE_SYCL_V} STREQUAL 1)
         option (CCL_ENABLE_SYCL "Enable CCL SYCL runtime" ON)
         message(STATUS "Enable CCL SYCL runtime")
+        if (${COMPUTE_RUNTIME_TARGET_NAME} STREQUAL "Intel::SYCL") # DPC++: contract requires clang >= 9 and SYCL_LANGUAGE_VERSION
+            set (CCL_ENABLE_SYCL_CHECK_CONTRACT "#if defined(__cplusplus)\n#if !defined(__clang__) || __clang_major__ < 9 || !defined(SYCL_LANGUAGE_VERSION)\n#error This version of CCL configured only for oneAPI DPC++ Compiler\n#endif\n#endif")
+            execute_process(COMMAND dpcpp -v # same variable for stdout and stderr, so both streams are captured together
+                OUTPUT_VARIABLE DPCPP_VERSION
+                ERROR_VARIABLE DPCPP_VERSION
+                OUTPUT_STRIP_TRAILING_WHITESPACE
+                ERROR_STRIP_TRAILING_WHITESPACE
+            )
+            message(STATUS "DPC++ compiler version:\n" "${DPCPP_VERSION}")
+        elseif(${COMPUTE_RUNTIME_TARGET_NAME} STREQUAL "Codeplay::ComputeCpp") # elseif, not else(): CMake ignores arguments given to else()
+            set (CCL_ENABLE_SYCL_CHECK_CONTRACT "#if defined(__cplusplus)\n#if !defined(__clang__) || __clang_major__ < 6\n#error This version of CCL configured only for oneAPI DPC++ Compiler\n#endif\n#endif")
+        endif()
     endif()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPUTE_RUNTIME_FLAGS}")
+    if ((${COMPUTE_RUNTIME_TARGET_NAME} STREQUAL "Intel::SYCL") AND
+        ${CCL_ENABLE_SYCL_L0} STREQUAL 1)
+        set(MULTI_GPU_SUPPORT ON)
+    elseif(${COMPUTE_RUNTIME_TARGET_NAME} STREQUAL "ze_loader")
+        set(MULTI_GPU_SUPPORT ON)
+    endif()
+    if (MULTI_GPU_SUPPORT)
+        message(STATUS "Enable multi GPU support using L0")
+    endif()
 endif()
 
 if(${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
@@ -210,6 +233,9 @@ if (CCL_BF16_COMPILER)
     endif()
 endif()
 
+add_definitions(-DCCL_GPU_BF16_TRUNCATE)
+set(CCL_GPU_BF16_TRUNCATE ON)
+
 set(CCL_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/src)
 
 enable_testing()
@@ -222,32 +248,6 @@ set(CMAKE_CLANG_FLAGS "${CMAKE_CLANG_FLAGS}")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
 
-if (COMPUTE_RUNTIME)
-    if ((${COMPUTE_RUNTIME_TARGET_NAME} STREQUAL "ze_loader")
-             OR (${COMPUTE_RUNTIME_TARGET_NAME} STREQUAL "Intel::SYCL"))
-        set(MULTI_GPU_SUPPORT ON)
-		activate_compute_runtime("${CMAKE_CURRENT_LIST_DIR}/cmake" L0)
-		message ("Enable multi GPU support: ${MULTI_GPU_SUPPORT}")
-		message ("COMPUTE_RUNTIME_TARGET_NAME: ${COMPUTE_RUNTIME_TARGET_NAME}")
-    endif()
-endif(COMPUTE_RUNTIME)
-
-if (MULTI_GPU_SUPPORT)
-    option(CCL_GPU_DEVICES_AFFINITY_ENABLE "Enable L0" ON)
-    if(CCL_GPU_DEVICES_AFFINITY_ENABLE)
-        set(CCL_GPU_DEVICES_AFFINITY_MASK_SIZE 4)
-        message ("Set L0 device mask affinity size: ${CCL_GPU_DEVICES_AFFINITY_MASK_SIZE}")
-    endif()
-endif(MULTI_GPU_SUPPORT)
-
-if (CCL_ENABLE_SYCL)
-    if (${PARENT_COMPUTE_RUNTIME_TARGET_NAME} STREQUAL "Intel::SYCL")
-        set (CCL_ENABLE_SYCL_CHECK_CONTRACT "#if defined(__cplusplus)\n#if !defined(__clang__) || __clang_major__ < 9 || !defined(CL_SYCL_LANGUAGE_VERSION)\n#error This version of CCL configured only for oneAPI DPC++ Compiler\n#endif\n#endif")
-    else(${PARENT_COMPUTE_RUNTIME_TARGET_NAME} STREQUAL "Codeplay::ComputeCpp")
-        set (CCL_ENABLE_SYCL_CHECK_CONTRACT "#if defined(__cplusplus)\n#if !defined(__clang__) || __clang_major__ < 6\n#error This version of CCL configured only for oneAPI DPC++ Compiler\n#endif\n#endif")
-    endif()
-endif()
-
 #generate & install vars.sh
 configure_file(cmake/vars.sh.in ${CMAKE_CURRENT_BINARY_DIR}/vars.sh @ONLY)
 configure_file(cmake/setvars.sh.in ${CMAKE_CURRENT_BINARY_DIR}/setvars.sh @ONLY)
@@ -259,15 +259,15 @@ install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/ccl DESTINATION ${CCL_INSTALL_MODUL
 install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/third-party-programs.txt DESTINATION ${CCL_INSTALL_LICENSE})
 install(PROGRAMS ${PROJECT_SOURCE_DIR}/LICENSE DESTINATION ${CCL_INSTALL_LICENSE})
 
-set(CCL_MAJOR_VERSION     "0")
-set(CCL_MINOR_VERSION     "10")
+set(CCL_MAJOR_VERSION     "2021")
+set(CCL_MINOR_VERSION     "1")
 set(CCL_UPDATE_VERSION    "0")
-set(CCL_PRODUCT_STATUS    "beta")
+set(CCL_PRODUCT_STATUS    "Gold")
 string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ")
 get_vcs_properties("git")
 set(CCL_PRODUCT_FULL "${CCL_PRODUCT_STATUS}-${CCL_MAJOR_VERSION}.${CCL_MINOR_VERSION}.${CCL_UPDATE_VERSION} ${CCL_PRODUCT_BUILD_DATE} ${VCS_INFO}")
-configure_file(${PROJECT_SOURCE_DIR}/include/oneapi/ccl/ccl_config.h.in "${CMAKE_CURRENT_BINARY_DIR}/include/oneapi/ccl/ccl_config.h")
-file(COPY "${CMAKE_CURRENT_BINARY_DIR}/include/oneapi/ccl/ccl_config.h" DESTINATION ${PROJECT_SOURCE_DIR}/include/oneapi/ccl)
+configure_file(${PROJECT_SOURCE_DIR}/include/oneapi/ccl/config.h.in "${CMAKE_CURRENT_BINARY_DIR}/include/oneapi/ccl/config.h")
+file(COPY "${CMAKE_CURRENT_BINARY_DIR}/include/oneapi/ccl/config.h" DESTINATION ${PROJECT_SOURCE_DIR}/include/oneapi/ccl)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/include)
 
 
@@ -282,4 +282,7 @@ if (CCL_ENABLE_SYCL)
 endif()
 
 add_subdirectory(tests/functional)
-#add_subdirectory(tests/unit)
+
+if (BUILD_UT)
+    #add_subdirectory(tests/unit)
+endif()
diff --git a/cmake/FindIntelSYCL.cmake b/cmake/FindIntelSYCL.cmake
index 1a773f823..ed7775dd0 100644
--- a/cmake/FindIntelSYCL.cmake
+++ b/cmake/FindIntelSYCL.cmake
@@ -26,14 +26,13 @@ endif()
 
 set(OPENCLROOT "${dpcpp_root_hints}/include/sycl/CL/")
 
-if(MULTI_GPU_SUPPORT)
-    find_package(L0 REQUIRED)
+if(TRY_ENABLE_SYCL_L0)
+    find_package(L0)
     if(LevelZero_FOUND)
         set(COMPUTE_RUNTIME_NAME ze_loader)
     endif()
 endif()
 
-
 if (NOT COMPUTE_RUNTIME_NAME)
     message("Not OpenCL or L0")
 endif()
diff --git a/cmake/FindL0.cmake b/cmake/FindL0.cmake
index da69bf6ef..607b61de8 100644
--- a/cmake/FindL0.cmake
+++ b/cmake/FindL0.cmake
@@ -16,6 +16,10 @@ endif()
 
 list(INSERT CMAKE_PREFIX_PATH 0 ${l0_root_hints})
 
+if (TARGET ze_loader)
+    set(LevelZero_FOUND ON)
+endif()
+
 if(NOT TARGET ze_loader)
     find_path(LevelZero_INCLUDE_DIR
       NAMES ze_api.h
@@ -23,8 +27,10 @@ if(NOT TARGET ze_loader)
             ENV ZE_ROOT
             ${l0_root_hints}
       PATH_SUFFIXES
+            include
+            include/level_zero
             local/include
-            local/include/level_zero/
+            local/include/level_zero
       NO_DEFAULT_PATH
     )
 
@@ -35,8 +41,9 @@ if(NOT TARGET ze_loader)
             ${l0_root_hints}
       PATH_SUFFIXES
             lib
+            lib/x86_64-linux-gnu
+            lib/level_zero
             local/lib
-            lib/level_zero/
             local/lib/level_zero
       NO_DEFAULT_PATH
     )
@@ -58,15 +65,14 @@ if(NOT TARGET ze_loader)
             message("L0 is using OpenCL interoperability")
             list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
         endif()
+        add_library(ze_loader INTERFACE IMPORTED)
+        set_target_properties(ze_loader
+          PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
+        )
+        set_target_properties(ze_loader
+          PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
+        )
     endif()
-
-    add_library(ze_loader INTERFACE IMPORTED)
-    set_target_properties(ze_loader
-      PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
-    )
-    set_target_properties(ze_loader
-      PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
-    )
 endif()
 
 # Reverting the CMAKE_PREFIX_PATH to its original state
diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake
index 4002320ea..5d6cf2063 100644
--- a/cmake/helpers.cmake
+++ b/cmake/helpers.cmake
@@ -45,7 +45,10 @@ endfunction(get_vcs_properties)
 function(activate_compute_runtime MODULES_PATH COMPUTE_RUNTIME)
 
     string( TOLOWER "${COMPUTE_RUNTIME}" COMPUTE_RUNTIME)
+
     set(CCL_ENABLE_SYCL_V 0 PARENT_SCOPE)
+    set(CCL_ENABLE_SYCL_L0 0 PARENT_SCOPE)
+
     message("Search Compute Runtime by MODULES_PATH: ${MODULES_PATH}")
     list(APPEND CMAKE_MODULE_PATH "${MODULES_PATH}")
 
@@ -54,9 +57,12 @@ function(activate_compute_runtime MODULES_PATH COMPUTE_RUNTIME)
         SET (COMPUTE_RUNTIME_LOAD_MODULE "ComputeCpp"
                 CACHE STRING
              "COMPUTE_RUNTIME=${COMPUTE_RUNTIME} requested. Using ComputeCpp provider")
+
         find_package(${COMPUTE_RUNTIME_LOAD_MODULE} REQUIRED)
 
-        set (CCL_ENABLE_SYCL_V 1 PARENT_SCOPE)
+        if(NOT ComputeCpp_FOUND)
+            message(FATAL_ERROR "Failed to find ComputeCpp")
+        endif()
 
         # remember compilation flags, because flag required for OBJECTS target
         # but if we use `target_link_libraries`, then these flags applied to all compiler options
@@ -74,8 +80,17 @@ function(activate_compute_runtime MODULES_PATH COMPUTE_RUNTIME)
         SET (COMPUTE_RUNTIME_LOAD_MODULE "IntelSYCL"
                 CACHE STRING
              "COMPUTE_RUNTIME=${COMPUTE_RUNTIME} requested. Using DPC++ provider")
+
         find_package(${COMPUTE_RUNTIME_LOAD_MODULE} REQUIRED)
 
+        if(NOT IntelSYCL_FOUND)
+            message(FATAL_ERROR "Failed to find IntelSYCL")
+        endif()
+
+        if(LevelZero_FOUND)
+            set(CCL_ENABLE_SYCL_L0 1 PARENT_SCOPE)
+        endif()
+
         set(CCL_ENABLE_SYCL_V 1 PARENT_SCOPE)
 
         # remember compilation flags, because flag required for OBJECTS target
@@ -93,8 +108,14 @@ function(activate_compute_runtime MODULES_PATH COMPUTE_RUNTIME)
         SET (COMPUTE_RUNTIME_LOAD_MODULE "L0"
                 CACHE STRING
              "COMPUTE_RUNTIME=${COMPUTE_RUNTIME} requested")
+
         find_package(${COMPUTE_RUNTIME_LOAD_MODULE} REQUIRED)
 
+        if(NOT LevelZero_FOUND)
+            message(STATUS "Can not find level-zero")
+            return()
+        endif()
+
         # No compiler flags
         set (COMPUTE_RUNTIME_CXXFLAGS_LOCAL "")
 
@@ -103,6 +124,10 @@ function(activate_compute_runtime MODULES_PATH COMPUTE_RUNTIME)
         set (COMPUTE_RUNTIME_TARGET_NAME ze_loader PARENT_SCOPE)
     endif()
 
+    if (NOT COMPUTE_RUNTIME_TARGET_NAME)
+        message(FATAL_ERROR "Failed to find requested compute runtime: ${COMPUTE_RUNTIME}")
+    endif()
+
     # extract target properties
     get_target_property(COMPUTE_RUNTIME_INCLUDE_DIRS_LOCAL
                         ${COMPUTE_RUNTIME_TARGET_NAME} INTERFACE_INCLUDE_DIRECTORIES)
diff --git a/cmake/vars.sh.in b/cmake/vars.sh.in
index c77f71eab..ba9d5bc54 100644
--- a/cmake/vars.sh.in
+++ b/cmake/vars.sh.in
@@ -1,3 +1,4 @@
+#!/bin/bash
 #
 # Copyright 2016-2020 Intel Corporation
 # 
@@ -13,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-#!/bin/bash
 
 get_script_path() (
   script="$1"
diff --git a/doc/rst/source/device_communication.rst b/doc/rst/source/device_communication.rst
index 842bfa502..4e8ad3c0d 100644
--- a/doc/rst/source/device_communication.rst
+++ b/doc/rst/source/device_communication.rst
@@ -17,7 +17,7 @@ Consider a simple oneCCL ``allreduce`` example for GPU:
 
 .. code:: cpp
 
-    auto ccl_device_context = ccl::create_context(sycl_context);
+    auto ccl_context = ccl::create_context(sycl_context);
     auto ccl_device = ccl::create_device(sycl_device);
 
     auto comms = ccl::create_communicators(
@@ -56,9 +56,9 @@ Consider a simple oneCCL ``allreduce`` example for GPU:
 .. code:: cpp
 
     /* using SYCL buffer and accessor */
-    sycl_queue.submit([&](cl::sycl::handler& cgh) {
-        auto send_buf_dev_acc = send_buf.get_access<mode::write>(cgh);
-        cgh.parallel_for<class allreduce_send_buf_modify>(range<1>{elem_count}, [=](item<1> idx) {
+    sycl_queue.submit([&](cl::sycl::handler& h) {
+        auto send_buf_dev_acc = send_buf.get_access<mode::write>(h);
+        h.parallel_for(range<1>{elem_count}, [=](item<1> idx) {
             send_buf_dev_acc[idx] += 1;
         });
     });
@@ -100,9 +100,9 @@ Consider a simple oneCCL ``allreduce`` example for GPU:
     auto comm_size = comm.size();
     auto expected = comm_size * (comm_size + 1) / 2;
 
-    sycl_queue.submit([&](handler& cgh) {
-        auto recv_buf_dev_acc = recv_buf.get_access<mode::write>(cgh);
-        cgh.parallel_for<class allreduce_recv_buf_check>(range<1>{elem_count}, [=](item<1> idx) {
+    sycl_queue.submit([&](handler& h) {
+        auto recv_buf_dev_acc = recv_buf.get_access<mode::write>(h);
+        h.parallel_for(range<1>{elem_count}, [=](item<1> idx) {
             if (recv_buf_dev_acc[idx] != expected) {
                 recv_buf_dev_acc[idx] = -1;
             }
diff --git a/doc/rst/source/env_variables.rst b/doc/rst/source/env_variables.rst
index ec6374891..e5cf0f7d4 100755
--- a/doc/rst/source/env_variables.rst
+++ b/doc/rst/source/env_variables.rst
@@ -534,3 +534,28 @@ CCL_LOG_LEVEL
 **Description**
 
 Set this environment variable to control logging level.
+
+
+CCL_MAX_SHORT_SIZE
+##################
+**Syntax**
+
+:: 
+
+  CCL_MAX_SHORT_SIZE=<value>
+
+**Arguments**
+
+.. list-table:: 
+   :widths: 25 50
+   :header-rows: 1
+   :align: left
+   
+   * - <value> 
+     - Description
+   * - ``SIZE``
+     - Bytes threshold for a collective operation (``0`` if not specified). If the size of a communication buffer in bytes is less than or equal to ``SIZE``, then |product_short| does not split the operation between workers. Applicable to ``allreduce``, ``reduce`` and ``broadcast``.
+
+**Description**
+
+Set this environment variable to specify the buffer size threshold, in bytes, above which a collective operation is split between workers.
diff --git a/doc/rst/source/index.rst b/doc/rst/source/index.rst
index 63e525be9..5c3430aa0 100755
--- a/doc/rst/source/index.rst
+++ b/doc/rst/source/index.rst
@@ -12,7 +12,7 @@
 - Optimized to drive scalability of communication patterns by allowing to easily trade-off compute for communication performance.
 - Enables a set of DL-specific optimizations, such as prioritization, persistent operations, or out-of-order execution.
 - Works across various interconnects: Intel(R) Omni-Path Architecture, InfiniBand*, and Ethernet.
-- Provides common API sufficient to support communication workflows within Deep Learning frameworks (such as PyTorch*, Horovod*).
+- Provides common API sufficient to support communication workflows within Deep Learning / distributed frameworks (such as PyTorch*, Horovod*).
 
 |product_short| package comprises the |product_short| Software Development Kit (SDK) and the Intel(R) MPI Library Runtime components.
 
@@ -34,6 +34,7 @@ Contents:
    specification.rst
    host_communication.rst
    device_communication.rst
+   limitations.rst
 
 .. toctree::
    :maxdepth: 1
diff --git a/doc/rst/source/installation.rst b/doc/rst/source/installation.rst
index 987a820e8..03faf19f0 100755
--- a/doc/rst/source/installation.rst
+++ b/doc/rst/source/installation.rst
@@ -10,11 +10,7 @@ Installation
 
 This page explains how to install and configure the |product_full| (|product_short|).
 
-|product_short| supports different installation scenarios:
-
-* `Installation using command line interface`_
-* `Installation using tar.gz`_
-* `Installation using RPM`_
+|product_short| supports different installation scenarios using the command line interface.
 
 .. note:: Visit |sys_req|_ to learn about hardware and software requirements for |product_short|.
 
@@ -64,93 +60,30 @@ You can customize CLI-based installation (for example, specify directory, compil
 
   ::
 
-    cmake .. -DCMAKE_INSTALL_PREFIX=/path/to/installation/directory
+    cmake .. -DCMAKE_INSTALL_PREFIX=</path/to/installation/directory>
 
-  If no ``-DCMAKE_INSTALL_PREFIX`` is specified, |product_short| is installed into the ``_install`` subdirectory of the current build directory.
-  For example, ``ccl/build/_install``.
+  If no ``-DCMAKE_INSTALL_PREFIX`` is specified, |product_short| is installed into the ``_install`` subdirectory of the current build directory. For example, ``ccl/build/_install``.
 
 * To specify **compiler**, modify the ``cmake`` command:
 
   ::
 
-     cmake .. -DCMAKE_C_COMPILER=your_c_compiler -DCMAKE_CXX_COMPILER=your_cxx_compiler
-
-  If ``CMAKE_CXX_COMPILER`` requires ``SYCL`` cross-platform abstraction level it should be specified in ``-DCOMPUTE_RUNTIME`` ( ``compute++`` and ``dpcpp`` supported only):
-
-  ::
-
-     cmake .. -DCMAKE_C_COMPILER=your_c_compiler -DCMAKE_CXX_COMPILER=compute++ -DCOMPUTE_RUNTIME=computecpp
-     cmake .. -DCMAKE_C_COMPILER=your_c_compiler -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_RUNTIME=dpcpp
+     cmake .. -DCMAKE_C_COMPILER=<c_compiler> -DCMAKE_CXX_COMPILER=<cxx_compiler>
 
-  OpenCL search location path hint can be specified by using standart environment ``OPENCLROOT`` additionally:
+  To enable communication support for ``SYCL`` devices, specify a ``SYCL`` compiler and set ``-DCOMPUTE_RUNTIME`` (only DPC++ is supported):
 
   ::
 
-     OPENCLROOT=your_opencl_location cmake .. -DCMAKE_C_COMPILER=your_c_compiler -DCMAKE_CXX_COMPILER=compute++ -DCOMPUTE_RUNTIME=computecpp
-
+     cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=dpcpp -DCOMPUTE_RUNTIME=dpcpp
 
 * To specify the **build type**, modify the ``cmake`` command:
 
   ::
 
-     cmake .. -DCMAKE_BUILD_TYPE=[Debug|Release|RelWithDebInfo|MinSizeRel]
+     cmake .. -DCMAKE_BUILD_TYPE=[Debug|Release]
 
 * To enable ``make`` verbose output to see all parameters used by ``make`` during compilation and linkage, modify the ``make`` command as follows:
 
   ::
 
-     make -j VERBOSE=1
-
-* To archive installed files:
-
-  ::
-
-     make -j install
-
-* To build with Address Sanitizer, modify the ``cmake`` command as follow:
-
-  ::
-
-     cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_ASAN=true
-
-  Make sure that ``libasan.so`` exists.
-
-  .. note::
-
-     Address sanitizer only works in the debug build.
-
-Binary releases are available on our release page.
-
-Installation using tar.gz
-*************************
-
-To install |product_short| using the |tgz_file|_ in a user mode, execute the following commands:
-
-.. prompt:: bash
-
-   tar zxf l_ccl-devel-64-<version>.<update>.<package#>.tgz
-   cd l_ccl_<version>.<update>.<package#>
-   ./install.sh
-
-There is no uninstall script. To uninstall |product_short|, delete the whole installation directory.
-
-Installation using RPM
-**********************
-
-You can get |product_short| through the RPM Package Manager. To install the library in a root mode using RPM, follow these steps:
-
-#. Log in as root.
-
-#. Install the following package:
-
-  .. prompt:: bash
-
-     rpm -i intel-ccl-devel-64-<version>.<update>-<package#>.x86_64.rpm
-
-     where ``<version>.<update>-<package#>`` is a string. For example, ``2017.0-009``.
-
-To uninstall |product_short| using the RPM Package Manager, execute this command:
-
-  .. prompt:: bash
-
-     rpm -e intel-ccl-devel-64-<version>.<update>-<package#>.x86_64
+     make -j VERBOSE=1 install
diff --git a/doc/rst/source/limitations.rst b/doc/rst/source/limitations.rst
new file mode 100644
index 000000000..004e9e53f
--- /dev/null
+++ b/doc/rst/source/limitations.rst
@@ -0,0 +1,9 @@
+===========
+Limitations
+===========
+
+The list of scenarios not yet supported by oneCCL:
+
+- Creation of multiple ranks within single process
+- Handling of dependencies as operation parameter (for example, ``deps`` vector in ``ccl::allreduce(..., deps)``)
+- Float16 datatype support
diff --git a/doc/rst/source/sample.rst b/doc/rst/source/sample.rst
index e69e04958..f1db10d70 100755
--- a/doc/rst/source/sample.rst
+++ b/doc/rst/source/sample.rst
@@ -25,15 +25,23 @@ The sample code below shows how to use |product_short| API to perform allreduce
         MPI_Comm_size(MPI_COMM_WORLD, &size);
         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
+        atexit(mpi_finalize);
+
         queue q;
-        if (!create_sycl_queue(argc, argv, q)) {
-            MPI_Finalize();
+        if (!create_sycl_queue(argc, argv, rank, q)) {
             return -1;
         }
 
-        /* allocate USM buffers */
-        auto send_buf = aligned_alloc_shared<int>(64, count, q);
-        auto recv_buf = aligned_alloc_shared<int>(64, count, q);
+        buf_allocator<int> allocator(q);
+
+        auto usm_alloc_type = usm::alloc::shared;
+        if (argc > 2) {
+            usm_alloc_type = usm_alloc_type_from_string(argv[2]);
+        }
+
+        if (!check_sycl_usm(q, usm_alloc_type)) {
+            return -1;
+        }
 
         /* create kvs */
         ccl::shared_ptr_class<ccl::kvs> kvs;
@@ -56,18 +64,15 @@ The sample code below shows how to use |product_short| API to perform allreduce
         /* create stream */
         auto stream = ccl::create_stream(q);
 
-        {
-            /* open buffers and initialize them on the host side */
-            for (i = 0; i < count; i++) {
-                send_buf[i] = rank;
-                recv_buf[i] = -1;
-            }
-        }
+        /* create buffers */
+        auto send_buf = allocator.allocate(count, usm_alloc_type);
+        auto recv_buf = allocator.allocate(count, usm_alloc_type);
 
-        /* open send_buf and modify it on the device side */
-        q.submit([&](auto &h) {
+        /* open buffers and modify them on the device side */
+        auto e = q.submit([&](auto &h) {
             h.parallel_for(count, [=](auto id) {
-                send_buf[id] += 1;
+                send_buf[id] = rank + 1;
+                recv_buf[id] = -1;
             });
         });
 
@@ -105,21 +110,14 @@ The sample code below shows how to use |product_short| API to perform allreduce
             }
         }
 
-        free(send_buf, q);
-        free(recv_buf, q);
-
-        MPI_Finalize();
-
         return 0;
     }
 
 
-
-
 Build details
 *************
 
-#. |product_short| should be built with SYCL* support.
+#. |product_short| should be built with ``SYCL`` support (DPC++ supported only).
 
 #. Set up the library environment (see :doc:`prerequisites`).
 
diff --git a/doc/rst/source/sparse_collectives.rst b/doc/rst/source/sparse_collectives.rst
index 4e4440bcf..5c30baf9e 100755
--- a/doc/rst/source/sparse_collectives.rst
+++ b/doc/rst/source/sparse_collectives.rst
@@ -84,13 +84,13 @@ Completion callback should follow the signature:
 
         typedef void (*completion_fn)
         (
-            const void*,   // idx_buf
-            size_t,        // idx_count
-            ccl::datatype, // idx_dtype
-            const void*,   // val_buf
-            size_t,        // val_count
-            ccl::datatype, // val_dtype
-            const void*    // user_context
+            const void*,   /* idx_buf      */
+            size_t,        /* idx_count    */
+            ccl::datatype, /* idx_dtype    */
+            const void*,   /* val_buf      */
+            size_t,        /* val_count    */
+            ccl::datatype, /* val_dtype    */
+            const void*    /* user_context */
         );
 
 Note that ``idx_buf`` and ``val_buf`` are temporary buffers.
@@ -103,18 +103,14 @@ Allocation callback should follow the signature:
 
         typedef void (*alloc_fn)
         (
-            size_t,        // idx_count
-            ccl::datatype, // idx_dtype
-            size_t,        // val_count
-            ccl::datatype, // val_dtype
-            const void*,   // user_context
-            void**,        // out_idx_buf
-            void**         // out_val_buf
+            size_t,        /* idx_count    */
+            ccl::datatype, /* idx_dtype    */
+            size_t,        /* val_count    */
+            ccl::datatype, /* val_dtype    */
+            const void*,   /* user_context */
+            void**,        /* out_idx_buf  */
+            void**         /* out_val_buf  */
         );
 
-
-For more details, refer to `this example <https://github.com/oneapi-src/oneCCL/blob/master/examples/cpu/sparse_allreduce.cpp>`_.
-
-
 .. note::
     WARNING: ``ccl::sparse_allreduce`` is experimental and subject to change.
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 71f32028d..24ce78b4d 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -15,6 +15,11 @@
 #
 file(GLOB sources "./src/*.c" "./src/*.cpp")
 
+if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+    set(CMAKE_CXX_STANDARD 17)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+endif()
+
 include_directories(include)
 include_directories(src)
 
@@ -29,5 +34,5 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC dl)
     target_link_libraries(${executable} PRIVATE m)
     target_link_libraries(${executable} PUBLIC mpi)
-    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/benchmark)
+    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/benchmark OPTIONAL)
 endforeach()
diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp
index 5a5fc9a09..8525b498c 100644
--- a/examples/benchmark/include/benchmark.hpp
+++ b/examples/benchmark/include/benchmark.hpp
@@ -36,134 +36,52 @@ using namespace cl::sycl;
 using namespace cl::sycl::access;
 #endif /* CCL_ENABLE_SYCL */
 
+#include "base.hpp"
 #include "base_utils.hpp"
 #include "bf16.hpp"
 #include "coll.hpp"
 #include "sparse_allreduce/sparse_detail.hpp"
 
-/* specific benchmark variables */
-// TODO: add ccl::bf16
-constexpr std::initializer_list<ccl::datatype> all_dtypes = {
-    ccl::datatype::int8,    ccl::datatype::int32, ccl::datatype::float32,
-    ccl::datatype::float64, ccl::datatype::int64, ccl::datatype::uint64
-};
-
-/* specific benchmark defines */
-
-#define PRINT(fmt, ...) printf(fmt "\n", ##__VA_ARGS__);
-
-#ifndef PRINT_BY_ROOT
-#define PRINT_BY_ROOT(comm, fmt, ...) \
-    if (comm.rank() == 0) { \
-        printf(fmt "\n", ##__VA_ARGS__); \
-    }
-#endif //PRINT_BY_ROOT
-
-#define ASSERT(cond, fmt, ...) \
-    do { \
-        if (!(cond)) { \
-            printf("FAILED\n"); \
-            fprintf(stderr, "ASSERT '%s' FAILED " fmt "\n", #cond, ##__VA_ARGS__); \
-            throw std::runtime_error("ASSERT FAILED"); \
-        } \
-    } while (0)
-
-typedef enum { BACKEND_HOST, BACKEND_SYCL } backend_type_t;
-typedef enum { LOOP_REGULAR, LOOP_UNORDERED } loop_type_t;
-typedef enum { BUF_SINGLE, BUF_MULTI } buf_type_t;
-
-#define DEFAULT_BACKEND BACKEND_HOST
-#define DEFAULT_LOOP    LOOP_REGULAR
-#define DEFAULT_BUF     BUF_SINGLE
-
-std::map<backend_type_t, std::string> backend_names = {
-    std::make_pair(BACKEND_HOST, "host"),
-    std::make_pair(BACKEND_SYCL, "sycl")
-};
-
-std::map<loop_type_t, std::string> loop_names = { std::make_pair(LOOP_REGULAR, "regular"),
-                                                  std::make_pair(LOOP_UNORDERED, "unordered") };
-
-std::map<buf_type_t, std::string> buf_names = { std::make_pair(BUF_MULTI, "multi"),
-                                                std::make_pair(BUF_SINGLE, "single") };
-
-// TODO: add ccl::bf16
-std::map<ccl::datatype, std::string> dtype_names = {
-    std::make_pair(ccl::datatype::int8, "char"),
-    std::make_pair(ccl::datatype::int32, "int"),
-    std::make_pair(ccl::datatype::float32, "float"),
-    std::make_pair(ccl::datatype::float64, "double"),
-    std::make_pair(ccl::datatype::int64, "int64"),
-    std::make_pair(ccl::datatype::uint64, "uint64"),
-};
-
-std::map<ccl::reduction, std::string> reduction_names = {
-    std::make_pair(ccl::reduction::sum, "sum"),
-    std::make_pair(ccl::reduction::prod, "prod"),
-    std::make_pair(ccl::reduction::min, "min"),
-    std::make_pair(ccl::reduction::max, "max"),
-};
-
-// variables for setting dtypes to launch benchmark
-// TODO: add ccl::bf16
-template <class native_type>
-using checked_dtype_t = std::pair<bool, native_type>;
-using supported_dtypes_t = std::tuple<checked_dtype_t<char>,
-                                      checked_dtype_t<int>,
-                                      checked_dtype_t<float>,
-                                      checked_dtype_t<double>,
-                                      checked_dtype_t<int64_t>,
-                                      checked_dtype_t<uint64_t>>;
-supported_dtypes_t launch_dtypes;
-
-/* specific benchmark functions */
 void print_help_usage(const char* app) {
-    PRINT(
-        "\nUSAGE:\n"
-        "\t%s [OPTIONS]\n\n"
-        "OPTIONS:\n"
-        "\t[-b,--backend <backend>]: %s\n"
-        "\t[-e,--loop <execution loop>]: %s\n"
-        "\t[-l,--coll <collectives list>]: %s\n"
-        "\t[-i,--iters <iteration count>]: %d\n"
-        "\t[-w,--warmup_iters <warm up iteration count>]: %d\n"
-        "\t[-p,--buf_count <number of parallel operations within single collective>]: %d\n"
-        "\t[-f,--min_elem_count <minimum number of elements for single collective>]: %d\n"
-        "\t[-t,--max_elem_count <maximum number of elements for single collective>]: %d\n"
-        "\t[-c,--check <check result correctness>]: %d\n"
-        "\t[-v,--v2i_ratio <values to indices ratio in sparse_allreduce>]: %d\n"
-        "\t[-d,--dtype <datatypes list/all>]: %s\n"
-        "\t[-r,--reduction <reductions list/all>]: %s\n"
-        "\t[-n,--buf_type <buffer type>]: %s\n"
-        "\t[-o,--csv_filepath <file to store CSV-formatted data into>]: %s\n"
-        "\t[-h,--help]\n\n"
-        "example:\n\t--coll allgatherv,allreduce,sparse_allreduce,sparse_allreduce_bf16 --backend host --loop regular\n"
-        "example:\n\t--coll bcast,reduce --backend sycl --loop unordered \n",
-        app,
-        backend_names[DEFAULT_BACKEND].c_str(),
-        loop_names[DEFAULT_LOOP].c_str(),
-        DEFAULT_COLL_LIST,
-        DEFAULT_ITERS,
-        DEFAULT_WARMUP_ITERS,
-        DEFAULT_BUF_COUNT,
-        DEFAULT_MIN_ELEM_COUNT,
-        DEFAULT_MAX_ELEM_COUNT,
-        DEFAULT_CHECK_VALUES,
-        DEFAULT_V2I_RATIO,
-        DEFAULT_DTYPES_LIST,
-        DEFAULT_REDUCTIONS_LIST,
-        buf_names[DEFAULT_BUF_TYPE].c_str(),
-        DEFAULT_CSV_FILEPATH);
-}
-
-std::list<std::string> tokenize(const std::string& input, char delimeter) {
-    std::stringstream ss(input);
-    std::list<std::string> ret;
-    std::string value;
-    while (std::getline(ss, value, delimeter)) {
-        ret.push_back(value);
-    }
-    return ret;
+    PRINT("\nUSAGE:\n"
+          "\t%s [OPTIONS]\n\n"
+          "OPTIONS:\n"
+          "\t[-b,--backend <backend>]: %s\n"
+          "\t[-e,--loop <execution loop>]: %s\n"
+          "\t[-i,--iters <iteration count>]: %d\n"
+          "\t[-w,--warmup_iters <warm up iteration count>]: %d\n"
+          "\t[-p,--buf_count <number of parallel operations within single collective>]: %d\n"
+          "\t[-f,--min_elem_count <minimum number of elements for single collective>]: %d\n"
+          "\t[-t,--max_elem_count <maximum number of elements for single collective>]: %d\n"
+          "\t[-c,--check <check result correctness>]: %d\n"
+          "\t[-a,--sycl_dev_type <sycl device type>]: %s\n"
+          "\t[-m,--sycl_mem_type <sycl memory type>]: %s\n"
+          "\t[-u,--sycl_usm_type <sycl usm type>]: %s\n"
+          "\t[-k,--ranks_per_proc <rank count per process>]: %d\n"
+          "\t[-l,--coll <collectives list/all>]: %s\n"
+          "\t[-d,--dtype <datatypes list/all>]: %s\n"
+          "\t[-r,--reduction <reductions list/all>]: %s\n"
+          "\t[-o,--csv_filepath <file to store CSV-formatted data into>]: %s\n"
+          "\t[-h,--help]\n\n"
+          "example:\n\t--coll allgatherv,allreduce --backend host --loop regular\n"
+          "example:\n\t--coll bcast,reduce --backend sycl --loop unordered \n",
+          app,
+          backend_names[DEFAULT_BACKEND].c_str(),
+          loop_names[DEFAULT_LOOP].c_str(),
+          DEFAULT_ITERS,
+          DEFAULT_WARMUP_ITERS,
+          DEFAULT_BUF_COUNT,
+          DEFAULT_MIN_ELEM_COUNT,
+          DEFAULT_MAX_ELEM_COUNT,
+          DEFAULT_CHECK_VALUES,
+          sycl_dev_names[DEFAULT_SYCL_DEV_TYPE].c_str(),
+          sycl_mem_names[DEFAULT_SYCL_MEM_TYPE].c_str(),
+          sycl_usm_names[DEFAULT_SYCL_USM_TYPE].c_str(),
+          DEFAULT_RANKS_PER_PROC,
+          DEFAULT_COLL_LIST,
+          DEFAULT_DTYPES_LIST,
+          DEFAULT_REDUCTIONS_LIST,
+          DEFAULT_CSV_FILEPATH);
 }
 
 template <class Dtype, class Container>
@@ -237,95 +155,109 @@ int set_loop(const std::string& option_value, loop_type_t& loop) {
     return 0;
 }
 
-int set_buf_type(const std::string& option_value, buf_type_t& buf) {
-    std::string option_name = "buf_type";
-    std::set<std::string> supported_option_values{ buf_names[BUF_SINGLE], buf_names[BUF_MULTI] };
+int set_sycl_dev_type(const std::string& option_value, sycl_dev_type_t& dev) {
+    std::string option_name = "sycl_dev_type";
+    std::set<std::string> supported_option_values{ sycl_dev_names[SYCL_DEV_HOST],
+                                                   sycl_dev_names[SYCL_DEV_CPU],
+                                                   sycl_dev_names[SYCL_DEV_GPU] };
+
+    if (check_supported_options(option_name, option_value, supported_option_values))
+        return -1;
+
+    if (option_value == sycl_dev_names[SYCL_DEV_HOST])
+        dev = SYCL_DEV_HOST;
+    else if (option_value == sycl_dev_names[SYCL_DEV_CPU])
+        dev = SYCL_DEV_CPU;
+    else if (option_value == sycl_dev_names[SYCL_DEV_GPU])
+        dev = SYCL_DEV_GPU;
+
+    return 0;
+}
+
+int set_sycl_mem_type(const std::string& option_value, sycl_mem_type_t& mem) {
+    std::string option_name = "sycl_mem_type";
+    std::set<std::string> supported_option_values{ sycl_mem_names[SYCL_MEM_USM],
+                                                   sycl_mem_names[SYCL_MEM_BUF] };
 
     if (check_supported_options(option_name, option_value, supported_option_values))
         return -1;
 
-    buf = (option_value == buf_names[BUF_SINGLE]) ? BUF_SINGLE : BUF_MULTI;
+    mem = (option_value == sycl_mem_names[SYCL_MEM_USM]) ? SYCL_MEM_USM : SYCL_MEM_BUF;
 
     return 0;
 }
 
-// leave this dtype here because of tokenize() call
-typedef struct user_options_t {
-    backend_type_t backend;
-    loop_type_t loop;
-    size_t iters;
-    size_t warmup_iters;
-    size_t buf_count;
-    size_t min_elem_count;
-    size_t max_elem_count;
-    int check_values;
-    buf_type_t buf_type;
-    size_t v2i_ratio;
-    std::list<std::string> coll_names;
-    std::list<std::string> dtypes;
-    std::list<std::string> reductions;
-    std::string csv_filepath;
-
-    user_options_t() {
-        backend = DEFAULT_BACKEND;
-        loop = DEFAULT_LOOP;
-        coll_names = tokenize(DEFAULT_COLL_LIST, ',');
-        iters = DEFAULT_ITERS;
-        warmup_iters = DEFAULT_WARMUP_ITERS;
-        buf_count = DEFAULT_BUF_COUNT;
-        min_elem_count = DEFAULT_MIN_ELEM_COUNT;
-        max_elem_count = DEFAULT_MAX_ELEM_COUNT;
-        check_values = DEFAULT_CHECK_VALUES;
-        buf_type = DEFAULT_BUF_TYPE;
-        v2i_ratio = DEFAULT_V2I_RATIO;
-        dtypes = tokenize(DEFAULT_DTYPES_LIST, ',');
-        reductions = tokenize(DEFAULT_REDUCTIONS_LIST, ',');
-        csv_filepath = std::string(DEFAULT_CSV_FILEPATH);
+int set_sycl_usm_type(const std::string& option_value, sycl_usm_type_t& usm) {
+    std::string option_name = "sycl_usm_type";
+    std::set<std::string> supported_option_values{ sycl_usm_names[SYCL_USM_SHARED],
+                                                   sycl_usm_names[SYCL_USM_DEVICE] };
+
+    if (check_supported_options(option_name, option_value, supported_option_values))
+        return -1;
+
+    usm = (option_value == sycl_usm_names[SYCL_USM_SHARED]) ? SYCL_USM_SHARED : SYCL_USM_DEVICE;
+
+    return 0;
+}
+
+size_t get_iter_count(size_t bytes, size_t max_iter_count) {
+    size_t n, res = max_iter_count;
+    n = bytes >> 18;
+    while (n) {
+        res >>= 1;
+        n >>= 1;
     }
-} user_options_t;
 
-/* placing print_timings() here is because of declaration of user_options_t */
-// FIXME FS: what?
-void print_timings(const ccl::communicator& comm,
-                   const std::vector<double>& timer,
+    if (!res && max_iter_count)
+        res = 1;
+
+    return res;
+}
+
+/* local_timers contains one number per collective, one collective corresponds to ranks_per_proc ranks */
+void print_timings(ccl::communicator& comm,
+                   const std::vector<double>& local_timers,
                    const user_options_t& options,
                    const size_t elem_count,
+                   const size_t iter_count,
                    ccl::datatype dtype,
                    ccl::reduction op) {
-    const size_t buf_count = options.buf_type == BUF_SINGLE ? 1 : options.buf_count;
+    const size_t buf_count = options.buf_count;
     const size_t ncolls = options.coll_names.size();
     std::vector<double> all_timers(ncolls * comm.size());
     std::vector<size_t> recv_counts(comm.size());
 
-    size_t idx;
+    int idx;
     for (idx = 0; idx < comm.size(); idx++)
         recv_counts[idx] = ncolls;
 
-    ccl::allgatherv(timer.data(), ncolls, all_timers.data(), recv_counts, comm).wait();
+    ccl::allgatherv(local_timers.data(), ncolls, all_timers.data(), recv_counts, comm).wait();
 
     if (comm.rank() == 0) {
         std::vector<double> timers(comm.size(), 0);
-        for (size_t r = 0; r < comm.size(); ++r) {
+        for (int r = 0; r < comm.size(); ++r) {
             for (size_t c = 0; c < ncolls; ++c) {
                 timers[r] += all_timers[r * ncolls + c];
             }
         }
+
         double avg_timer(0);
         double avg_timer_per_buf(0);
         for (idx = 0; idx < comm.size(); idx++) {
             avg_timer += timers[idx];
         }
-        avg_timer /= (options.iters * comm.size());
+        avg_timer /= (iter_count * comm.size());
         avg_timer_per_buf = avg_timer / buf_count;
 
         double stddev_timer = 0;
         double sum = 0;
         for (idx = 0; idx < comm.size(); idx++) {
-            double val = timers[idx] / options.iters;
+            double val = timers[idx] / iter_count;
             sum += (val - avg_timer) * (val - avg_timer);
         }
+
         stddev_timer = sqrt(sum / comm.size()) / avg_timer * 100;
-        if (options.buf_type == BUF_SINGLE) {
+        if (buf_count == 1) {
             printf("%10zu %12.2lf %11.1lf\n",
                    elem_count * ccl::get_datatype_size(dtype) * buf_count,
                    avg_timer,
@@ -345,57 +277,41 @@ void print_timings(const ccl::communicator& comm,
         if (!options.csv_filepath.empty()) {
             std::ofstream csvf;
             csvf.open(options.csv_filepath, std::ios::app);
+
             if (csvf.is_open()) {
                 std::vector<double> avg_timer(ncolls, 0);
-                for (size_t r = 0; r < comm.size(); ++r) {
+
+                for (int r = 0; r < comm.size(); ++r) {
                     for (size_t c = 0; c < ncolls; ++c) {
                         avg_timer[c] += all_timers[r * ncolls + c];
                     }
                 }
+
                 for (size_t c = 0; c < ncolls; ++c) {
-                    avg_timer[c] /= (options.iters * comm.size());
+                    avg_timer[c] /= (iter_count * comm.size());
                 }
 
-                int idx = 0;
+                int i = 0;
                 for (auto cop = options.coll_names.begin(); cop != options.coll_names.end();
-                     ++cop, ++idx) {
+                     ++cop, ++i) {
                     csvf << comm.size() << "," << (*cop) << "," << reduction_names[op] << ","
-                         << dtype_names[dtype] << ","
-                         << ccl::get_datatype_size(dtype) << ","
-                         << elem_count << "," << buf_count << "," << avg_timer[idx] << std::endl;
+                         << dtype_names[dtype] << "," << ccl::get_datatype_size(dtype) << ","
+                         << elem_count << "," << buf_count << "," << avg_timer[i] << std::endl;
                 }
                 csvf.close();
             }
         }
     }
+
     ccl::barrier(comm);
 }
 
-/* specific benchmark functors */
-class set_dtypes_func {
-private:
-    const std::list<std::string>& dtypes;
-
-public:
-    set_dtypes_func(const std::list<std::string>& dtypes) : dtypes(dtypes) {}
-
-    template <class Dtype>
-    void operator()(checked_dtype_t<Dtype>& val) {
-        auto it = std::find(dtypes.begin(), dtypes.end(), ccl::native_type_info<Dtype>::name());
-        if (it != std::end(dtypes)) {
-            val.first = true;
-        }
-    }
-};
-
-int parse_user_options(int& argc,
-                       char**(&argv),
-                       user_options_t& options) {
+int parse_user_options(int& argc, char**(&argv), user_options_t& options) {
     int ch;
     int errors = 0;
 
-    // values needed by getopt
-    const char* const short_options = "b:e:i:w:p:f:t:c:v:l:d:r:n:o:h:";
+    const char* const short_options = "b:e:i:w:p:f:t:c:v:o:a:m:u:k:l:d:r:h";
+
     struct option getopt_options[] = {
         { "backend", required_argument, 0, 'b' },
         { "loop", required_argument, 0, 'e' },
@@ -406,10 +322,13 @@ int parse_user_options(int& argc,
         { "max_elem_count", required_argument, 0, 't' },
         { "check", required_argument, 0, 'c' },
         { "v2i_ratio", required_argument, 0, 'v' },
+        { "sycl_dev_type", required_argument, 0, 'a' },
+        { "sycl_mem_type", required_argument, 0, 'm' },
+        { "sycl_usm_type", required_argument, 0, 'u' },
+        { "ranks_per_proc", required_argument, 0, 'k' }, // matches usage text; "--ranks" still works as an abbreviation
         { "coll", required_argument, 0, 'l' },
         { "dtype", required_argument, 0, 'd' },
         { "reduction", required_argument, 0, 'r' },
-        { "buf_type", required_argument, 0, 'n' },
         { "csv_filepath", required_argument, 0, 'o' },
         { "help", no_argument, 0, 'h' },
         { 0, 0, 0, 0 } // required at end of array.
@@ -418,12 +337,16 @@ int parse_user_options(int& argc,
     while ((ch = getopt_long(argc, argv, short_options, getopt_options, NULL)) != -1) {
         switch (ch) {
             case 'b':
-                if (set_backend(optarg, options.backend))
+                if (set_backend(optarg, options.backend)) {
+                    PRINT("failed to parse 'backend' option");
                     errors++;
+                }
                 break;
             case 'e':
-                if (set_loop(optarg, options.loop))
+                if (set_loop(optarg, options.loop)) {
+                    PRINT("failed to parse 'loop' option");
                     errors++;
+                }
                 break;
             case 'i': options.iters = atoll(optarg); break;
             case 'w': options.warmup_iters = atoll(optarg); break;
@@ -432,7 +355,32 @@ int parse_user_options(int& argc,
             case 't': options.max_elem_count = atoll(optarg); break;
             case 'c': options.check_values = atoi(optarg); break;
             case 'v': options.v2i_ratio = atoll(optarg); break;
-            case 'l': options.coll_names = tokenize(optarg, ','); break;
+            case 'a':
+                if (set_sycl_dev_type(optarg, options.sycl_dev_type)) {
+                    PRINT("failed to parse 'sycl_dev_type' option");
+                    errors++;
+                }
+                break;
+            case 'm':
+                if (set_sycl_mem_type(optarg, options.sycl_mem_type)) {
+                    PRINT("failed to parse 'sycl_mem_type' option");
+                    errors++;
+                }
+                break;
+            case 'u':
+                if (set_sycl_usm_type(optarg, options.sycl_usm_type)) {
+                    PRINT("failed to parse 'sycl_usm_type' option");
+                    errors++;
+                }
+                break;
+            case 'k': options.ranks_per_proc = atoll(optarg); break;
+            case 'l':
+                if (strcmp("all", optarg) == 0) {
+                    options.coll_names = tokenize(ALL_COLLS_LIST, ',');
+                }
+                else
+                    options.coll_names = tokenize(optarg, ',');
+                break;
             case 'd':
                 if (strcmp("all", optarg) == 0) {
                     options.dtypes = tokenize(ALL_DTYPES_LIST, ',');
@@ -447,13 +395,12 @@ int parse_user_options(int& argc,
                 else
                     options.reductions = tokenize(optarg, ',');
                 break;
-            case 'n':
-                if (set_buf_type(optarg, options.buf_type))
-                    errors++;
-                break;
             case 'o': options.csv_filepath = std::string(optarg); break;
-            case 'h': print_help_usage(argv[0]); return -1;
-            default: errors++; break;
+            case 'h': return -1;
+            default:
+                PRINT("failed to parse unknown option");
+                errors++;
+                break;
         }
     }
 
@@ -464,14 +411,23 @@ int parse_user_options(int& argc,
 
     if (errors > 0) {
         PRINT("found %d errors while parsing user options", errors);
+        for (int idx = 0; idx < argc; idx++) {
+            PRINT("arg %d: %s", idx, argv[idx]);
+        }
         return -1;
     }
 
+    /* adjust user options */
+    if (!options.min_elem_count)
+        options.min_elem_count = 1;
+
+    if (options.max_elem_count < options.min_elem_count)
+        options.max_elem_count = options.min_elem_count;
+
     return 0;
 }
 
-void print_user_options(const user_options_t& options,
-                        const ccl::communicator& comm) {
+void print_user_options(const user_options_t& options, const ccl::communicator& comm) {
     std::stringstream ss;
     ss << "colls:          ";
     std::copy(options.coll_names.begin(),
@@ -487,11 +443,14 @@ void print_user_options(const user_options_t& options,
 
     std::string backend_str = find_str_val(backend_names, options.backend);
     std::string loop_str = find_str_val(loop_names, options.loop);
-    std::string buf_type_str = find_str_val(buf_names, options.buf_type);
+
+    std::string sycl_dev_type_str = find_str_val(sycl_dev_names, options.sycl_dev_type);
+    std::string sycl_mem_type_str = find_str_val(sycl_mem_names, options.sycl_mem_type);
+    std::string sycl_usm_type_str = find_str_val(sycl_usm_names, options.sycl_usm_type);
 
     PRINT_BY_ROOT(comm,
                   "options:"
-                  "\n  ranks:          %zu"
+                  "\n  processes:      %d"
                   "\n  backend:        %s"
                   "\n  loop:           %s"
                   "\n  iters:          %zu"
@@ -500,8 +459,11 @@ void print_user_options(const user_options_t& options,
                   "\n  min_elem_count: %zu"
                   "\n  max_elem_count: %zu"
                   "\n  check:          %d"
-                  "\n  buf_type:       %s"
                   "\n  v2i_ratio:      %zu"
+                  "\n  sycl_dev_type:  %s"
+                  "\n  sycl_mem_type:  %s"
+                  "\n  sycl_usm_type:  %s"
+                  "\n  ranks_per_proc: %zu"
                   "\n  %s"
                   "\n  csv_filepath:   %s",
                   comm.size(),
@@ -513,8 +475,11 @@ void print_user_options(const user_options_t& options,
                   options.min_elem_count,
                   options.max_elem_count,
                   options.check_values,
-                  buf_type_str.c_str(),
                   options.v2i_ratio,
+                  sycl_dev_type_str.c_str(),
+                  sycl_mem_type_str.c_str(),
+                  sycl_usm_type_str.c_str(),
+                  options.ranks_per_proc,
                   ss.str().c_str(),
                   options.csv_filepath.c_str());
 }
diff --git a/examples/benchmark/include/coll.hpp b/examples/benchmark/include/coll.hpp
index 8959c141e..975c6923b 100644
--- a/examples/benchmark/include/coll.hpp
+++ b/examples/benchmark/include/coll.hpp
@@ -15,22 +15,23 @@
 */
 #pragma once
 
-#include "base.hpp"
 #include "config.hpp"
+#include "transport.hpp"
+#include "types.hpp"
 
 #ifdef CCL_ENABLE_SYCL
-#include "sycl_base.hpp"
 template <typename Dtype>
 using sycl_buffer_t = cl::sycl::buffer<Dtype, 1>;
 #endif
 
+#define COLL_ROOT (0)
+
 struct base_coll;
 
 using coll_list_t = std::vector<std::shared_ptr<base_coll>>;
 using req_list_t = std::vector<ccl::event>;
 
 typedef struct bench_exec_attr {
-
     bench_exec_attr() = default;
     template <ccl::operation_attr_id attrId, class value_t>
     struct setter {
@@ -44,8 +45,7 @@ typedef struct bench_exec_attr {
     struct factory {
         template <class attr_t>
         void operator()(ccl::shared_ptr_class<attr_t>& attr) {
-            attr = std::make_shared<attr_t>(
-                ccl::create_operation_attr<attr_t>());
+            attr = std::make_shared<attr_t>(ccl::create_operation_attr<attr_t>());
         }
     };
 
@@ -55,8 +55,8 @@ typedef struct bench_exec_attr {
                                            ccl::shared_ptr_class<ccl::alltoallv_attr>,
                                            ccl::shared_ptr_class<ccl::reduce_attr>,
                                            ccl::shared_ptr_class<ccl::broadcast_attr>,
-                                           ccl::shared_ptr_class<ccl::reduce_scatter_attr>,
-                                           ccl::shared_ptr_class<ccl::sparse_allreduce_attr>>;
+                                           ccl::shared_ptr_class<ccl::reduce_scatter_attr>/*,
+                                           ccl::shared_ptr_class<ccl::sparse_allreduce_attr>*/>;
 
     template <class attr_t>
     attr_t& get_attr() {
@@ -69,8 +69,8 @@ typedef struct bench_exec_attr {
     }
 
     template <ccl::operation_attr_id attrId, class Value>
-    typename ccl::details::ccl_api_type_attr_traits<ccl::operation_attr_id, attrId>::return_type
-    set(const Value& v) {
+    typename ccl::detail::ccl_api_type_attr_traits<ccl::operation_attr_id, attrId>::return_type set(
+        const Value& v) {
         ccl_tuple_for_each(coll_attrs, setter<attrId, Value>(v));
         return v;
     }
@@ -88,6 +88,9 @@ typedef struct bench_exec_attr {
 typedef struct bench_init_attr {
     size_t buf_count;
     size_t max_elem_count;
+    size_t ranks_per_proc;
+    sycl_mem_type_t sycl_mem_type;
+    sycl_usm_type_t sycl_usm_type;
     size_t v2i_ratio;
 } bench_init_attr;
 
@@ -96,6 +99,11 @@ struct base_coll {
     base_coll(bench_init_attr init_attr) : init_attr(init_attr) {
         send_bufs.resize(init_attr.buf_count);
         recv_bufs.resize(init_attr.buf_count);
+
+        for (size_t idx = 0; idx < init_attr.buf_count; idx++) {
+            send_bufs[idx].resize(init_attr.ranks_per_proc);
+            recv_bufs[idx].resize(init_attr.ranks_per_proc);
+        }
     }
 
     base_coll() = delete;
@@ -105,99 +113,74 @@ struct base_coll {
         return nullptr;
     };
 
-    virtual void prepare(size_t elem_count){};
-    virtual void finalize(size_t elem_count){};
+    virtual void prepare(size_t elem_count) {
+        auto& transport = transport_data::instance();
+        auto& comms = transport.get_comms();
+        auto streams = transport.get_bench_streams();
+        size_t ranks_per_proc = base_coll::get_ranks_per_proc();
+
+        for (size_t rank_idx = 0; rank_idx < ranks_per_proc; rank_idx++) {
+            prepare_internal(elem_count, comms[rank_idx], streams[rank_idx], rank_idx);
+        }
+    }
+
+    virtual void finalize(size_t elem_count) {
+        auto& transport = transport_data::instance();
+        auto& comms = transport.get_comms();
+        auto streams = transport.get_bench_streams();
+        size_t ranks_per_proc = base_coll::get_ranks_per_proc();
+
+        for (size_t rank_idx = 0; rank_idx < ranks_per_proc; rank_idx++) {
+            finalize_internal(elem_count, comms[rank_idx], streams[rank_idx], rank_idx);
+        }
+    }
+
+    virtual void prepare_internal(size_t elem_count,
+                                  ccl::communicator& comm,
+                                  ccl::stream& stream,
+                                  size_t rank_idx) = 0;
+
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) = 0;
 
     virtual ccl::datatype get_dtype() const = 0;
 
+    size_t get_dtype_size() const {
+        return ccl::get_datatype_size(get_dtype());
+    }
+
     virtual void start(size_t count,
                        size_t buf_idx,
                        const bench_exec_attr& attr,
                        req_list_t& reqs) = 0;
 
-    virtual void start_single(size_t count, const bench_exec_attr& attr, req_list_t& reqs) = 0;
-
     /* to get buf_count from initialized private member */
     size_t get_buf_count() const noexcept {
         return init_attr.buf_count;
     }
+
     size_t get_max_elem_count() const noexcept {
         return init_attr.max_elem_count;
     }
-    size_t get_single_buf_max_elem_count() const noexcept {
-        return init_attr.buf_count * init_attr.max_elem_count;
-    }
-
-    std::vector<void*> send_bufs;
-    std::vector<void*> recv_bufs;
-
-    void* single_send_buf = nullptr;
-    void* single_recv_buf = nullptr;
-
-private:
-    bench_init_attr init_attr;
-};
-
-struct host_data {
-    static ccl::shared_ptr_class<ccl::communicator> comm_ptr;
-    static void init(size_t size, size_t rank, ccl::shared_ptr_class<ccl::kvs_interface> kvs) {
-
-        if (comm_ptr) {
-            throw ccl::exception(std::string(__FUNCTION__) + " - reinit is not allowed");
-        }
-
-        comm_ptr = std::make_shared<ccl::communicator>(
-            ccl::create_communicator(size, rank, kvs));
-    }
 
-    static void deinit() {
-        comm_ptr.reset();
+    sycl_mem_type_t get_sycl_mem_type() const noexcept {
+        return init_attr.sycl_mem_type;
     }
-};
-
-ccl::shared_ptr_class<ccl::communicator> host_data::comm_ptr{};
-
-#ifdef CCL_ENABLE_SYCL
-struct device_data {
 
-    static ccl::shared_ptr_class<ccl::communicator> comm_ptr;
-    static ccl::shared_ptr_class<ccl::stream> stream_ptr;
-    static cl::sycl::queue sycl_queue;
-
-    static void init(size_t size,
-                     size_t rank,
-                     cl::sycl::device& device,
-                     cl::sycl::context& ctx,
-                     ccl::shared_ptr_class<ccl::kvs_interface> kvs) {
-
-        if (stream_ptr or comm_ptr) {
-            throw ccl::exception(std::string(__FUNCTION__) + " - reinit is not allowed");
-        }
-
-        auto ccl_dev = ccl::create_device(device);
-        auto ccl_ctx = ccl::create_context(ctx);
-
-        comm_ptr = std::make_shared<ccl::communicator>(
-            ccl::create_communicator(
-                size, rank,
-                ccl_dev,
-                ccl_ctx,
-                kvs));
-
-        sycl_queue = cl::sycl::queue(device);
-
-        stream_ptr =
-            std::make_shared<ccl::stream>(ccl::create_stream(sycl_queue));
+    sycl_usm_type_t get_sycl_usm_type() const noexcept {
+        return init_attr.sycl_usm_type;
     }
 
-    static void deinit() {
-        comm_ptr.reset();
-        stream_ptr.reset();
+    size_t get_ranks_per_proc() const noexcept {
+        return init_attr.ranks_per_proc;
     }
-};
 
-ccl::shared_ptr_class<ccl::communicator> device_data::comm_ptr{};
-ccl::shared_ptr_class<ccl::stream> device_data::stream_ptr{};
-cl::sycl::queue device_data::sycl_queue{};
+    // first dim - per buf_count, second dim - per local rank
+    std::vector<std::vector<void*>> send_bufs;
+    std::vector<std::vector<void*>> recv_bufs;
 
-#endif /* CCL_ENABLE_SYCL */
+private:
+    bench_init_attr init_attr;
+};
diff --git a/examples/benchmark/include/config.hpp b/examples/benchmark/include/config.hpp
index 3b801a140..9ccad6598 100644
--- a/examples/benchmark/include/config.hpp
+++ b/examples/benchmark/include/config.hpp
@@ -15,27 +15,28 @@
 */
 #pragma once
 
-#define ALIGNMENT      (4096)
-#define DTYPE          float
+#define ALIGNMENT (4096)
+#define DTYPE     float
 
-#define ALL_DTYPES_LIST     "char,int,float,double,int64_t,uint64_t"
+#define ALL_COLLS_LIST      "allgatherv,allreduce,alltoall,alltoallv,bcast,reduce,reduce_scatter"
+#define ALL_DTYPES_LIST     "int8,int32,int64,uint64,float32,float64"
 #define ALL_REDUCTIONS_LIST "sum,prod,min,max"
 
-#define DEFAULT_BACKEND         BACKEND_HOST
-#define DEFAULT_LOOP            LOOP_REGULAR
-#define DEFAULT_COLL_LIST \
-    "allgatherv,allreduce,alltoall,alltoallv,bcast,reduce," \
-    "reduce_scatter,sparse_allreduce,sparse_allreduce_bf16," \
-    "allgatherv,allreduce,alltoall,alltoallv,bcast,reduce," \
-    "reduce_scatter,sparse_allreduce,sparse_allreduce_bf16"
-#define DEFAULT_ITERS           (16)
-#define DEFAULT_WARMUP_ITERS    (16)
-#define DEFAULT_BUF_COUNT       (16)
-#define DEFAULT_MIN_ELEM_COUNT  (1)
-#define DEFAULT_MAX_ELEM_COUNT  (128)
-#define DEFAULT_CHECK_VALUES    (1)
-#define DEFAULT_BUF_TYPE        BUF_MULTI
-#define DEFAULT_V2I_RATIO       (128)
-#define DEFAULT_DTYPES_LIST     "float"
+#define DEFAULT_BACKEND        BACKEND_HOST
+#define DEFAULT_LOOP           LOOP_REGULAR
+#define DEFAULT_ITERS          (16)
+#define DEFAULT_WARMUP_ITERS   (16)
+#define DEFAULT_BUF_COUNT      (16)
+#define DEFAULT_MIN_ELEM_COUNT (1)
+#define DEFAULT_MAX_ELEM_COUNT (128)
+#define DEFAULT_CHECK_VALUES   (1)
+#define DEFAULT_V2I_RATIO      (128)
+#define DEFAULT_SYCL_DEV_TYPE  SYCL_DEV_GPU
+#define DEFAULT_SYCL_MEM_TYPE  SYCL_MEM_USM
+#define DEFAULT_SYCL_USM_TYPE  SYCL_USM_DEVICE
+#define DEFAULT_RANKS_PER_PROC (1)
+
+#define DEFAULT_COLL_LIST       "allreduce"
+#define DEFAULT_DTYPES_LIST     "float32"
 #define DEFAULT_REDUCTIONS_LIST "sum"
 #define DEFAULT_CSV_FILEPATH    ""
diff --git a/examples/benchmark/include/cpu_coll.hpp b/examples/benchmark/include/cpu_coll.hpp
index c064e3a53..dd33cfac4 100644
--- a/examples/benchmark/include/cpu_coll.hpp
+++ b/examples/benchmark/include/cpu_coll.hpp
@@ -19,48 +19,43 @@
 
 /* cpu-specific base implementation */
 template <class Dtype, class strategy>
-struct cpu_base_coll : base_coll, protected strategy, host_data {
+struct cpu_base_coll : base_coll, protected strategy {
     using coll_strategy = strategy;
 
     template <class... Args>
-    cpu_base_coll(bench_init_attr init_attr,
-                  size_t sbuf_multiplier,
-                  size_t rbuf_multiplier,
-                  Args&&... args)
+    cpu_base_coll(bench_init_attr init_attr, Args&&... args)
             : base_coll(init_attr),
-              coll_strategy(std::forward<Args>(args)...) {
+              coll_strategy() {
         int result = 0;
 
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            result =
-                posix_memalign((void**)&send_bufs[idx],
-                               ALIGNMENT,
-                               base_coll::get_max_elem_count() * sizeof(Dtype) * sbuf_multiplier);
-            result =
-                posix_memalign((void**)&recv_bufs[idx],
-                               ALIGNMENT,
-                               base_coll::get_max_elem_count() * sizeof(Dtype) * rbuf_multiplier);
+        size_t send_multiplier = coll_strategy::get_send_multiplier();
+        size_t recv_multiplier = coll_strategy::get_recv_multiplier();
+        /* OR the return codes together so that any failed allocation reaches the ASSERT */
+        for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) {
+            for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+                result |= posix_memalign(
+                    (void**)&(send_bufs[idx][rank_idx]),
+                    ALIGNMENT,
+                    base_coll::get_max_elem_count() * sizeof(Dtype) * send_multiplier);
+                result |= posix_memalign(
+                    (void**)&(recv_bufs[idx][rank_idx]),
+                    ALIGNMENT,
+                    base_coll::get_max_elem_count() * sizeof(Dtype) * recv_multiplier);
+            }
         }
-        result = posix_memalign(
-            (void**)&single_send_buf,
-            ALIGNMENT,
-            base_coll::get_single_buf_max_elem_count() * sizeof(Dtype) * sbuf_multiplier);
-        result = posix_memalign(
-            (void**)&single_recv_buf,
-            ALIGNMENT,
-            base_coll::get_single_buf_max_elem_count() * sizeof(Dtype) * rbuf_multiplier);
-        (void)result;
+
+        ASSERT(result == 0, "failed to allocate buffers");
     }
 
     cpu_base_coll(bench_init_attr init_attr) : cpu_base_coll(init_attr, 1, 1) {}
 
     virtual ~cpu_base_coll() {
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            free(send_bufs[idx]);
-            free(recv_bufs[idx]);
+        for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) {
+            for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+                free(send_bufs[idx][rank_idx]);
+                free(recv_bufs[idx][rank_idx]);
+            }
         }
-        free(single_send_buf);
-        free(single_recv_buf);
     }
 
     const char* name() const noexcept override {
@@ -71,35 +66,44 @@ struct cpu_base_coll : base_coll, protected strategy, host_data {
                        size_t buf_idx,
                        const bench_exec_attr& attr,
                        req_list_t& reqs) override {
-        coll_strategy::start_internal(comm(),
-                                      count,
-                                      static_cast<Dtype*>(send_bufs[buf_idx]),
-                                      static_cast<Dtype*>(recv_bufs[buf_idx]),
-                                      attr,
-                                      reqs,
-                                      coll_strategy::get_op_attr(attr));
-    }
+        auto& transport = transport_data::instance();
+        auto& comms = transport.get_comms();
+        size_t ranks_per_proc = base_coll::get_ranks_per_proc();
 
-    virtual void start_single(size_t count,
-                              const bench_exec_attr& attr,
-                              req_list_t& reqs) override {
-        coll_strategy::start_internal(comm(),
-                                      count,
-                                      static_cast<Dtype*>(single_send_buf),
-                                      static_cast<Dtype*>(single_recv_buf),
-                                      attr,
-                                      reqs,
-                                      coll_strategy::get_op_attr(attr));
+        for (size_t rank_idx = 0; rank_idx < ranks_per_proc; rank_idx++) {
+            coll_strategy::start_internal(comms[rank_idx],
+                                          count,
+                                          static_cast<Dtype*>(send_bufs[buf_idx][rank_idx]),
+                                          static_cast<Dtype*>(recv_bufs[buf_idx][rank_idx]),
+                                          attr,
+                                          reqs,
+                                          coll_strategy::get_op_attr(attr));
+        }
     }
 
-    ccl::datatype get_dtype() const override final {
-        return ccl::native_type_info<typename std::remove_pointer<Dtype>::type>::ccl_datatype_value;
-    }
+    virtual void prepare_internal(size_t elem_count,
+                                  ccl::communicator& comm,
+                                  ccl::stream& stream,
+                                  size_t rank_idx) override {
+        int local_rank = comm.rank();
+
+        size_t send_count = coll_strategy::get_send_multiplier() * elem_count;
+        size_t recv_count = coll_strategy::get_recv_multiplier() * elem_count;
 
-    /* global communicator for all cpu collectives */
-    static ccl::communicator& comm() {
-        if (!host_data::comm_ptr) {
+        size_t send_bytes = send_count * base_coll::get_dtype_size();
+        size_t recv_bytes = recv_count * base_coll::get_dtype_size();
+
+        std::vector<Dtype> fill_vector(send_count);
+        std::fill(fill_vector.begin(), fill_vector.end(), local_rank);
+
+        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
+            memcpy(send_bufs[b_idx][rank_idx], fill_vector.data(), send_bytes);
+
+            memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes);
         }
-        return *host_data::comm_ptr;
+    }
+
+    ccl::datatype get_dtype() const override final {
+        return ccl::native_type_info<typename std::remove_pointer<Dtype>::type>::dtype;
     }
 };
diff --git a/examples/benchmark/include/sycl_coll.hpp b/examples/benchmark/include/sycl_coll.hpp
index abb7a9a34..748bbcc3d 100644
--- a/examples/benchmark/include/sycl_coll.hpp
+++ b/examples/benchmark/include/sycl_coll.hpp
@@ -15,88 +15,83 @@
 */
 #pragma once
 
+#include <iostream>
+#include <map>
+#include <set>
+#include <string>
+
 #include "coll.hpp"
+#include "sycl_base.hpp" /* from examples/include */
 
 #ifdef CCL_ENABLE_SYCL
 
-cl::sycl::device get_device(const ccl::communicator& comm) {
-
-    // select requested platform by SYCL_BE: L0 or OpenCL
-    std::vector<cl::sycl::device> all_devices =
-        cl::sycl::device::get_devices(info::device_type::gpu);
-    std::vector<cl::sycl::device> selected_devices;
-    std::string backend;
+#include <CL/sycl.hpp>
 
-    if (getenv("SYCL_BE") == nullptr) {
-        backend = "Level-Zero";
-    }
-    else if (getenv("SYCL_BE") != nullptr) {
-        if (std::strcmp(getenv("SYCL_BE"), "PI_LEVEL_ZERO") == 0) {
-            backend = "Level-Zero";
-        }
-        else if (std::strcmp(getenv("SYCL_BE"), "PI_OPENCL") == 0) {
-            backend = "OpenCL";
-        }
-        else {
-            throw std::runtime_error("invalid backend: " + std::string(getenv("SYCL_BE")));
-        }
-    }
-
-    for (const auto& device : all_devices) {
-        auto platform = device.get_platform();
-        auto platform_name = platform.get_info<cl::sycl::info::platform::name>();
-        std::size_t found = platform_name.find(backend);
-        if (found != std::string::npos)
-            selected_devices.push_back(device);
-    }
-
-    if (selected_devices.size() <= 0) {
-        throw ccl::exception("no selected device found for SYCL backend: " + backend);
-    }
-
-    size_t idx = comm.rank() % selected_devices.size();
-    auto selected_device = selected_devices[idx];
-    std::cout << "\nrunning on: " << selected_device.get_info<cl::sycl::info::device::name>()
-              << ", device index: " << idx << "\n";
-
-    return selected_device;
-}
+using namespace sycl;
+using namespace sycl::access;
 
 /* sycl-specific base implementation */
 template <class Dtype, class strategy>
-struct sycl_base_coll : base_coll, private strategy, device_data {
+struct sycl_base_coll : base_coll, private strategy {
     using coll_strategy = strategy;
 
     template <class... Args>
-    sycl_base_coll(bench_init_attr init_attr,
-                   size_t sbuf_multiplier,
-                   size_t rbuf_multiplier,
-                   Args&&... args)
+    sycl_base_coll(bench_init_attr init_attr, Args&&... args)
             : base_coll(init_attr),
-              coll_strategy(std::forward<Args>(args)...) {
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            send_bufs[idx] =
-                new cl::sycl::buffer<Dtype, 1>(base_coll::get_max_elem_count() * sbuf_multiplier);
-            recv_bufs[idx] =
-                new cl::sycl::buffer<Dtype, 1>(base_coll::get_max_elem_count() * rbuf_multiplier);
+              coll_strategy() {
+        auto& transport = transport_data::instance();
+        auto streams = transport.get_bench_streams();
+
+        size_t send_multiplier = coll_strategy::get_send_multiplier();
+        size_t recv_multiplier = coll_strategy::get_recv_multiplier();
+
+        for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) {
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
+                allocators.push_back(buf_allocator<Dtype>(streams[rank_idx].get_native()));
+
+                auto& allocator = allocators[rank_idx];
+
+                sycl::usm::alloc usm_alloc_type;
+                auto bench_alloc_type = base_coll::get_sycl_usm_type();
+                if (bench_alloc_type == SYCL_USM_SHARED)
+                    usm_alloc_type = usm::alloc::shared;
+                else if (bench_alloc_type == SYCL_USM_DEVICE)
+                    usm_alloc_type = usm::alloc::device;
+                else
+                    ASSERT(0, "unexpected bench_alloc_type %d", bench_alloc_type);
+
+                for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+                    send_bufs[idx][rank_idx] = allocator.allocate(
+                        base_coll::get_max_elem_count() * send_multiplier, usm_alloc_type);
+                    recv_bufs[idx][rank_idx] = allocator.allocate(
+                        base_coll::get_max_elem_count() * recv_multiplier, usm_alloc_type);
+                }
+            }
+            else {
+                for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+                    send_bufs[idx][rank_idx] = new cl::sycl::buffer<Dtype, 1>(
+                        base_coll::get_max_elem_count() * send_multiplier);
+                    recv_bufs[idx][rank_idx] = new cl::sycl::buffer<Dtype, 1>(
+                        base_coll::get_max_elem_count() * recv_multiplier);
+                }
+            }
         }
 
-        single_send_buf = new cl::sycl::buffer<Dtype, 1>(
-            base_coll::get_single_buf_max_elem_count() * sbuf_multiplier);
-
-        single_recv_buf = new cl::sycl::buffer<Dtype, 1>(
-            base_coll::get_single_buf_max_elem_count() * rbuf_multiplier);
+        host_send_buf.resize(base_coll::get_max_elem_count() * send_multiplier);
+        host_recv_buf.resize(base_coll::get_max_elem_count() * recv_multiplier);
     }
 
     sycl_base_coll(bench_init_attr init_attr) : sycl_base_coll(init_attr, 1, 1) {}
 
     virtual ~sycl_base_coll() {
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            delete static_cast<sycl_buffer_t<Dtype>*>(send_bufs[idx]);
-            delete static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[idx]);
+        for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) {
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_BUF) {
+                for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+                    delete static_cast<sycl_buffer_t<Dtype>*>(send_bufs[idx][rank_idx]);
+                    delete static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[idx][rank_idx]);
+                }
+            }
         }
-        delete static_cast<sycl_buffer_t<Dtype>*>(single_send_buf);
-        delete static_cast<sycl_buffer_t<Dtype>*>(single_recv_buf);
     }
 
     const char* name() const noexcept override {
@@ -107,50 +102,96 @@ struct sycl_base_coll : base_coll, private strategy, device_data {
                        size_t buf_idx,
                        const bench_exec_attr& attr,
                        req_list_t& reqs) override {
-        sycl_buffer_t<Dtype>& send_buf = *(static_cast<sycl_buffer_t<Dtype>*>(send_bufs[buf_idx]));
-        sycl_buffer_t<Dtype>& recv_buf = *(static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[buf_idx]));
-        coll_strategy::template start_internal<sycl_buffer_t<Dtype>&>(
-            comm(),
-            count,
-            send_buf,
-            recv_buf,
-            attr,
-            reqs,
-            stream(),
-            coll_strategy::get_op_attr(attr));
+        auto& transport = transport_data::instance();
+        auto& comms = transport.get_comms();
+        /* get_streams() returns a reference: bind to it, don't copy the vector on each start() */
+        auto& streams = transport.get_streams();
+        size_t ranks_per_proc = base_coll::get_ranks_per_proc();
+        for (size_t rank_idx = 0; rank_idx < ranks_per_proc; rank_idx++) {
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
+                coll_strategy::start_internal(comms[rank_idx],
+                                              count,
+                                              static_cast<Dtype*>(send_bufs[buf_idx][rank_idx]),
+                                              static_cast<Dtype*>(recv_bufs[buf_idx][rank_idx]),
+                                              attr,
+                                              reqs,
+                                              streams[rank_idx],
+                                              coll_strategy::get_op_attr(attr));
+            }
+            else {
+                sycl_buffer_t<Dtype>& send_buf =
+                    *(static_cast<sycl_buffer_t<Dtype>*>(send_bufs[buf_idx][rank_idx]));
+                sycl_buffer_t<Dtype>& recv_buf =
+                    *(static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[buf_idx][rank_idx]));
+                coll_strategy::template start_internal<sycl_buffer_t<Dtype>&>(
+                    comms[rank_idx],
+                    count,
+                    send_buf,
+                    recv_buf,
+                    attr,
+                    reqs,
+                    streams[rank_idx],
+                    coll_strategy::get_op_attr(attr));
+            }
+        }
     }
 
-    virtual void start_single(size_t count,
-                              const bench_exec_attr& attr,
-                              req_list_t& reqs) override {
-        sycl_buffer_t<Dtype>& send_buf = *(static_cast<sycl_buffer_t<Dtype>*>(single_send_buf));
-        sycl_buffer_t<Dtype>& recv_buf = *(static_cast<sycl_buffer_t<Dtype>*>(single_recv_buf));
-        coll_strategy::template start_internal<sycl_buffer_t<Dtype>&>(
-            comm(),
-            count,
-            send_buf,
-            recv_buf,
-            attr,
-            reqs,
-            stream(),
-            coll_strategy::get_op_attr(attr));
+    virtual void prepare_internal(size_t elem_count,
+                                  ccl::communicator& comm,
+                                  ccl::stream& stream,
+                                  size_t rank_idx) override {
+        int comm_rank = comm.rank();
+
+        size_t send_count = coll_strategy::get_send_multiplier() * elem_count;
+        size_t recv_count = coll_strategy::get_recv_multiplier() * elem_count;
+
+        size_t send_bytes = send_count * base_coll::get_dtype_size();
+        size_t recv_bytes = recv_count * base_coll::get_dtype_size();
+
+        std::fill(host_send_buf.begin(), host_send_buf.end(), comm_rank);
+
+        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
+                stream.get_native()
+                    .memcpy(send_bufs[b_idx][rank_idx], host_send_buf.data(), send_bytes)
+                    .wait();
+
+                stream.get_native().memset(recv_bufs[b_idx][rank_idx], 0, recv_bytes).wait();
+            }
+            else {
+                stream.get_native()
+                    .submit([&](handler& h) {
+                        auto send_buf =
+                            (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx][rank_idx]));
+                        auto send_buf_acc =
+                            send_buf->template get_access<mode::write>(h, send_count);
+                        h.fill(send_buf_acc, static_cast<Dtype>(comm_rank));
+                    })
+                    .wait();
+
+                stream.get_native()
+                    .submit([&](handler& h) {
+                        auto recv_buf =
+                            (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                        auto recv_buf_acc =
+                            recv_buf->template get_access<mode::write>(h, recv_count);
+                        h.fill(recv_buf_acc, static_cast<Dtype>(0));
+                    })
+                    .wait();
+            }
+        }
     }
 
     ccl::datatype get_dtype() const override final {
-        return ccl::native_type_info<typename std::remove_pointer<Dtype>::type>::ccl_datatype_value;
+        return ccl::native_type_info<typename std::remove_pointer<Dtype>::type>::dtype;
     }
 
-    /* global communicator & stream for all cpu collectives */
-    static ccl::communicator& comm() {
-        if (!device_data::comm_ptr) {
-        }
-        return *device_data::comm_ptr;
-    }
+    /* used on fill/check phases */
+    std::vector<Dtype> host_send_buf;
+    std::vector<Dtype> host_recv_buf;
 
-    static ccl::stream& stream() {
-        if (!device_data::stream_ptr) {
-        }
-        return *device_data::stream_ptr;
-    }
+private:
+    std::vector<buf_allocator<Dtype>> allocators;
 };
+
 #endif /* CCL_ENABLE_SYCL */
diff --git a/examples/benchmark/include/transport.hpp b/examples/benchmark/include/transport.hpp
index b1f2b27ed..b0202e1e8 100644
--- a/examples/benchmark/include/transport.hpp
+++ b/examples/benchmark/include/transport.hpp
@@ -15,23 +15,48 @@
 */
 #pragma once
 
-#include "base_utils.hpp"
+#include <map>
+#include <vector>
 
-class transport_settings {
+#include "oneapi/ccl.hpp"
+#include "types.hpp"
+
+class transport_data {
 public:
-    static transport_settings& instance();
+    static transport_data& instance();
+    static size_t get_comm_size();
+
     int get_rank() const noexcept;
     int get_size() const noexcept;
 
     ccl::shared_ptr_class<ccl::kvs> get_kvs();
+    ccl::communicator& get_service_comm();
+    void init_comms(user_options_t& options);
+    std::vector<ccl::communicator>& get_comms();
+
+    std::vector<ccl::stream>& get_streams();
+    std::vector<ccl::stream>& get_bench_streams();
 
 private:
-    transport_settings();
-    ~transport_settings();
+    transport_data();
+    ~transport_data();
 
     int rank;
     int size;
+
+    std::vector<size_t> local_ranks;
+
     ccl::shared_ptr_class<ccl::kvs> kvs;
+    std::vector<ccl::communicator> service_comms;
+    std::vector<ccl::communicator> comms;
+
+    /*
+       FIXME: explicitly separate CCL and bench streams
+              while runtime doesn't provide MT on the same queue
+    */
+    std::vector<ccl::stream> streams;
+    std::vector<ccl::stream> bench_streams;
+
     void init_by_mpi();
     void deinit_by_mpi();
 };
diff --git a/examples/benchmark/include/types.hpp b/examples/benchmark/include/types.hpp
new file mode 100644
index 000000000..7facfb8d8
--- /dev/null
+++ b/examples/benchmark/include/types.hpp
@@ -0,0 +1,133 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "oneapi/ccl.hpp"
+
+#define PRINT(fmt, ...) printf(fmt "\n", ##__VA_ARGS__);
+
+#ifndef PRINT_BY_ROOT
+#define PRINT_BY_ROOT(comm, fmt, ...) \
+    if (comm.rank() == 0) { \
+        printf(fmt "\n", ##__VA_ARGS__); \
+    }
+#endif //PRINT_BY_ROOT
+
+#define ASSERT(cond, fmt, ...) \
+    do { \
+        if (!(cond)) { \
+            printf("FAILED\n"); \
+            fprintf(stderr, "ASSERT '%s' FAILED " fmt "\n", #cond, ##__VA_ARGS__); \
+            throw std::runtime_error("ASSERT FAILED"); \
+        } \
+    } while (0)
+
+// TODO: add ccl::bfloat16
+constexpr std::initializer_list<ccl::datatype> all_dtypes = {
+    ccl::datatype::int8,    ccl::datatype::int32, ccl::datatype::float32,
+    ccl::datatype::float64, ccl::datatype::int64, ccl::datatype::uint64
+};
+
+typedef enum { BACKEND_HOST, BACKEND_SYCL } backend_type_t;
+typedef enum { LOOP_REGULAR, LOOP_UNORDERED } loop_type_t;
+
+typedef enum { SYCL_DEV_HOST, SYCL_DEV_CPU, SYCL_DEV_GPU } sycl_dev_type_t;
+typedef enum { SYCL_MEM_USM, SYCL_MEM_BUF } sycl_mem_type_t;
+typedef enum { SYCL_USM_SHARED, SYCL_USM_DEVICE } sycl_usm_type_t;
+
+std::map<backend_type_t, std::string> backend_names = { std::make_pair(BACKEND_HOST, "host"),
+                                                        std::make_pair(BACKEND_SYCL, "sycl") };
+
+std::map<loop_type_t, std::string> loop_names = { std::make_pair(LOOP_REGULAR, "regular"),
+                                                  std::make_pair(LOOP_UNORDERED, "unordered") };
+
+std::map<sycl_dev_type_t, std::string> sycl_dev_names = { std::make_pair(SYCL_DEV_HOST, "host"),
+                                                          std::make_pair(SYCL_DEV_CPU, "cpu"),
+                                                          std::make_pair(SYCL_DEV_GPU, "gpu") };
+
+std::map<sycl_mem_type_t, std::string> sycl_mem_names = { std::make_pair(SYCL_MEM_USM, "usm"),
+                                                          std::make_pair(SYCL_MEM_BUF, "buf") };
+
+std::map<sycl_usm_type_t, std::string> sycl_usm_names = { std::make_pair(SYCL_USM_SHARED, "shared"),
+                                                          std::make_pair(SYCL_USM_DEVICE,
+                                                                         "device") };
+
+// TODO: add ccl::bfloat16
+std::map<ccl::datatype, std::string> dtype_names = {
+    std::make_pair(ccl::datatype::int8, "int8"),
+    std::make_pair(ccl::datatype::int32, "int32"),
+    std::make_pair(ccl::datatype::int64, "int64"),
+    std::make_pair(ccl::datatype::uint64, "uint64"),
+    std::make_pair(ccl::datatype::float32, "float32"),
+    std::make_pair(ccl::datatype::float64, "float64")
+};
+
+std::map<ccl::reduction, std::string> reduction_names = {
+    std::make_pair(ccl::reduction::sum, "sum"),
+    std::make_pair(ccl::reduction::prod, "prod"),
+    std::make_pair(ccl::reduction::min, "min"),
+    std::make_pair(ccl::reduction::max, "max"),
+};
+
+/* Splits `input` on `delimiter`; defined in a header, hence `inline` (one definition per program). */
+inline std::list<std::string> tokenize(const std::string& input, char delimiter) {
+    std::stringstream ss(input);
+    std::list<std::string> ret;
+    for (std::string value; std::getline(ss, value, delimiter);) {
+        ret.push_back(value);
+    }
+    return ret;
+}
+
+typedef struct user_options_t {
+    backend_type_t backend;
+    loop_type_t loop;
+    size_t iters;
+    size_t warmup_iters;
+    size_t buf_count;
+    size_t min_elem_count;
+    size_t max_elem_count;
+    int check_values;
+    size_t v2i_ratio;
+    sycl_dev_type_t sycl_dev_type;
+    sycl_mem_type_t sycl_mem_type;
+    sycl_usm_type_t sycl_usm_type;
+    size_t ranks_per_proc;
+    std::list<std::string> coll_names;
+    std::list<std::string> dtypes;
+    std::list<std::string> reductions;
+    std::string csv_filepath;
+
+    user_options_t() {
+        backend = DEFAULT_BACKEND;
+        loop = DEFAULT_LOOP;
+        iters = DEFAULT_ITERS;
+        warmup_iters = DEFAULT_WARMUP_ITERS;
+        buf_count = DEFAULT_BUF_COUNT;
+        min_elem_count = DEFAULT_MIN_ELEM_COUNT;
+        max_elem_count = DEFAULT_MAX_ELEM_COUNT;
+        check_values = DEFAULT_CHECK_VALUES;
+        v2i_ratio = DEFAULT_V2I_RATIO;
+        sycl_dev_type = DEFAULT_SYCL_DEV_TYPE;
+        sycl_mem_type = DEFAULT_SYCL_MEM_TYPE;
+        sycl_usm_type = DEFAULT_SYCL_USM_TYPE;
+        ranks_per_proc = DEFAULT_RANKS_PER_PROC;
+        coll_names = tokenize(DEFAULT_COLL_LIST, ',');
+        dtypes = tokenize(DEFAULT_DTYPES_LIST, ',');
+        reductions = tokenize(DEFAULT_REDUCTIONS_LIST, ',');
+        csv_filepath = std::string(DEFAULT_CSV_FILEPATH);
+    }
+} user_options_t;
diff --git a/examples/benchmark/src/allgatherv/allgatherv_strategy.hpp b/examples/benchmark/src/allgatherv/allgatherv_strategy.hpp
index b090c0fa4..895b0dfe9 100644
--- a/examples/benchmark/src/allgatherv/allgatherv_strategy.hpp
+++ b/examples/benchmark/src/allgatherv/allgatherv_strategy.hpp
@@ -16,41 +16,45 @@
 #pragma once
 
 struct allgatherv_strategy_impl {
-    size_t comm_size = 0;
     std::vector<size_t> recv_counts;
-    allgatherv_strategy_impl(size_t size) : comm_size(size) {
-        recv_counts.resize(size);
-        //int result = posix_memalign((void**)&recv_counts, ALIGNMENT, comm_size * sizeof(size_t));
-        //(void)result;
+
+    allgatherv_strategy_impl() {
+        recv_counts.resize(transport_data::get_comm_size());
     }
 
     allgatherv_strategy_impl(const allgatherv_strategy_impl&) = delete;
     allgatherv_strategy_impl& operator=(const allgatherv_strategy_impl&) = delete;
 
-    ~allgatherv_strategy_impl() {
-        //free(recv_counts);
-    }
+    ~allgatherv_strategy_impl() {}
 
     static constexpr const char* class_name() {
         return "allgatherv";
     }
 
+    size_t get_send_multiplier() {
+        return 1;
+    }
+
+    size_t get_recv_multiplier() {
+        return transport_data::get_comm_size();
+    }
+
     static const ccl::allgatherv_attr& get_op_attr(const bench_exec_attr& bench_attr) {
         return bench_attr.get_attr<ccl::allgatherv_attr>();
     }
 
-    template <class Dtype, class comm_t, class... Args>
-    void start_internal(comm_t& comm,
+    template <class Dtype, class... Args>
+    void start_internal(ccl::communicator& comm,
                         size_t count,
                         const Dtype send_buf,
                         Dtype recv_buf,
                         const bench_exec_attr& bench_attr,
                         req_list_t& reqs,
                         Args&&... args) {
-        for (size_t idx = 0; idx < comm_size; idx++) {
+        for (int idx = 0; idx < comm.size(); idx++) {
             recv_counts[idx] = count;
         }
-        reqs.push_back(
-            ccl::allgatherv(send_buf, count, recv_buf, recv_counts, comm, std::forward<Args>(args)...));
+        reqs.push_back(ccl::allgatherv(
+            send_buf, count, recv_buf, recv_counts, comm, std::forward<Args>(args)...));
     }
 };
diff --git a/examples/benchmark/src/allgatherv/cpu_allgatherv_coll.hpp b/examples/benchmark/src/allgatherv/cpu_allgatherv_coll.hpp
index e5aaebdd0..795e3259a 100644
--- a/examples/benchmark/src/allgatherv/cpu_allgatherv_coll.hpp
+++ b/examples/benchmark/src/allgatherv/cpu_allgatherv_coll.hpp
@@ -23,48 +23,35 @@ struct cpu_allgatherv_coll : cpu_base_coll<Dtype, allgatherv_strategy_impl> {
     using coll_base = cpu_base_coll<Dtype, allgatherv_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
 
-    cpu_allgatherv_coll(bench_init_attr init_attr)
-            : coll_base(init_attr, 1, coll_base::comm().size(), coll_base::comm().size()) {}
+    cpu_allgatherv_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
 
-    virtual void prepare(size_t elem_count) override {
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                ((Dtype*)send_bufs[b_idx])[e_idx] = coll_base::comm().rank();
-            }
-
-            for (size_t idx = 0; idx < coll_base::comm().size(); idx++) {
-                for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                    ((Dtype*)recv_bufs[b_idx])[idx * elem_count + e_idx] = 0;
-                }
-            }
-        }
-    }
-
-    virtual void finalize(size_t elem_count) override {
-        Dtype sbuf_expected = coll_base::comm().rank();
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        Dtype sbuf_expected = comm.rank();
         Dtype value;
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                value = ((Dtype*)send_bufs[b_idx])[e_idx];
+                value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
 
-            for (size_t idx = 0; idx < coll_base::comm().size(); idx++) {
+            for (int idx = 0; idx < comm.size(); idx++) {
                 Dtype rbuf_expected = idx;
                 for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                    value = ((Dtype*)recv_bufs[b_idx])[idx * elem_count + e_idx];
+                    value = ((Dtype*)recv_bufs[b_idx][rank_idx])[idx * elem_count + e_idx];
                     if (value != rbuf_expected) {
                         std::cout << this->name() << " recv_bufs: buf_idx " << b_idx
-                                  << ", elem_idx " << e_idx << ", expected " << rbuf_expected
-                                  << ", got " << value << std::endl;
+                                  << ", rank_idx " << rank_idx << ", elem_idx " << e_idx
+                                  << ", expected " << rbuf_expected << ", got " << value
+                                  << std::endl;
                         ASSERT(0, "unexpected value");
                     }
                 }
diff --git a/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp b/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
index b5bff6f62..c8d465bf6 100644
--- a/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
+++ b/examples/benchmark/src/allgatherv/sycl_allgatherv_coll.hpp
@@ -20,105 +20,77 @@
 #ifdef CCL_ENABLE_SYCL
 #include "sycl_coll.hpp"
 
-template <class Dtype>
-class allgatherv_buf_check {};
-
-template <class Dtype>
-class allatherv_buf_fill {};
-
 template <class Dtype>
 struct sycl_allgatherv_coll : sycl_base_coll<Dtype, allgatherv_strategy_impl> {
     using coll_base = sycl_base_coll<Dtype, allgatherv_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
+    using coll_base::host_send_buf;
+    using coll_base::host_recv_buf;
 
-    sycl_allgatherv_coll(bench_init_attr init_attr)
-            : coll_base(init_attr, 1, coll_base::comm().size(), coll_base::comm().size()) {}
+    sycl_allgatherv_coll(bench_init_attr init_attr) : coll_base(init_attr) {} // base gets init_attr only
 
-    virtual void prepare(size_t elem_count) override {
-        size_t local_rank = coll_base::comm().rank();
-        size_t local_size = coll_base::comm().size();
-
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class allatherv_buf_fill<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx)
-                {
-                    send_buf_acc[e_idx] = local_rank;
-                    for (size_t idx = 0; idx < local_size; idx++) {
-                        recv_buf_acc[idx * elem_count + e_idx.get_id(0)] = 0;
-                    }
-                });
-            });
-        }
-    }
+    virtual void finalize_internal(size_t elem_count, // checks buffer contents on the host
+                                   ccl::communicator& comm, // provides rank() and size()
+                                   ccl::stream& stream, // get_native() performs the copies
+                                   size_t rank_idx) override { // second index into the bufs
+        int comm_size = comm.size();
+        Dtype sbuf_expected = comm.rank(); // send data is compared against this rank id
 
-    virtual void finalize(size_t elem_count) override {
-        bool unexpected_device_value = false;
-        size_t local_size = coll_base::comm().size();
-        Dtype sbuf_expected = coll_base::comm().rank();
+        size_t send_bytes = elem_count * base_coll::get_dtype_size(); // one block
+        size_t recv_bytes = comm_size * elem_count * base_coll::get_dtype_size(); // all ranks
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class allgatherv_buf_check<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx) mutable
-                {
-                    Dtype value = send_buf_acc[e_idx];
-                    if (value != sbuf_expected)
-                        unexpected_device_value = true;
-
-                    for (size_t idx = 0; idx < local_size; idx++) {
-                        Dtype rbuf_expected = idx;
-                        value = recv_buf_acc[idx * elem_count + e_idx.get_id(0)];
-                        if (value != rbuf_expected)
-                            unexpected_device_value = true;
-                    }
-                });
-            });
-        }
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) { // USM: memcpy from the entry
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_bufs[b_idx][rank_idx], send_bytes)
+                    .wait(); // wait for the copy before host_send_buf is read
+
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_bufs[b_idx][rank_idx], recv_bytes)
+                    .wait();
+            }
+            else { // entries are sycl_buffer_t<Dtype>*; read via accessors made without a handler
+                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx][rank_idx]));
+                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                auto send_buf_acc = send_buf->template get_access<mode::read>();
+                auto recv_buf_acc = recv_buf->template get_access<mode::read>();
+
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_buf_acc.get_pointer(), send_bytes)
+                    .wait();
+
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_buf_acc.get_pointer(), recv_bytes)
+                    .wait();
+            }
 
-        Dtype value;
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-            auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-            auto send_buf_acc = send_buf->template get_access<mode::write>();
-            auto recv_buf_acc = recv_buf->template get_access<mode::write>();
+            Dtype value; // scratch for the element under test
 
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                value = send_buf_acc[e_idx];
+                value = host_send_buf[e_idx]; // checked on the host copy
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
 
-            for (size_t idx = 0; idx < coll_base::comm().size(); idx++) {
+            for (int idx = 0; idx < comm.size(); idx++) { // block idx must hold value idx
                 Dtype rbuf_expected = idx;
                 for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                    value = recv_buf_acc[idx * elem_count + e_idx];
+                    value = host_recv_buf[idx * elem_count + e_idx]; // element e_idx of block idx
                     if (value != rbuf_expected) {
                         std::cout << this->name() << " recv_bufs: buf_idx " << b_idx
-                                  << ", elem_idx " << e_idx << ", expected " << rbuf_expected
-                                  << ", got " << value << std::endl;
+                                  << ", rank_idx " << rank_idx << ", elem_idx " << e_idx
+                                  << ", expected " << rbuf_expected << ", got " << value
+                                  << std::endl;
                         ASSERT(0, "unexpected value");
                     }
                 }
             }
         }
-
-        if (unexpected_device_value)
-            ASSERT(0, "unexpected value on device");
     }
 };
 
diff --git a/examples/benchmark/src/allreduce/allreduce_strategy.hpp b/examples/benchmark/src/allreduce/allreduce_strategy.hpp
index 61ef34097..b0ed32a0c 100644
--- a/examples/benchmark/src/allreduce/allreduce_strategy.hpp
+++ b/examples/benchmark/src/allreduce/allreduce_strategy.hpp
@@ -20,12 +20,20 @@ struct allreduce_strategy_impl {
         return "allreduce";
     }
 
+    size_t get_send_multiplier() { // presumably a send-size factor; confirm at the caller
+        return 1;
+    }
+
+    size_t get_recv_multiplier() { // presumably a recv-size factor; confirm at the caller
+        return 1;
+    }
+
     static const ccl::allreduce_attr& get_op_attr(const bench_exec_attr& bench_attr) {
         return bench_attr.get_attr<ccl::allreduce_attr>();
     }
 
-    template <class Dtype, class comm_t, class... Args>
-    void start_internal(comm_t& comm,
+    template <class Dtype, class... Args>
+    void start_internal(ccl::communicator& comm,
                         size_t count,
                         const Dtype send_buf,
                         Dtype recv_buf,
diff --git a/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp b/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
index e0c6ae072..299c80692 100644
--- a/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
+++ b/examples/benchmark/src/allreduce/cpu_allreduce_coll.hpp
@@ -23,41 +23,31 @@ struct cpu_allreduce_coll : cpu_base_coll<Dtype, allreduce_strategy_impl> {
     using coll_base = cpu_base_coll<Dtype, allreduce_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
 
-    cpu_allreduce_coll(bench_init_attr init_attr)
-            : coll_base(init_attr) {}
+    cpu_allreduce_coll(bench_init_attr init_attr) : coll_base(init_attr) {} // base gets init_attr only
 
-    virtual void prepare(size_t elem_count) override {
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                ((Dtype*)send_bufs[b_idx])[e_idx] = coll_base::comm().rank();
-                ((Dtype*)recv_bufs[b_idx])[e_idx] = 0;
-            }
-        }
-    }
-
-    virtual void finalize(size_t elem_count) override {
-        Dtype sbuf_expected = coll_base::comm().rank();
-        Dtype rbuf_expected =
-            (coll_base::comm().size() - 1) * ((float)coll_base::comm().size() / 2);
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        Dtype sbuf_expected = comm.rank();
+        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
         Dtype value;
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                value = ((Dtype*)send_bufs[b_idx])[e_idx];
+                value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
 
-                value = ((Dtype*)recv_bufs[b_idx])[e_idx];
+                value = ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx];
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
diff --git a/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp b/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
index 2e4dcd63e..52b5aaf98 100644
--- a/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
+++ b/examples/benchmark/src/allreduce/sycl_allreduce_coll.hpp
@@ -20,93 +20,71 @@
 #ifdef CCL_ENABLE_SYCL
 #include "sycl_coll.hpp"
 
-template <class Dtype>
-class allreduce_buf_check {};
-
-template <class Dtype>
-class allreduce_buf_fill {};
-
 template <class Dtype>
 struct sycl_allreduce_coll : sycl_base_coll<Dtype, allreduce_strategy_impl> {
     using coll_base = sycl_base_coll<Dtype, allreduce_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
+    using coll_base::host_send_buf;
+    using coll_base::host_recv_buf;
 
-    sycl_allreduce_coll(bench_init_attr init_attr)
-            : coll_base(init_attr) {}
+    sycl_allreduce_coll(bench_init_attr init_attr) : coll_base(init_attr) {} // base gets init_attr only
 
-    virtual void prepare(size_t elem_count) override {
-        size_t local_rank = coll_base::comm().rank();
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class allreduce_buf_fill<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx)
-                {
-                    send_buf_acc[e_idx] = local_rank;
-                    recv_buf_acc[e_idx] = 0;
-                });
-            });
-        }
-    }
+    virtual void finalize_internal(size_t elem_count, // checks buffer contents on the host
+                                   ccl::communicator& comm, // provides rank() and size()
+                                   ccl::stream& stream, // get_native() performs the copies
+                                   size_t rank_idx) override { // second index into the bufs
+        Dtype sbuf_expected = comm.rank(); // send data is compared against this rank id
+        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2); // sum of ranks
 
-    virtual void finalize(size_t elem_count) override {
-        bool unexpected_device_value = false;
-        Dtype sbuf_expected = coll_base::comm().rank();
-        Dtype rbuf_expected =
-            (coll_base::comm().size() - 1) * ((float)coll_base::comm().size() / 2);
+        size_t send_bytes = elem_count * base_coll::get_dtype_size();
+        size_t recv_bytes = elem_count * base_coll::get_dtype_size(); // same size as send
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class allreduce_buf_check<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx) mutable
-                {
-                    Dtype value = send_buf_acc[e_idx];
-                    if (value != sbuf_expected)
-                        unexpected_device_value = true;
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) { // USM: memcpy from the entry
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_bufs[b_idx][rank_idx], send_bytes)
+                    .wait(); // wait for the copy before host_send_buf is read
 
-                    value = recv_buf_acc[e_idx];
-                    if (value != rbuf_expected)
-                        unexpected_device_value = true;
-                });
-            });
-        }
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_bufs[b_idx][rank_idx], recv_bytes)
+                    .wait();
+            }
+            else { // entries are sycl_buffer_t<Dtype>*; read via accessors made without a handler
+                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx][rank_idx]));
+                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                auto send_buf_acc = send_buf->template get_access<mode::read>();
+                auto recv_buf_acc = recv_buf->template get_access<mode::read>();
 
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-            auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-            auto send_buf_acc = send_buf->template get_access<mode::read>();
-            auto recv_buf_acc = recv_buf->template get_access<mode::read>();
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_buf_acc.get_pointer(), send_bytes)
+                    .wait();
+
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_buf_acc.get_pointer(), recv_bytes)
+                    .wait();
+            }
+
+            Dtype value; // scratch for the element under test
 
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                Dtype value = send_buf_acc[e_idx];
+                value = host_send_buf[e_idx]; // checked on the host copy
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
 
-                value = recv_buf_acc[e_idx];
+                value = host_recv_buf[e_idx]; // every element must equal rbuf_expected
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
         }
-
-        if (unexpected_device_value)
-            ASSERT(0, "unexpected value on device");
     }
 };
 #endif /* CCL_ENABLE_SYCL */
diff --git a/examples/benchmark/src/alltoall/alltoall_strategy.hpp b/examples/benchmark/src/alltoall/alltoall_strategy.hpp
index 9a83cbc6d..234ffefa3 100644
--- a/examples/benchmark/src/alltoall/alltoall_strategy.hpp
+++ b/examples/benchmark/src/alltoall/alltoall_strategy.hpp
@@ -20,12 +20,20 @@ struct alltoall_strategy_impl {
         return "alltoall";
     }
 
+    size_t get_send_multiplier() { // presumably a send-size factor; confirm at the caller
+        return transport_data::get_comm_size();
+    }
+
+    size_t get_recv_multiplier() { // presumably a recv-size factor; confirm at the caller
+        return transport_data::get_comm_size();
+    }
+
     static const ccl::alltoall_attr& get_op_attr(const bench_exec_attr& bench_attr) {
         return bench_attr.get_attr<ccl::alltoall_attr>();
     }
 
-    template <class Dtype, class comm_t, class... Args>
-    void start_internal(comm_t& comm,
+    template <class Dtype, class... Args>
+    void start_internal(ccl::communicator& comm,
                         size_t count,
                         const Dtype send_buf,
                         Dtype recv_buf,
diff --git a/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp b/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp
index 942c48776..6e4458ca2 100644
--- a/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp
+++ b/examples/benchmark/src/alltoall/cpu_alltoall_coll.hpp
@@ -23,44 +23,34 @@ struct cpu_alltoall_coll : cpu_base_coll<Dtype, alltoall_strategy_impl> {
     using coll_base = cpu_base_coll<Dtype, alltoall_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
 
-    cpu_alltoall_coll(bench_init_attr init_attr)
-            : coll_base(init_attr, coll_base::comm().size(), coll_base::comm().size()) {}
+    cpu_alltoall_coll(bench_init_attr init_attr) : coll_base(init_attr) {} // base gets init_attr only
 
-    virtual void prepare(size_t elem_count) override {
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            for (size_t idx = 0; idx < coll_base::comm().size(); idx++) {
-                for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                    ((Dtype*)send_bufs[b_idx])[idx * elem_count + e_idx] = coll_base::comm().rank();
-                    ((Dtype*)recv_bufs[b_idx])[idx * elem_count + e_idx] = 0;
-                }
-            }
-        }
-    }
-
-    virtual void finalize(size_t elem_count) override {
-        Dtype sbuf_expected = coll_base::comm().rank();
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        Dtype sbuf_expected = comm.rank();
         Dtype rbuf_expected;
         Dtype value;
-        size_t comm_size = coll_base::comm().size();
+        int comm_size = comm.size();
+
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) {
-                value = ((Dtype*)send_bufs[b_idx])[e_idx];
+                value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
                 rbuf_expected = e_idx / elem_count;
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
 
-                value = ((Dtype*)recv_bufs[b_idx])[e_idx];
+                value = ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx];
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
diff --git a/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp b/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp
index 7794884c5..9400551f8 100644
--- a/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp
+++ b/examples/benchmark/src/alltoall/sycl_alltoall_coll.hpp
@@ -20,94 +20,72 @@
 #ifdef CCL_ENABLE_SYCL
 #include "sycl_coll.hpp"
 
-template <class Dtype>
-class alltoall_buf_check {};
-
-template <class Dtype>
-class alltoall_buf_fill {};
-
 template <class Dtype>
 struct sycl_alltoall_coll : sycl_base_coll<Dtype, alltoall_strategy_impl> {
     using coll_base = sycl_base_coll<Dtype, alltoall_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
+    using coll_base::host_send_buf;
+    using coll_base::host_recv_buf;
 
-    sycl_alltoall_coll(bench_init_attr init_attr)
-            : coll_base(init_attr, coll_base::comm().size(), coll_base::comm().size()) {}
+    sycl_alltoall_coll(bench_init_attr init_attr) : coll_base(init_attr) {} // base gets init_attr only
 
-    virtual void prepare(size_t elem_count) override {
-        size_t local_rank = coll_base::comm().rank();
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class alltoall_buf_fill<Dtype>>(range<1>{elem_count*coll_base::comm().size()}, [=](item<1> e_idx)
-                {
-                    send_buf_acc[e_idx] = local_rank;
-                    recv_buf_acc[e_idx] = 0;
-                });
-            });
-        }
-    }
+    virtual void finalize_internal(size_t elem_count, // checks buffer contents on the host
+                                   ccl::communicator& comm, // provides rank() and size()
+                                   ccl::stream& stream, // get_native() performs the copies
+                                   size_t rank_idx) override { // second index into the bufs
+        Dtype sbuf_expected = comm.rank(); // send data is compared against this rank id
+        int comm_size = comm.size(); // both buffers hold comm_size blocks of elem_count
 
-    virtual void finalize(size_t elem_count) override {
-        bool unexpected_device_value = false;
-        Dtype sbuf_expected = coll_base::comm().rank();
-        size_t comm_size = coll_base::comm().size();
+        size_t send_bytes = comm_size * elem_count * base_coll::get_dtype_size();
+        size_t recv_bytes = comm_size * elem_count * base_coll::get_dtype_size();
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class alltoall_buf_check<Dtype>>(range<1>{elem_count * comm_size}, [=](item<1> e_idx) mutable
-                {
-                    Dtype value = send_buf_acc[e_idx];
-                    Dtype rbuf_expected = static_cast<Dtype>(e_idx.get_id(0) / elem_count);
-                    if (value != sbuf_expected)
-                        unexpected_device_value = true;
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) { // USM: memcpy from the entry
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_bufs[b_idx][rank_idx], send_bytes)
+                    .wait(); // wait for the copy before host_send_buf is read
 
-                    value = recv_buf_acc[e_idx];
-                    if (value != rbuf_expected)
-                        unexpected_device_value = true;
-                });
-            });
-        }
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_bufs[b_idx][rank_idx], recv_bytes)
+                    .wait();
+            }
+            else { // entries are sycl_buffer_t<Dtype>*; read via accessors made without a handler
+                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx][rank_idx]));
+                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                auto send_buf_acc = send_buf->template get_access<mode::read>();
+                auto recv_buf_acc = recv_buf->template get_access<mode::read>();
 
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-            auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-            auto send_buf_acc = send_buf->template get_access<mode::read>();
-            auto recv_buf_acc = recv_buf->template get_access<mode::read>();
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_buf_acc.get_pointer(), send_bytes)
+                    .wait();
+
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_buf_acc.get_pointer(), recv_bytes)
+                    .wait();
+            }
+
+            Dtype value; // scratch for the element under test
 
             for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) {
-                Dtype value = send_buf_acc[e_idx];
+                value = host_send_buf[e_idx]; // checked on the host copy
                 Dtype rbuf_expected = e_idx / elem_count;
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
 
-                value = recv_buf_acc[e_idx];
+                value = host_recv_buf[e_idx]; // block k (e_idx / elem_count) must hold k
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
         }
-
-        if (unexpected_device_value)
-            ASSERT(0, "unexpected value on device");
     }
 };
 #endif /* CCL_ENABLE_SYCL */
diff --git a/examples/benchmark/src/alltoallv/alltoallv_strategy.hpp b/examples/benchmark/src/alltoallv/alltoallv_strategy.hpp
index 4456c229a..257f77067 100644
--- a/examples/benchmark/src/alltoallv/alltoallv_strategy.hpp
+++ b/examples/benchmark/src/alltoallv/alltoallv_strategy.hpp
@@ -16,13 +16,12 @@
 #pragma once
 
 struct alltoallv_strategy_impl {
-    size_t comm_size = 0;
     std::vector<size_t> send_counts;
     std::vector<size_t> recv_counts;
 
-    alltoallv_strategy_impl(size_t size) : comm_size(size) {
-        send_counts.resize(comm_size);
-        recv_counts.resize(comm_size);
+    alltoallv_strategy_impl() { // sizes both count vectors to transport_data::get_comm_size()
+        send_counts.resize(transport_data::get_comm_size());
+        recv_counts.resize(transport_data::get_comm_size());
     }
 
     alltoallv_strategy_impl(const alltoallv_strategy_impl&) = delete;
@@ -34,19 +33,27 @@ struct alltoallv_strategy_impl {
         return "alltoallv";
     }
 
+    size_t get_send_multiplier() { // presumably a send-size factor; confirm at the caller
+        return transport_data::get_comm_size();
+    }
+
+    size_t get_recv_multiplier() { // presumably a recv-size factor; confirm at the caller
+        return transport_data::get_comm_size();
+    }
+
     static const ccl::alltoallv_attr& get_op_attr(const bench_exec_attr& bench_attr) {
         return bench_attr.get_attr<ccl::alltoallv_attr>();
     }
 
-    template <class Dtype, class comm_t, class... Args>
-    void start_internal(comm_t& comm,
+    template <class Dtype, class... Args>
+    void start_internal(ccl::communicator& comm,
                         size_t count,
                         const Dtype send_buf,
                         Dtype recv_buf,
                         const bench_exec_attr& bench_attr,
                         req_list_t& reqs,
                         Args&&... args) {
-        for (size_t idx = 0; idx < comm_size; idx++) {
+        for (int idx = 0; idx < comm.size(); idx++) {
             send_counts[idx] = count;
             recv_counts[idx] = count;
         }
diff --git a/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp b/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp
index 50ac640ae..58eea5922 100644
--- a/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp
+++ b/examples/benchmark/src/alltoallv/cpu_alltoallv_coll.hpp
@@ -23,48 +23,33 @@ struct cpu_alltoallv_coll : cpu_base_coll<Dtype, alltoallv_strategy_impl> {
     using coll_base = cpu_base_coll<Dtype, alltoallv_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
 
-    cpu_alltoallv_coll(bench_init_attr init_attr)
-            : coll_base(init_attr,
-                        coll_base::comm().size(),
-                        coll_base::comm().size(),
-                        coll_base::comm().size()) {}
+    cpu_alltoallv_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
 
-    virtual void prepare(size_t elem_count) override {
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            for (size_t idx = 0; idx < coll_base::comm().size(); idx++) {
-                for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                    ((Dtype*)send_bufs[b_idx])[idx * elem_count + e_idx] = coll_base::comm().rank();
-                    ((Dtype*)recv_bufs[b_idx])[idx * elem_count + e_idx] = 0;
-                }
-            }
-        }
-    }
-
-    virtual void finalize(size_t elem_count) override {
-        Dtype sbuf_expected = coll_base::comm().rank();
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        Dtype sbuf_expected = comm.rank();
         Dtype rbuf_expected;
         Dtype value;
-        size_t comm_size = coll_base::comm().size();
+        int comm_size = comm.size();
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) {
-                value = ((Dtype*)send_bufs[b_idx])[e_idx];
+                value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
                 rbuf_expected = e_idx / elem_count;
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
 
-                value = ((Dtype*)recv_bufs[b_idx])[e_idx];
+                value = ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx];
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
diff --git a/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp b/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp
index 90a49218a..3b018de31 100644
--- a/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp
+++ b/examples/benchmark/src/alltoallv/sycl_alltoallv_coll.hpp
@@ -20,97 +20,72 @@
 #ifdef CCL_ENABLE_SYCL
 #include "sycl_coll.hpp"
 
-template <class Dtype>
-class alltoallv_buf_check {};
-
-template <class Dtype>
-class alltoallv_buf_fill {};
-
 template <class Dtype>
 struct sycl_alltoallv_coll : sycl_base_coll<Dtype, alltoallv_strategy_impl> {
     using coll_base = sycl_base_coll<Dtype, alltoallv_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
+    using coll_base::host_send_buf;
+    using coll_base::host_recv_buf;
 
-    sycl_alltoallv_coll(bench_init_attr init_attr)
-            : coll_base(init_attr,
-                        coll_base::comm().size(),
-                        coll_base::comm().size(),
-                        coll_base::comm().size()) {}
+    sycl_alltoallv_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
 
-    virtual void prepare(size_t elem_count) override {
-        size_t local_rank = coll_base::comm().rank();
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class alltoallv_buf_fill<Dtype>>(range<1>{elem_count*coll_base::comm().size()}, [=](item<1> e_idx)
-                {
-                    send_buf_acc[e_idx] = local_rank;
-                    recv_buf_acc[e_idx] = 0;
-                });
-            });
-        }
-    }
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        Dtype sbuf_expected = comm.rank();
+        int comm_size = comm.size();
 
-    virtual void finalize(size_t elem_count) override {
-        bool unexpected_device_value = false;
-        Dtype sbuf_expected = coll_base::comm().rank();
-        size_t comm_size = coll_base::comm().size();
+        size_t send_bytes = comm_size * elem_count * base_coll::get_dtype_size();
+        size_t recv_bytes = comm_size * elem_count * base_coll::get_dtype_size();
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class alltoallv_buf_check<Dtype>>(range<1>{elem_count * comm_size}, [=](item<1> e_idx) mutable
-                {
-                    Dtype value = send_buf_acc[e_idx];
-                    Dtype rbuf_expected = static_cast<Dtype>(e_idx.get_id(0) / elem_count);
-                    if (value != sbuf_expected)
-                        unexpected_device_value = true;
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_bufs[b_idx][rank_idx], send_bytes)
+                    .wait();
 
-                    value = recv_buf_acc[e_idx];
-                    if (value != rbuf_expected)
-                        unexpected_device_value = true;
-                });
-            });
-        }
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_bufs[b_idx][rank_idx], recv_bytes)
+                    .wait();
+            }
+            else {
+                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx][rank_idx]));
+                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                auto send_buf_acc = send_buf->template get_access<mode::read>();
+                auto recv_buf_acc = recv_buf->template get_access<mode::read>();
 
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-            auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-            auto send_buf_acc = send_buf->template get_access<mode::read>();
-            auto recv_buf_acc = recv_buf->template get_access<mode::read>();
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_buf_acc.get_pointer(), send_bytes)
+                    .wait();
+
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_buf_acc.get_pointer(), recv_bytes)
+                    .wait();
+            }
+
+            Dtype value;
 
             for (size_t e_idx = 0; e_idx < elem_count * comm_size; e_idx++) {
-                Dtype value = send_buf_acc[e_idx];
+                value = host_send_buf[e_idx];
                 Dtype rbuf_expected = e_idx / elem_count;
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
 
-                value = recv_buf_acc[e_idx];
+                value = host_recv_buf[e_idx];
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
         }
-
-        if (unexpected_device_value)
-            ASSERT(0, "unexpected value on device");
     }
 };
 #endif /* CCL_ENABLE_SYCL */
diff --git a/examples/benchmark/src/bcast/bcast_strategy.hpp b/examples/benchmark/src/bcast/bcast_strategy.hpp
index 0a1623b9f..369afd28d 100644
--- a/examples/benchmark/src/bcast/bcast_strategy.hpp
+++ b/examples/benchmark/src/bcast/bcast_strategy.hpp
@@ -20,12 +20,20 @@ struct bcast_strategy_impl {
         return "bcast";
     }
 
+    size_t get_send_multiplier() {
+        return 1;
+    }
+
+    size_t get_recv_multiplier() {
+        return 1;
+    }
+
     static const ccl::broadcast_attr& get_op_attr(const bench_exec_attr& bench_attr) {
         return bench_attr.get_attr<ccl::broadcast_attr>();
     }
 
-    template <class Dtype, class comm_t, class... Args>
-    void start_internal(comm_t& comm,
+    template <class Dtype, class... Args>
+    void start_internal(ccl::communicator& comm,
                         size_t count,
                         Dtype send_buf,
                         Dtype recv_buf,
@@ -33,6 +41,7 @@ struct bcast_strategy_impl {
                         req_list_t& reqs,
                         Args&&... args) {
         (void)send_buf;
-        reqs.push_back(ccl::broadcast(recv_buf, count, COLL_ROOT, comm, std::forward<Args>(args)...));
+        reqs.push_back(
+            ccl::broadcast(recv_buf, count, COLL_ROOT, comm, std::forward<Args>(args)...));
     }
 };
diff --git a/examples/benchmark/src/bcast/cpu_bcast_coll.hpp b/examples/benchmark/src/bcast/cpu_bcast_coll.hpp
index cf1066db2..d6b2d8945 100644
--- a/examples/benchmark/src/bcast/cpu_bcast_coll.hpp
+++ b/examples/benchmark/src/bcast/cpu_bcast_coll.hpp
@@ -22,31 +22,35 @@ template <class Dtype>
 struct cpu_bcast_coll : cpu_base_coll<Dtype, bcast_strategy_impl> {
     using coll_base = cpu_base_coll<Dtype, bcast_strategy_impl>;
     using coll_base::recv_bufs;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
 
-    cpu_bcast_coll(bench_init_attr init_attr)
-            : coll_base(init_attr) {}
+    cpu_bcast_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
 
-    virtual void prepare(size_t elem_count) override {
+    virtual void prepare_internal(size_t elem_count,
+                                  ccl::communicator& comm,
+                                  ccl::stream& stream,
+                                  size_t rank_idx) override {
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                if (coll_base::comm().rank() == COLL_ROOT)
-                    ((Dtype*)recv_bufs[b_idx])[e_idx] = e_idx;
+                if (comm.rank() == COLL_ROOT)
+                    ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx] = e_idx;
                 else
-                    ((Dtype*)recv_bufs[b_idx])[e_idx] = 0;
+                    ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx] = 0;
             }
         }
     }
 
-    virtual void finalize(size_t elem_count) override {
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
         Dtype value;
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                value = ((Dtype*)recv_bufs[b_idx])[e_idx];
+                value = ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx];
                 if (static_cast<size_t>(value) != e_idx) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << e_idx << ", got " << value << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected " << e_idx
+                              << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
diff --git a/examples/benchmark/src/bcast/sycl_bcast_coll.hpp b/examples/benchmark/src/bcast/sycl_bcast_coll.hpp
index 428e6cb95..863f62a01 100644
--- a/examples/benchmark/src/bcast/sycl_bcast_coll.hpp
+++ b/examples/benchmark/src/bcast/sycl_bcast_coll.hpp
@@ -20,71 +20,85 @@
 #ifdef CCL_ENABLE_SYCL
 #include "sycl_coll.hpp"
 
-template <class Dtype>
-class bcast_buf_check {};
-
-template <class Dtype>
-class bcast_buf_fill {};
-
 template <class Dtype>
 struct sycl_bcast_coll : sycl_base_coll<Dtype, bcast_strategy_impl> {
     using coll_base = sycl_base_coll<Dtype, bcast_strategy_impl>;
     using coll_base::recv_bufs;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
+    using coll_base::host_recv_buf;
 
-    sycl_bcast_coll(bench_init_attr init_attr)
-            : coll_base(init_attr) {}
+    sycl_bcast_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
+
+    virtual void prepare_internal(size_t elem_count,
+                                  ccl::communicator& comm,
+                                  ccl::stream& stream,
+                                  size_t rank_idx) override {
+        int comm_rank = comm.rank();
+
+        size_t count = elem_count;
+        size_t bytes = count * base_coll::get_dtype_size();
+
+        std::iota(host_recv_buf.begin(), host_recv_buf.end(), 0);
 
-    virtual void prepare(size_t elem_count) override {
-        size_t local_rank = coll_base::comm().rank();
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class bcast_buf_fill<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx)
-                {
-                    if (local_rank == COLL_ROOT)
-                        recv_buf_acc[e_idx] = e_idx.get_id(0);
-                    else
-                        recv_buf_acc[e_idx] = 0;
-                });
-            });
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
+                if (comm_rank == COLL_ROOT)
+                    stream.get_native()
+                        .memcpy(recv_bufs[b_idx][rank_idx], host_recv_buf.data(), bytes)
+                        .wait();
+                else
+                    stream.get_native().memset(recv_bufs[b_idx][rank_idx], 0, bytes).wait();
+            }
+            else {
+                stream.get_native()
+                    .submit([&](handler& h) {
+                        auto recv_buf =
+                            (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                        auto recv_buf_acc = recv_buf->template get_access<mode::write>(h);
+                        h.parallel_for(range<1>{ elem_count }, [=](item<1> e_idx) {
+                            if (comm_rank == COLL_ROOT)
+                                recv_buf_acc[e_idx] = e_idx.get_id(0);
+                            else
+                                recv_buf_acc[e_idx] = 0;
+                        });
+                    })
+                    .wait();
+            }
         }
     }
 
-    virtual void finalize(size_t elem_count) override {
-        bool unexpected_device_value = false;
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        size_t bytes = elem_count * base_coll::get_dtype_size();
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class bcast_buf_check<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx) mutable
-                {
-                    if (recv_buf_acc[e_idx] != e_idx.get_id(0))
-                        unexpected_device_value = true;
-                });
-            });
-        }
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_bufs[b_idx][rank_idx], bytes)
+                    .wait();
+            }
+            else {
+                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                auto recv_buf_acc = recv_buf->template get_access<mode::read>();
 
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-            auto recv_buf_acc = recv_buf->template get_access<mode::read>();
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_buf_acc.get_pointer(), bytes)
+                    .wait();
+            }
+
+            Dtype value;
 
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                Dtype value = recv_buf_acc[e_idx];
+                value = host_recv_buf[e_idx];
                 if (value != e_idx) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << (Dtype)e_idx << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected " << (Dtype)e_idx
+                              << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
         }
-
-        if (unexpected_device_value)
-            ASSERT(0, "unexpected value on device");
     }
 };
 #endif /* CCL_ENABLE_SYCL */
diff --git a/examples/benchmark/src/benchmark.cpp b/examples/benchmark/src/benchmark.cpp
index b9a3597f6..314d7aa32 100644
--- a/examples/benchmark/src/benchmark.cpp
+++ b/examples/benchmark/src/benchmark.cpp
@@ -27,15 +27,13 @@
 
 #include "benchmark.hpp"
 #include "declarations.hpp"
-
 #include "transport_impl.hpp"
 
-void do_regular(const ccl::communicator& comm,
+void do_regular(ccl::communicator& service_comm,
                 bench_exec_attr& bench_attr,
                 coll_list_t& all_colls,
                 req_list_t& reqs,
                 const user_options_t& options) {
-
     std::stringstream match_id_stream;
 
     for (auto dtype : all_dtypes) {
@@ -59,187 +57,121 @@ void do_regular(const ccl::communicator& comm,
             if (!find_key_val(reduction_op, reduction_names, reduction))
                 continue;
 
-            PRINT_BY_ROOT(
-                comm, "\ndtype: %s\nreduction: %s\n", dtype_name.c_str(), reduction.c_str());
+            PRINT_BY_ROOT(service_comm,
+                          "\ndtype: %s\nreduction: %s\n",
+                          dtype_name.c_str(),
+                          reduction.c_str());
 
             reqs.reserve(colls.size() * options.buf_count);
 
-            /* warm up */
-            PRINT_BY_ROOT(comm, "do warm up");
-
             bench_attr.reduction = reduction_op;
             bench_attr.set<ccl::operation_attr_id::to_cache>(true);
 
-            ccl::barrier(comm);
-
-            for (size_t count = options.min_elem_count; count <= options.max_elem_count;
-                 count *= 2) {
-                for (size_t iter_idx = 0; iter_idx < options.warmup_iters; iter_idx++) {
-                    for (size_t coll_idx = 0; coll_idx < colls.size(); coll_idx++) {
-                        auto& coll = colls[coll_idx];
-                        for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) {
-                            match_id_stream << "coll_" << coll->name()
-                                            << "_" << coll_idx << "_count_" << count 
-                                            << "_buf_" << buf_idx;
-                            bench_attr.set<ccl::operation_attr_id::match_id>(match_id_stream.str());
-                            match_id_stream.str("");
-                            coll->start(count, buf_idx, bench_attr, reqs);
-                        }
-                    }
-                    for (auto& req : reqs) {
-                        req.wait();
-                    }
-                    reqs.clear();
-                }
-            }
-
             std::ostringstream scolls;
             std::copy(options.coll_names.begin(),
                       options.coll_names.end(),
                       std::ostream_iterator<std::string>{ scolls, " " });
 
-            ccl::barrier(comm);
+            ccl::barrier(service_comm);
 
             /* benchmark with multiple equal sized buffer per collective */
-            if (options.buf_type == BUF_MULTI) {
-                PRINT_BY_ROOT(comm,
-                              "do multi-buffers benchmark\n"
-                              "#------------------------------------------------------------\n"
-                              "# Benchmarking: %s\n"
-                              "# ranks: %zu\n"
-                              "#------------------------------------------------------------\n"
+            PRINT_BY_ROOT(service_comm,
+                          "#------------------------------------------------------------\n"
+                          "# Benchmarking: %s\n"
+                          "# processes: %d\n"
+                          "#------------------------------------------------------------\n",
+                          scolls.str().c_str(),
+                          service_comm.size());
+
+            if (options.buf_count == 1) {
+                PRINT_BY_ROOT(service_comm, "%10s %12s %11s", "#bytes", "avg[usec]", "stddev[%]");
+            }
+            else {
+                PRINT_BY_ROOT(service_comm,
                               "%10s %13s %18s %11s",
-                              scolls.str().c_str(),
-                              comm.size(),
                               "#bytes",
                               "avg[usec]",
                               "avg_per_buf[usec]",
                               "stddev[%]");
-                bench_attr.set<ccl::operation_attr_id::to_cache>(true);
-                for (size_t count = options.min_elem_count; count <= options.max_elem_count;
-                     count *= 2) {
-                    try {
-                        // we store times for each collective separately,
-                        // but aggregate over buffers and iterations
-                        std::vector<double> coll_timers(colls.size(), 0);
-                        for (size_t coll_idx = 0; coll_idx < colls.size(); coll_idx++) {
-                            ccl::barrier(comm);
-
-                            double t1 = 0, t2 = 0, t = 0;
-
-                            for (size_t iter_idx = 0; iter_idx < options.iters; iter_idx++) {
-                                auto& coll = colls[coll_idx];
-                                // collective is configured to handle only
-                                // options.buf_count many buffers/executions 'at once'.
-                                // -> check cannot combine executions over iterations
-                                // -> wait and check and must be in this loop nest
-                                if (options.check_values) {
-                                    coll->prepare(count);
-                                }
-
-                                ccl::barrier(comm);
-
-                                t1 = when();
-
-                                for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) {
-                                    match_id_stream << "coll_" << coll->name()
-                                                    << "_" << coll_idx << "_count_" << count 
-                                                    << "_buf_" << buf_idx;
-                                    bench_attr.set<ccl::operation_attr_id::match_id>(match_id_stream.str());
-                                    match_id_stream.str("");
-                                    coll->start(count, buf_idx, bench_attr, reqs);
-                                }
-
-                                for (auto& req : reqs) {
-                                    req.wait();
-                                }
-                                reqs.clear();
-
-                                t2 = when();
-                                t += (t2 - t1);
-
-                                if (options.check_values) {
-                                    coll->finalize(count);
-                                }
-                            }
-                            coll_timers[coll_idx] += t;
-                        }
-                        print_timings(comm, coll_timers, options, count, dtype, reduction_op);
-                    }
-                    catch (const std::exception& ex) {
-                        ASSERT(0, "error on count %zu, reason: %s", count, ex.what());
-                    }
-                }
             }
-            else {
-                /* benchmark with single buffer per collective */
-                PRINT_BY_ROOT(comm,
-                              "do single-buffer benchmark\n"
-                              "#--------------------------------------\n"
-                              "# Benchmarking: %s\n"
-                              "# ranks: %zu\n"
-                              "#--------------------------------------\n"
-                              "%10s %12s %11s",
-                              scolls.str().c_str(),
-                              comm.size(),
-                              "#bytes",
-                              "avg[usec]",
-                              "stddev[%]");
-                size_t min_elem_count = options.min_elem_count * options.buf_count;
-                size_t max_elem_count = options.max_elem_count * options.buf_count;
 
-                bench_attr.set<ccl::operation_attr_id::to_cache>(true);
-                for (size_t count = min_elem_count; count <= max_elem_count; count *= 2) {
-                    try {
-                        // we store times for each collective separately,
-                        // but aggregate over iterations
-                        std::vector<double> coll_timers(colls.size(), 0);
+            for (size_t count = options.min_elem_count; count <= options.max_elem_count;
+                 count *= 2) {
+                size_t iter_count =
+                    get_iter_count(count * ccl::get_datatype_size(dtype), options.iters);
 
-                        double t1 = 0, t2 = 0;
+                size_t warmup_iter_count =
+                    get_iter_count(count * ccl::get_datatype_size(dtype), options.warmup_iters);
 
-                        for (size_t coll_idx = 0; coll_idx < colls.size(); coll_idx++) {
-                            auto& coll = colls[coll_idx];
+                try {
+                    // we store times for each collective separately,
+                    // but aggregate over buffers and iterations
+                    std::vector<double> coll_timers(colls.size(), 0);
+                    for (size_t coll_idx = 0; coll_idx < colls.size(); coll_idx++) {
+                        auto& coll = colls[coll_idx];
+
+                        ccl::barrier(service_comm);
+
+                        double t1 = 0, t2 = 0, t = 0;
+
+                        for (size_t iter_idx = 0; iter_idx < (iter_count + warmup_iter_count);
+                             iter_idx++) {
+                            // each collective is configured to handle only
+                            // options.buf_count buffers/executions 'at once'.
+                            // -> checks cannot be combined across iterations
+                            // -> wait and check must both be in this loop nest
+                            if (options.check_values) {
+                                coll->prepare(count);
+                            }
 
-                            ccl::barrier(comm);
+                            ccl::barrier(service_comm);
 
                             t1 = when();
 
-                            for (size_t iter_idx = 0; iter_idx < options.iters; iter_idx++) {
-                                match_id_stream << "coll_" << coll->name()
-                                                << "_" << coll_idx << "_single_count_" << count;
-                                bench_attr.set<ccl::operation_attr_id::match_id>(match_id_stream.str());
+                            for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) {
+                                match_id_stream << "coll_" << coll->name() << "_" << coll_idx
+                                                << "_count_" << count << "_buf_" << buf_idx;
+                                bench_attr.set<ccl::operation_attr_id::match_id>(
+                                    ccl::string_class(match_id_stream.str()));
                                 match_id_stream.str("");
-                                coll->start_single(count, bench_attr, reqs);
-                                for (auto& req : reqs) {
-                                    req.wait();
-                                }
-                                reqs.clear();
+                                coll->start(count, buf_idx, bench_attr, reqs);
                             }
 
+                            for (auto& req : reqs) {
+                                req.wait();
+                            }
+                            reqs.clear();
+
                             t2 = when();
 
-                            coll_timers[coll_idx] += (t2 - t1);
-                        }
+                            if (iter_idx >= warmup_iter_count) {
+                                t += (t2 - t1);
+                            }
 
-                        print_timings(comm, coll_timers, options, count, dtype, reduction_op);
-                    }
-                    catch (...) {
-                        ASSERT(0, "error on count %zu", count);
+                            if (options.check_values) {
+                                coll->finalize(count);
+                            }
+                        }
+                        coll_timers[coll_idx] += t;
                     }
+
+                    print_timings(
+                        service_comm, coll_timers, options, count, iter_count, dtype, reduction_op);
+                }
+                catch (const std::exception& ex) {
+                    ASSERT(0, "error on count %zu, reason: %s", count, ex.what());
                 }
-                PRINT_BY_ROOT(comm, "PASSED\n");
             }
         }
     }
 }
 
-void do_unordered(const ccl::communicator& comm,
+void do_unordered(ccl::communicator& service_comm,
                   bench_exec_attr& bench_attr,
                   coll_list_t& all_colls,
                   req_list_t& reqs,
                   const user_options_t& options) {
-
-    std::set<std::string> match_ids;
+    std::set<ccl::string_class> match_ids;
     std::stringstream match_id_stream;
 
     for (auto dtype : all_dtypes) {
@@ -263,14 +195,16 @@ void do_unordered(const ccl::communicator& comm,
             if (!find_key_val(reduction_op, reduction_names, reduction))
                 continue;
 
-            PRINT_BY_ROOT(
-                comm, "\ndtype: %s\nreduction: %s\n", dtype_name.c_str(), reduction.c_str());
+            PRINT_BY_ROOT(service_comm,
+                          "\ndtype: %s\nreduction: %s\n",
+                          dtype_name.c_str(),
+                          reduction.c_str());
 
-            size_t rank = comm.rank();
+            int rank = service_comm.rank();
 
             reqs.reserve(colls.size() * options.buf_count * (log2(options.max_elem_count) + 1));
 
-            PRINT_BY_ROOT(comm, "do unordered test");
+            PRINT_BY_ROOT(service_comm, "do unordered test");
             bench_attr.reduction = reduction_op;
             bench_attr.set<ccl::operation_attr_id::to_cache>(true);
 
@@ -281,14 +215,13 @@ void do_unordered(const ccl::communicator& comm,
                         for (size_t coll_idx = 0; coll_idx < colls.size(); coll_idx++) {
                             auto& coll = colls[coll_idx];
                             for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) {
-                                match_id_stream << "coll_" << coll->name()
-                                                << "_" << coll_idx << "_count_" << count 
-                                                << "_buf_" << buf_idx;
-                                bench_attr.set<ccl::operation_attr_id::match_id>(match_id_stream.str());
+                                match_id_stream << "coll_" << coll->name() << "_" << coll_idx
+                                                << "_count_" << count << "_buf_" << buf_idx;
+                                bench_attr.set<ccl::operation_attr_id::match_id>(
+                                    ccl::string_class(match_id_stream.str()));
                                 match_ids.insert(match_id_stream.str());
                                 match_id_stream.str("");
                                 coll->start(count, buf_idx, bench_attr, reqs);
-                                
                             }
                         }
                     }
@@ -298,10 +231,10 @@ void do_unordered(const ccl::communicator& comm,
                             auto& coll = colls[real_coll_idx];
                             for (size_t buf_idx = 0; buf_idx < options.buf_count; buf_idx++) {
                                 size_t real_buf_idx = options.buf_count - buf_idx - 1;
-                                match_id_stream << "coll_" << coll->name()
-                                                << "_" << real_coll_idx << "_count_" << count 
-                                                << "_buf_" << real_buf_idx;
-                                bench_attr.set<ccl::operation_attr_id::match_id>(match_id_stream.str());
+                                match_id_stream << "coll_" << coll->name() << "_" << real_coll_idx
+                                                << "_count_" << count << "_buf_" << real_buf_idx;
+                                bench_attr.set<ccl::operation_attr_id::match_id>(
+                                    ccl::string_class(match_id_stream.str()));
                                 match_ids.insert(match_id_stream.str());
                                 match_id_stream.str("");
                                 coll->start(count, real_buf_idx, bench_attr, reqs);
@@ -327,20 +260,18 @@ void do_unordered(const ccl::communicator& comm,
             catch (...) {
                 ASSERT(0, "error on coll completion");
             }
-            PRINT_BY_ROOT(comm, "PASSED\n");
+            PRINT_BY_ROOT(service_comm, "PASSED\n");
         }
     }
 }
 
 template <class Dtype>
-void create_cpu_colls(bench_init_attr& init_attr,
-                      user_options_t& options,
-                      coll_list_t& colls) {
-    using namespace sparse_detail;
-    using incremental_index_int_sparse_strategy =
-        sparse_allreduce_strategy_impl<int, sparse_detail::incremental_indices_distributor>;
-    using incremental_index_bf16_sparse_strategy =
-        sparse_allreduce_strategy_impl<ccl::bf16, sparse_detail::incremental_indices_distributor>;
+void create_cpu_colls(bench_init_attr& init_attr, user_options_t& options, coll_list_t& colls) {
+    // using namespace sparse_detail;
+    // using incremental_index_int_sparse_strategy =
+    //     sparse_allreduce_strategy_impl<int, sparse_detail::incremental_indices_distributor>;
+    // using incremental_index_bf16_sparse_strategy =
+    //     sparse_allreduce_strategy_impl<ccl::bfloat16, sparse_detail::incremental_indices_distributor>;
 
     std::stringstream error_messages_stream;
 
@@ -367,35 +298,35 @@ void create_cpu_colls(bench_init_attr& init_attr,
         else if (name == reduce_scatter_strategy_impl::class_name()) {
             colls.emplace_back(new cpu_reduce_scatter_coll<Dtype>(init_attr));
         }
-        else if (name.find(incremental_index_int_sparse_strategy::class_name()) !=
-                 std::string::npos) {
-            if (name.find(incremental_index_bf16_sparse_strategy::class_name()) !=
-                std::string::npos) {
-                if (is_bf16_enabled() == 0) {
-                    error_messages_stream << "bfloat16 is not supported for current CPU, skipping "
-                                          << name << ".\n";
-                    names_it = options.coll_names.erase(names_it);
-                    continue;
-                }
-#ifdef CCL_bf16_COMPILER
-                colls.emplace_back(
-                    new cpu_sparse_allreduce_coll<ccl::bf16,
-                                                  int64_t,
-                                                  sparse_detail::incremental_indices_distributor>(
-                        init_attr,
-                        sizeof(float) / sizeof(ccl::bf16),
-                        sizeof(float) / sizeof(ccl::bf16)));
-#else
-                error_messages_stream << "bfloat16 is not supported by current compiler, skipping "
-                                      << name << ".\n";
-                names_it = options.coll_names.erase(names_it);
-                continue;
-#endif
-            }
-            else {
-                colls.emplace_back(new cpu_sparse_allreduce_coll<Dtype, int64_t>(init_attr));
-            }
-        }
+        //         else if (name.find(incremental_index_int_sparse_strategy::class_name()) !=
+        //                  std::string::npos) {
+        //             if (name.find(incremental_index_bf16_sparse_strategy::class_name()) !=
+        //                 std::string::npos) {
+        //                 if (is_bf16_enabled() == 0) {
+        //                     error_messages_stream << "bfloat16 is not supported for current CPU, skipping "
+        //                                           << name << ".\n";
+        //                     names_it = options.coll_names.erase(names_it);
+        //                     continue;
+        //                 }
+        // #ifdef CCL_bf16_COMPILER
+        //                 colls.emplace_back(
+        //                     new cpu_sparse_allreduce_coll<ccl::bfloat16,
+        //                                                   int64_t,
+        //                                                   sparse_detail::incremental_indices_distributor>(
+        //                         init_attr,
+        //                         sizeof(float) / sizeof(ccl::bfloat16),
+        //                         sizeof(float) / sizeof(ccl::bfloat16)));
+        // #else
+        //                 error_messages_stream << "bfloat16 is not supported by current compiler, skipping "
+        //                                       << name << ".\n";
+        //                 names_it = options.coll_names.erase(names_it);
+        //                 continue;
+        // #endif
+        //             }
+        //             else {
+        //                 colls.emplace_back(new cpu_sparse_allreduce_coll<Dtype, int64_t>(init_attr));
+        //             }
+        //         }
         else {
             ASSERT(0, "create_colls error, unknown coll name: %s", name.c_str());
         }
@@ -415,13 +346,11 @@ void create_cpu_colls(bench_init_attr& init_attr,
 
 #ifdef CCL_ENABLE_SYCL
 template <class Dtype>
-void create_sycl_colls(bench_init_attr& init_attr,
-                       user_options_t& options,
-                       coll_list_t& colls) {
-    using incremental_index_int_sparse_strategy =
-        sparse_allreduce_strategy_impl<int, sparse_detail::incremental_indices_distributor>;
-    using incremental_index_bf16_sparse_strategy =
-        sparse_allreduce_strategy_impl<ccl::bf16, sparse_detail::incremental_indices_distributor>;
+void create_sycl_colls(bench_init_attr& init_attr, user_options_t& options, coll_list_t& colls) {
+    // using incremental_index_int_sparse_strategy =
+    //     sparse_allreduce_strategy_impl<int, sparse_detail::incremental_indices_distributor>;
+    // using incremental_index_bf16_sparse_strategy =
+    //     sparse_allreduce_strategy_impl<ccl::bfloat16, sparse_detail::incremental_indices_distributor>;
 
     std::stringstream error_messages_stream;
 
@@ -449,48 +378,48 @@ void create_sycl_colls(bench_init_attr& init_attr,
         else if (name == reduce_scatter_strategy_impl::class_name()) {
             colls.emplace_back(new sycl_reduce_scatter_coll<Dtype>(init_attr));
         }
-        else if (name.find(incremental_index_int_sparse_strategy::class_name()) !=
-                 std::string::npos) {
-            // TODO case is not supported yet
-            if (true) {
-                error_messages_stream << "SYCL coll: skipping " << name
-                                      << ", because it is not supported yet.\n";
-                names_it = options.coll_names.erase(names_it);
-                continue;
-            }
-            colls.emplace_back(new sycl_sparse_allreduce_coll<Dtype, int>(init_attr));
-        }
-        else if (name.find(incremental_index_bf16_sparse_strategy::class_name()) !=
-                 std::string::npos) {
-            // TODO case is not supported yet
-            if (true) {
-                error_messages_stream << "SYCL coll: skipping " << name
-                                      << ", because it is not supported yet.\n";
-                names_it = options.coll_names.erase(names_it);
-                continue;
-            }
-
-            if (is_bf16_enabled() == 0) {
-                error_messages_stream << "SYCL bf16 is not supported for current CPU, skipping "
-                                      << name << ".\n";
-                names_it = options.coll_names.erase(names_it);
-                continue;
-            }
-#ifdef CCL_bf16_COMPILER
-            colls.emplace_back(
-                new sycl_sparse_allreduce_coll<ccl::bf16,
-                                               int64_t,
-                                               sparse_detail::incremental_indices_distributor>(
-                    init_attr,
-                    sizeof(float) / sizeof(ccl::bf16),
-                    sizeof(float) / sizeof(ccl::bf16)));
-#else
-            error_messages_stream << "SYCL bf16 is not supported by current compiler, skipping "
-                                  << name << ".\n";
-            names_it = options.coll_names.erase(names_it);
-            continue;
-#endif
-        }
+        //         else if (name.find(incremental_index_int_sparse_strategy::class_name()) !=
+        //                  std::string::npos) {
+        //             // TODO case is not supported yet
+        //             if (true) {
+        //                 error_messages_stream << "SYCL coll: skipping " << name
+        //                                       << ", because it is not supported yet.\n";
+        //                 names_it = options.coll_names.erase(names_it);
+        //                 continue;
+        //             }
+        //             colls.emplace_back(new sycl_sparse_allreduce_coll<Dtype, int>(init_attr));
+        //         }
+        //         else if (name.find(incremental_index_bf16_sparse_strategy::class_name()) !=
+        //                  std::string::npos) {
+        //             // TODO case is not supported yet
+        //             if (true) {
+        //                 error_messages_stream << "SYCL coll: skipping " << name
+        //                                       << ", because it is not supported yet.\n";
+        //                 names_it = options.coll_names.erase(names_it);
+        //                 continue;
+        //             }
+
+        //             if (is_bf16_enabled() == 0) {
+        //                 error_messages_stream << "SYCL bf16 is not supported for current CPU, skipping "
+        //                                       << name << ".\n";
+        //                 names_it = options.coll_names.erase(names_it);
+        //                 continue;
+        //             }
+        // #ifdef CCL_bf16_COMPILER
+        //             colls.emplace_back(
+        //                 new sycl_sparse_allreduce_coll<ccl::bfloat16,
+        //                                                int64_t,
+        //                                                sparse_detail::incremental_indices_distributor>(
+        //                     init_attr,
+        //                     sizeof(float) / sizeof(ccl::bfloat16),
+        //                     sizeof(float) / sizeof(ccl::bfloat16)));
+        // #else
+        //             error_messages_stream << "SYCL bf16 is not supported by current compiler, skipping "
+        //                                   << name << ".\n";
+        //             names_it = options.coll_names.erase(names_it);
+        //             continue;
+        // #endif
+        //         }
         else {
             ASSERT(0, "create_colls error, unknown coll name: %s", name.c_str());
         }
@@ -513,9 +442,7 @@ void create_sycl_colls(bench_init_attr& init_attr,
 template <class Dtype>
 void create_colls(bench_init_attr& init_attr, user_options_t& options, coll_list_t& colls) {
     switch (options.backend) {
-        case BACKEND_HOST:
-            create_cpu_colls<Dtype>(init_attr, options, colls);
-            break;
+        case BACKEND_HOST: create_cpu_colls<Dtype>(init_attr, options, colls); break;
         case BACKEND_SYCL:
 #ifdef CCL_ENABLE_SYCL
             create_sycl_colls<Dtype>(init_attr, options, colls);
@@ -527,28 +454,24 @@ void create_colls(bench_init_attr& init_attr, user_options_t& options, coll_list
     }
 }
 
-/* Reason to leave a functor here: In order to call a function (create_colls())
- * with all dtypes (from ccl::datatype) the functor requires the implementation
- * of that function. */
-class create_colls_func {
-private:
-    bench_init_attr& init_attr;
-    user_options_t& options;
-    coll_list_t& colls;
-
-public:
-    create_colls_func(bench_init_attr& init_attr, user_options_t& options, coll_list_t& colls)
-            : init_attr(init_attr),
-              options(options),
-              colls(colls) {}
-
-    template <class Dtype>
-    void operator()(const Dtype& value) {
-        if (true == std::get<0>(value)) {
-            create_colls<typename Dtype::second_type>(init_attr, options, colls);
-        }
+void create_all_colls(bench_init_attr& init_attr, user_options_t& options, coll_list_t& colls) {
+    for (auto& dtype : options.dtypes) {
+        if (dtype == dtype_names[ccl::datatype::int8])
+            create_colls<int8_t>(init_attr, options, colls);
+        else if (dtype == dtype_names[ccl::datatype::int32])
+            create_colls<int32_t>(init_attr, options, colls);
+        else if (dtype == dtype_names[ccl::datatype::int64])
+            create_colls<int64_t>(init_attr, options, colls);
+        else if (dtype == dtype_names[ccl::datatype::uint64])
+            create_colls<uint64_t>(init_attr, options, colls);
+        else if (dtype == dtype_names[ccl::datatype::float32])
+            create_colls<float>(init_attr, options, colls);
+        else if (dtype == dtype_names[ccl::datatype::float64])
+            create_colls<double>(init_attr, options, colls);
+        else
+            ASSERT(0, "unexpected datatype %s", dtype.c_str());
     }
-};
+}
 
 int main(int argc, char* argv[]) {
     user_options_t options;
@@ -558,34 +481,24 @@ int main(int argc, char* argv[]) {
     bench_init_attr init_attr;
 
     if (parse_user_options(argc, argv, options)) {
-        PRINT("failed to parse user options");
         print_help_usage(argv[0]);
+        return -1;
     }
 
+    auto& transport = transport_data::instance();
+    transport.init_comms(options);
+
+    ccl::communicator& service_comm = transport.get_service_comm();
+
     init_attr.buf_count = options.buf_count;
     init_attr.max_elem_count = options.max_elem_count;
+    init_attr.ranks_per_proc = options.ranks_per_proc;
+    init_attr.sycl_mem_type = options.sycl_mem_type;
+    init_attr.sycl_usm_type = options.sycl_usm_type;
     init_attr.v2i_ratio = options.v2i_ratio;
 
-    host_data::init(transport_settings::instance().get_size(),
-                            transport_settings::instance().get_rank(),
-                            transport_settings::instance().get_kvs());
-#ifdef CCL_ENABLE_SYCL
-    if (options.backend == BACKEND_SYCL) {
-
-        auto dev = get_device(*host_data::comm_ptr);
-        cl::sycl::context ctx(dev);
-
-        device_data::init(transport_settings::instance().get_size(),
-                                   transport_settings::instance().get_rank(),
-                                   dev,
-                                   ctx,
-                                   transport_settings::instance().get_kvs());
-    }
-#endif
-
     try {
-        ccl_tuple_for_each(launch_dtypes, set_dtypes_func(options.dtypes));
-        ccl_tuple_for_each(launch_dtypes, create_colls_func(init_attr, options, colls));
+        create_all_colls(init_attr, options, colls);
     }
     catch (const std::runtime_error& e) {
         ASSERT(0, "cannot create coll objects: %s\n", e.what());
@@ -595,25 +508,23 @@ int main(int argc, char* argv[]) {
         return -1;
     }
 
-    ccl::communicator& comm = *host_data::comm_ptr;
-
     bench_exec_attr bench_attr{};
     bench_attr.init_all();
 
-    print_user_options(options, comm);
+    print_user_options(options, service_comm);
 
     if (options.coll_names.empty()) {
-        PRINT_BY_ROOT(comm, "empty coll list");
+        PRINT_BY_ROOT(service_comm, "empty coll list");
         print_help_usage(argv[0]);
         return -1;
     }
 
-    ccl::barrier(comm);
+    ccl::barrier(service_comm);
 
     switch (options.loop) {
         case LOOP_REGULAR: {
             // open and truncate CSV file if csv-output is requested
-            if (comm.rank() == 0 && !options.csv_filepath.empty()) {
+            if (service_comm.rank() == 0 && !options.csv_filepath.empty()) {
                 std::ofstream csvf;
                 csvf.open(options.csv_filepath, std::ios::trunc);
                 if (!csvf.is_open()) {
@@ -626,22 +537,18 @@ int main(int argc, char* argv[]) {
                      << std::endl;
                 csvf.close();
             }
-            ccl::barrier(comm);
-            do_regular(comm, bench_attr, colls, reqs, options);
+            ccl::barrier(service_comm);
+            do_regular(service_comm, bench_attr, colls, reqs, options);
             break;
         }
         case LOOP_UNORDERED: {
             // no timing is printed or exported here
-            ccl::barrier(comm);
-            do_unordered(comm, bench_attr, colls, reqs, options);
+            ccl::barrier(service_comm);
+            do_unordered(service_comm, bench_attr, colls, reqs, options);
             break;
         }
         default: ASSERT(0, "unknown loop %d", options.loop); break;
     }
 
-#ifdef CCL_ENABLE_SYCL
-    device_data::deinit();
-#endif
-    host_data::deinit();
     return 0;
 }
diff --git a/examples/benchmark/src/declarations.hpp b/examples/benchmark/src/declarations.hpp
index 72263566f..7bcd15950 100755
--- a/examples/benchmark/src/declarations.hpp
+++ b/examples/benchmark/src/declarations.hpp
@@ -51,7 +51,7 @@
 #include "reduce_scatter/sycl_reduce_scatter_coll.hpp"
 
 /* sparse_allreduce implementation */
-#include "sparse_allreduce/sparse_allreduce_base.hpp"
-#include "sparse_allreduce/sparse_allreduce_strategy.hpp"
-#include "sparse_allreduce/cpu_sparse_allreduce_coll.hpp"
-#include "sparse_allreduce/sycl_sparse_allreduce_coll.hpp"
+// #include "sparse_allreduce/sparse_allreduce_base.hpp"
+// #include "sparse_allreduce/sparse_allreduce_strategy.hpp"
+// #include "sparse_allreduce/cpu_sparse_allreduce_coll.hpp"
+// #include "sparse_allreduce/sycl_sparse_allreduce_coll.hpp"
diff --git a/examples/benchmark/src/reduce/cpu_reduce_coll.hpp b/examples/benchmark/src/reduce/cpu_reduce_coll.hpp
index e049fde6b..0a0c9d445 100644
--- a/examples/benchmark/src/reduce/cpu_reduce_coll.hpp
+++ b/examples/benchmark/src/reduce/cpu_reduce_coll.hpp
@@ -23,43 +23,34 @@ struct cpu_reduce_coll : cpu_base_coll<Dtype, reduce_strategy_impl> {
     using coll_base = cpu_base_coll<Dtype, reduce_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
 
-    cpu_reduce_coll(bench_init_attr init_attr)
-            : coll_base(init_attr) {}
-    virtual void prepare(size_t elem_count) override {
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                ((Dtype*)send_bufs[b_idx])[e_idx] = coll_base::comm().rank();
-                ((Dtype*)recv_bufs[b_idx])[e_idx] = 0;
-            }
-        }
-    }
+    cpu_reduce_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
 
-    virtual void finalize(size_t elem_count) override {
-        Dtype sbuf_expected = coll_base::comm().rank();
-        Dtype rbuf_expected =
-            (coll_base::comm().size() - 1) * ((float)coll_base::comm().size() / 2);
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        Dtype sbuf_expected = comm.rank();
+        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
         Dtype value;
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                value = ((Dtype*)send_bufs[b_idx])[e_idx];
+                value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
 
-                if (coll_base::comm().rank() != COLL_ROOT)
+                if (comm.rank() != COLL_ROOT)
                     continue;
 
-                value = ((Dtype*)recv_bufs[b_idx])[e_idx];
+                value = ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx];
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
diff --git a/examples/benchmark/src/reduce/reduce_strategy.hpp b/examples/benchmark/src/reduce/reduce_strategy.hpp
index 5efa01b70..fb3f5efeb 100644
--- a/examples/benchmark/src/reduce/reduce_strategy.hpp
+++ b/examples/benchmark/src/reduce/reduce_strategy.hpp
@@ -23,12 +23,20 @@ struct reduce_strategy_impl {
         return "reduce";
     }
 
+    size_t get_send_multiplier() {
+        return 1;
+    }
+
+    size_t get_recv_multiplier() {
+        return 1;
+    }
+
     static const ccl::reduce_attr& get_op_attr(const bench_exec_attr& bench_attr) {
         return bench_attr.get_attr<ccl::reduce_attr>();
     }
 
-    template <class Dtype, class comm_t, class... Args>
-    void start_internal(comm_t& comm,
+    template <class Dtype, class... Args>
+    void start_internal(ccl::communicator& comm,
                         size_t count,
                         const Dtype send_buf,
                         Dtype recv_buf,
diff --git a/examples/benchmark/src/reduce/sycl_reduce_coll.hpp b/examples/benchmark/src/reduce/sycl_reduce_coll.hpp
index c714cedc4..4b87ce244 100644
--- a/examples/benchmark/src/reduce/sycl_reduce_coll.hpp
+++ b/examples/benchmark/src/reduce/sycl_reduce_coll.hpp
@@ -20,99 +20,76 @@
 #ifdef CCL_ENABLE_SYCL
 #include "sycl_coll.hpp"
 
-template <class Dtype>
-class reduce_buf_check {};
-
-template <class Dtype>
-class reduce_buf_fill {};
-
 template <class Dtype>
 struct sycl_reduce_coll : sycl_base_coll<Dtype, reduce_strategy_impl> {
     using coll_base = sycl_base_coll<Dtype, reduce_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
+    using coll_base::host_send_buf;
+    using coll_base::host_recv_buf;
 
-    sycl_reduce_coll(bench_init_attr init_attr)
-            : coll_base(init_attr) {}
+    sycl_reduce_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
 
-    virtual void prepare(size_t elem_count) override {
-        size_t local_rank = coll_base::comm().rank();
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class reduce_buf_fill<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx)
-                {
-                    send_buf_acc[e_idx] = local_rank;
-                    recv_buf_acc[e_idx] = 0;
-                });
-            });
-        }
-    }
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        Dtype sbuf_expected = comm.rank();
+        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2);
+
+        int comm_rank = comm.rank();
 
-    virtual void finalize(size_t elem_count) override {
-        bool unexpected_device_value = false;
-        Dtype sbuf_expected = coll_base::comm().rank();
-        Dtype rbuf_expected =
-            (coll_base::comm().size() - 1) * ((float)coll_base::comm().size() / 2);
-        size_t local_rank = coll_base::comm().rank();
+        size_t send_bytes = elem_count * base_coll::get_dtype_size();
+        size_t recv_bytes = elem_count * base_coll::get_dtype_size();
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class reduce_buf_check<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx) mutable
-                {
-                    Dtype value = send_buf_acc[e_idx];
-                    if (value != sbuf_expected)
-                        unexpected_device_value = true;
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) {
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_bufs[b_idx][rank_idx], send_bytes)
+                    .wait();
 
-                    if (local_rank == COLL_ROOT) {
-                        value = recv_buf_acc[e_idx];
-                        if (value != rbuf_expected)
-                            unexpected_device_value = true;
-                    }
-                });
-            });
-        }
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_bufs[b_idx][rank_idx], recv_bytes)
+                    .wait();
+            }
+            else {
+                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx][rank_idx]));
+                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                auto send_buf_acc = send_buf->template get_access<mode::read>();
+                auto recv_buf_acc = recv_buf->template get_access<mode::read>();
 
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-            auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-            auto send_buf_acc = send_buf->template get_access<mode::read>();
-            auto recv_buf_acc = recv_buf->template get_access<mode::read>();
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_buf_acc.get_pointer(), send_bytes)
+                    .wait();
+
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_buf_acc.get_pointer(), recv_bytes)
+                    .wait();
+            }
+
+            Dtype value;
 
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                Dtype value = send_buf_acc[e_idx];
+                value = host_send_buf[e_idx];
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
 
-                if (local_rank != COLL_ROOT)
+                if (comm_rank != COLL_ROOT)
                     continue;
 
-                value = recv_buf_acc[e_idx];
+                value = host_recv_buf[e_idx];
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
         }
-
-        if (unexpected_device_value)
-            ASSERT(0, "unexpected value on device");
     }
 };
 #endif /* CCL_ENABLE_SYCL */
diff --git a/examples/benchmark/src/reduce_scatter/cpu_reduce_scatter_coll.hpp b/examples/benchmark/src/reduce_scatter/cpu_reduce_scatter_coll.hpp
index 93138ce12..f9bf0107a 100644
--- a/examples/benchmark/src/reduce_scatter/cpu_reduce_scatter_coll.hpp
+++ b/examples/benchmark/src/reduce_scatter/cpu_reduce_scatter_coll.hpp
@@ -23,49 +23,36 @@ struct cpu_reduce_scatter_coll : cpu_base_coll<Dtype, reduce_scatter_strategy_im
     using coll_base = cpu_base_coll<Dtype, reduce_scatter_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
 
-    cpu_reduce_scatter_coll(bench_init_attr init_attr)
-            : coll_base(init_attr) {}
+    cpu_reduce_scatter_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
 
-    virtual void prepare(size_t elem_count) override {
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                ((Dtype*)send_bufs[b_idx])[e_idx] = coll_base::comm().rank();
-                ((Dtype*)recv_bufs[b_idx])[e_idx] = 0;
-            }
-        }
-    }
-
-    virtual void finalize(size_t elem_count) override {
-        Dtype sbuf_expected = coll_base::comm().rank();
-        Dtype rbuf_expected =
-            (coll_base::comm().size() - 1) * ((float)coll_base::comm().size() / 2);
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override { // checks one local rank's buffers
+        Dtype sbuf_expected = comm.rank(); // each send element must still equal the rank
+        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2); // sum of 0..size-1
         Dtype value;
 
-        size_t recv_elem_count = elem_count / coll_base::comm().size();
+        size_t recv_elem_count = elem_count / comm.size(); // recv elements checked per buffer
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-
-                value = ((Dtype*)send_bufs[b_idx])[e_idx];
+                value = ((Dtype*)send_bufs[b_idx][rank_idx])[e_idx];
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
 
             for (size_t e_idx = 0; e_idx < recv_elem_count; e_idx++) {
-
-                value = ((Dtype*)recv_bufs[b_idx])[e_idx];
+                value = ((Dtype*)recv_bufs[b_idx][rank_idx])[e_idx];
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
diff --git a/examples/benchmark/src/reduce_scatter/reduce_scatter_strategy.hpp b/examples/benchmark/src/reduce_scatter/reduce_scatter_strategy.hpp
index 01a5c8adb..f71eb2d71 100644
--- a/examples/benchmark/src/reduce_scatter/reduce_scatter_strategy.hpp
+++ b/examples/benchmark/src/reduce_scatter/reduce_scatter_strategy.hpp
@@ -23,19 +23,26 @@ struct reduce_scatter_strategy_impl {
         return "reduce_scatter";
     }
 
+    size_t get_send_multiplier() { // NOTE(review): presumably a send buffer size factor; confirm
+        return 1;
+    }
+
+    size_t get_recv_multiplier() { // NOTE(review): presumably a recv buffer size factor; confirm
+        return 1;
+    }
+
     static const ccl::reduce_scatter_attr& get_op_attr(const bench_exec_attr& bench_attr) {
         return bench_attr.get_attr<ccl::reduce_scatter_attr>();
     }
 
-    template <class Dtype, class comm_t, class... Args>
-    void start_internal(comm_t& comm,
+    template <class Dtype, class... Args>
+    void start_internal(ccl::communicator& comm,
                         size_t send_count,
                         const Dtype send_buf,
                         Dtype recv_buf,
                         const bench_exec_attr& bench_attr,
                         req_list_t& reqs,
                         Args&&... args) {
-
         size_t recv_count = send_count / comm.size();
 
         if (recv_count == 0) {
diff --git a/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp b/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
index 9fa8701c9..186b02a15 100644
--- a/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
+++ b/examples/benchmark/src/reduce_scatter/sycl_reduce_scatter_coll.hpp
@@ -20,105 +20,75 @@
 #ifdef CCL_ENABLE_SYCL
 #include "sycl_coll.hpp"
 
-template <class Dtype>
-class reduce_scatter_sbuf_check {};
-
-template <class Dtype>
-class reduce_scatter_rbuf_check {};
-
-template <class Dtype>
-class reduce_scatter_buf_fill {};
-
 template <class Dtype>
 struct sycl_reduce_scatter_coll : sycl_base_coll<Dtype, reduce_scatter_strategy_impl> {
     using coll_base = sycl_base_coll<Dtype, reduce_scatter_strategy_impl>;
     using coll_base::send_bufs;
     using coll_base::recv_bufs;
-    using coll_base::single_send_buf;
-    using coll_base::single_recv_buf;
-    using coll_base::comm;
+    using coll_base::host_send_buf;
+    using coll_base::host_recv_buf;
 
-    sycl_reduce_scatter_coll(bench_init_attr init_attr)
-            : coll_base(init_attr) {}
+    sycl_reduce_scatter_coll(bench_init_attr init_attr) : coll_base(init_attr) {}
 
-    virtual void prepare(size_t elem_count) override {
-        size_t local_rank = coll_base::comm().rank();
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class reduce_scatter_buf_fill<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx)
-                {
-                    send_buf_acc[e_idx] = local_rank;
-                    recv_buf_acc[e_idx] = 0;
-                });
-            });
-        }
-    }
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override { // checks one local rank's buffers
+        Dtype sbuf_expected = comm.rank(); // each send element must still equal the rank
+        Dtype rbuf_expected = (comm.size() - 1) * ((float)comm.size() / 2); // sum of 0..size-1
 
-    virtual void finalize(size_t elem_count) override {
-        bool unexpected_device_value = false;
-        Dtype sbuf_expected = coll_base::comm().rank();
-        Dtype rbuf_expected =
-            (coll_base::comm().size() - 1) * ((float)coll_base::comm().size() / 2);
+        size_t recv_elem_count = elem_count / comm.size(); // recv elements checked per buffer
 
-        size_t recv_elem_count = elem_count / coll_base::comm().size();
+        size_t send_bytes = elem_count * base_coll::get_dtype_size();
+        size_t recv_bytes = recv_elem_count * base_coll::get_dtype_size(); // checked part only
 
         for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-                auto send_buf_acc = send_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class reduce_scatter_sbuf_check<Dtype>>(range<1>{elem_count}, [=](item<1> e_idx) mutable
-                {
-                    Dtype value = send_buf_acc[e_idx];
-                    if (value != sbuf_expected)
-                        unexpected_device_value = true;
-                });
-            });
-
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-                auto recv_buf_acc = recv_buf->template get_access<mode::write>(cgh);
-                cgh.parallel_for<class reduce_scatter_rbuf_check<Dtype>>(range<1>{recv_elem_count}, [=](item<1> e_idx) mutable
-                {
-                    Dtype value = recv_buf_acc[e_idx];
-                    if (value != rbuf_expected)
-                        unexpected_device_value = true;
-                });
-            });
-        }
+            if (base_coll::get_sycl_mem_type() == SYCL_MEM_USM) { // entries are raw pointers
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_bufs[b_idx][rank_idx], send_bytes)
+                    .wait();
+
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_bufs[b_idx][rank_idx], recv_bytes)
+                    .wait();
+            }
+            else { // entries are sycl_buffer_t<Dtype> objects
+                auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx][rank_idx]));
+                auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx][rank_idx]));
+                auto send_buf_acc = send_buf->template get_access<mode::read>();
+                auto recv_buf_acc = recv_buf->template get_access<mode::read>();
+
+                stream.get_native()
+                    .memcpy(host_send_buf.data(), send_buf_acc.get_pointer(), send_bytes)
+                    .wait();
+
+                stream.get_native()
+                    .memcpy(host_recv_buf.data(), recv_buf_acc.get_pointer(), recv_bytes)
+                    .wait();
+            }
 
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            auto send_buf = (static_cast<sycl_buffer_t<Dtype>*>(send_bufs[b_idx]));
-            auto recv_buf = (static_cast<sycl_buffer_t<Dtype>*>(recv_bufs[b_idx]));
-            auto send_buf_acc = send_buf->template get_access<mode::read>();
-            auto recv_buf_acc = recv_buf->template get_access<mode::read>();
+            Dtype value;
 
             for (size_t e_idx = 0; e_idx < elem_count; e_idx++) {
-                Dtype value = send_buf_acc[e_idx];
+                value = host_send_buf[e_idx];
                 if (value != sbuf_expected) {
-                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << sbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " send_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << sbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
 
             for (size_t e_idx = 0; e_idx < recv_elem_count; e_idx++) {
-                Dtype value = recv_buf_acc[e_idx];
+                value = host_recv_buf[e_idx];
                 if (value != rbuf_expected) {
-                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", elem_idx "
-                              << e_idx << ", expected " << rbuf_expected << ", got " << value
-                              << std::endl;
+                    std::cout << this->name() << " recv_bufs: buf_idx " << b_idx << ", rank_idx "
+                              << rank_idx << ", elem_idx " << e_idx << ", expected "
+                              << rbuf_expected << ", got " << value << std::endl;
                     ASSERT(0, "unexpected value");
                 }
             }
         }
-
-        if (unexpected_device_value)
-            ASSERT(0, "unexpected value on device");
     }
 };
 #endif /* CCL_ENABLE_SYCL */
diff --git a/examples/benchmark/src/sparse_allreduce/cpu_sparse_allreduce_coll.hpp b/examples/benchmark/src/sparse_allreduce/cpu_sparse_allreduce_coll.hpp
index 0aef29bba..1445a852a 100644
--- a/examples/benchmark/src/sparse_allreduce/cpu_sparse_allreduce_coll.hpp
+++ b/examples/benchmark/src/sparse_allreduce/cpu_sparse_allreduce_coll.hpp
@@ -13,6 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#if 0
+
 #pragma once
 
 template <class VType,
@@ -20,8 +22,7 @@ template <class VType,
           template <class> class IndicesDistributorType =
               sparse_detail::incremental_indices_distributor>
 struct cpu_sparse_allreduce_coll
-        : base_sparse_allreduce_coll<VType*, IType*, IndicesDistributorType>,
-          host_data {
+        : base_sparse_allreduce_coll<VType*, IType*, IndicesDistributorType> {
     using coll_base = base_sparse_allreduce_coll<VType*, IType*, IndicesDistributorType>;
     using coll_strategy = typename coll_base::coll_strategy;
 
@@ -33,140 +34,121 @@ struct cpu_sparse_allreduce_coll
     using coll_base::recv_vcount;
     using coll_base::fn_ctxs;
 
-    using coll_base::single_send_ibuf;
-    using coll_base::single_send_vbuf;
-    using coll_base::single_recv_ibuf;
-    using coll_base::single_recv_vbuf;
-    using coll_base::single_recv_icount;
-    using coll_base::single_recv_vcount;
-    using coll_base::single_fn_ctx;
-
     cpu_sparse_allreduce_coll(bench_init_attr init_attr,
                               size_t sbuf_size_modifier = 1,
                               size_t rbuf_size_modifier = 1)
-            : coll_base(init_attr, comm().size()) {
+            : coll_base(init_attr, transport_data::get_comm_size()) {
         int result = 0;
 
+        int comm_size = transport_data::get_comm_size();
+
         size_t max_elem_count = base_coll::get_max_elem_count();
-        size_t single_buf_max_elem_count = base_coll::get_single_buf_max_elem_count();
-
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            result = posix_memalign((void**)&send_ibufs[idx],
-                                    ALIGNMENT,
-                                    max_elem_count * sizeof(IType) * sbuf_size_modifier);
-            result |= posix_memalign((void**)&send_vbufs[idx],
-                                     ALIGNMENT,
-                                     max_elem_count * sizeof(VType) * sbuf_size_modifier);
-            result |=
-                posix_memalign((void**)&recv_ibufs[idx],
-                               ALIGNMENT,
-                               max_elem_count * sizeof(IType) * rbuf_size_modifier * comm().size());
-            result |=
-                posix_memalign((void**)&recv_vbufs[idx],
-                               ALIGNMENT,
-                               max_elem_count * sizeof(VType) * rbuf_size_modifier * comm().size());
-            if (result != 0) {
-                std::cerr << __FUNCTION__ << " - posix_memalign error: " << strerror(errno)
-                          << ", on buffer idx: " << idx << std::endl;
+
+        for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) {
+
+            for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+                result = posix_memalign((void**)&(send_ibufs[idx][rank_idx]),
+                                        ALIGNMENT,
+                                        max_elem_count * sizeof(IType) * sbuf_size_modifier);
+                result |= posix_memalign((void**)&(send_vbufs[idx][rank_idx]),
+                                         ALIGNMENT,
+                                         max_elem_count * sizeof(VType) * sbuf_size_modifier);
+                result |=
+                    posix_memalign((void**)&(recv_ibufs[idx][rank_idx]),
+                                   ALIGNMENT,
+                                   max_elem_count * sizeof(IType) * rbuf_size_modifier * comm_size);
+                result |=
+                    posix_memalign((void**)&(recv_vbufs[idx][rank_idx]),
+                                   ALIGNMENT,
+                                   max_elem_count * sizeof(VType) * rbuf_size_modifier * comm_size);
+                if (result != 0) { // NOTE: posix_memalign returns its error code; errno is not set
+                    std::cerr << __FUNCTION__ << " - posix_memalign error: " << strerror(errno)
+                              << ", on buffer idx: " << idx << std::endl;
+                }
             }
-        }
 
-        result = posix_memalign((void**)&single_send_ibuf,
-                                ALIGNMENT,
-                                single_buf_max_elem_count * sizeof(IType) * sbuf_size_modifier);
-        result |= posix_memalign((void**)&single_send_vbuf,
-                                 ALIGNMENT,
-                                 single_buf_max_elem_count * sizeof(VType) * sbuf_size_modifier);
-
-        result |= posix_memalign(
-            (void**)&single_recv_ibuf,
-            ALIGNMENT,
-            single_buf_max_elem_count * sizeof(IType) * rbuf_size_modifier * comm().size());
-        result |= posix_memalign(
-            (void**)&single_recv_vbuf,
-            ALIGNMENT,
-            single_buf_max_elem_count * sizeof(VType) * rbuf_size_modifier * comm().size());
-
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            std::memset(send_ibufs[idx], 0, max_elem_count * sizeof(IType));
-            std::memset(send_vbufs[idx], 0, max_elem_count * sizeof(VType) * sbuf_size_modifier);
-
-            std::memset(recv_ibufs[idx],
-                        0,
-                        max_elem_count * sizeof(IType) * rbuf_size_modifier * comm().size());
-            std::memset(recv_vbufs[idx],
-                        0,
-                        max_elem_count * sizeof(VType) * rbuf_size_modifier * comm().size());
-        }
+            for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) { // zero full allocations
+                std::memset(send_ibufs[idx][rank_idx], 0, max_elem_count * sizeof(IType) * sbuf_size_modifier);
+                std::memset(send_vbufs[idx][rank_idx], 0, max_elem_count * sizeof(VType) * sbuf_size_modifier);
 
-        std::memset(
-            single_send_ibuf, 0, single_buf_max_elem_count * sizeof(IType) * sbuf_size_modifier);
-        std::memset(
-            single_send_vbuf, 0, single_buf_max_elem_count * sizeof(VType) * sbuf_size_modifier);
-
-        std::memset(single_recv_ibuf,
-                    0,
-                    single_buf_max_elem_count * sizeof(IType) * rbuf_size_modifier * comm().size());
-        std::memset(single_recv_vbuf,
-                    0,
-                    single_buf_max_elem_count * sizeof(VType) * rbuf_size_modifier * comm().size());
-
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            fn_ctxs[idx].recv_ibuf = (void**)(&(recv_ibufs[idx]));
-            fn_ctxs[idx].recv_vbuf = (void**)(&(recv_vbufs[idx]));
-            fn_ctxs[idx].recv_ibuf_count = max_elem_count * rbuf_size_modifier * comm().size();
-            fn_ctxs[idx].recv_vbuf_count = max_elem_count * rbuf_size_modifier * comm().size();
+                std::memset(recv_ibufs[idx][rank_idx],
+                            0,
+                            max_elem_count * sizeof(IType) * rbuf_size_modifier * comm_size);
+                std::memset(recv_vbufs[idx][rank_idx],
+                            0,
+                            max_elem_count * sizeof(VType) * rbuf_size_modifier * comm_size);
+            }
+
+            for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) { // ctx -> recv buf slots
+                fn_ctxs[idx][rank_idx].recv_ibuf = (void**)(&(recv_ibufs[idx][rank_idx]));
+                fn_ctxs[idx][rank_idx].recv_vbuf = (void**)(&(recv_vbufs[idx][rank_idx]));
+                fn_ctxs[idx][rank_idx].recv_ibuf_count = max_elem_count * rbuf_size_modifier * comm_size;
+                fn_ctxs[idx][rank_idx].recv_vbuf_count = max_elem_count * rbuf_size_modifier * comm_size;
+            }
         }
-        single_fn_ctx.recv_ibuf = (void**)(&single_recv_ibuf);
-        single_fn_ctx.recv_vbuf = (void**)(&single_recv_vbuf);
-        single_fn_ctx.recv_ibuf_count =
-            single_buf_max_elem_count * rbuf_size_modifier * comm().size();
-        single_fn_ctx.recv_vbuf_count =
-            single_buf_max_elem_count * rbuf_size_modifier * comm().size();
     }
 
     ~cpu_sparse_allreduce_coll() {
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            free(send_ibufs[idx]);
-            free(send_vbufs[idx]);
-            free(recv_ibufs[idx]);
-            free(recv_vbufs[idx]);
-        }
 
-        free(single_send_ibuf);
-        free(single_send_vbuf);
-        free(single_recv_ibuf);
-        free(single_recv_vbuf);
+        for (size_t rank_idx = 0; rank_idx < base_coll::get_ranks_per_proc(); rank_idx++) {
+
+            for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+                free(send_ibufs[idx][rank_idx]); // allocated by posix_memalign in the ctor
+                free(send_vbufs[idx][rank_idx]);
+                free(recv_ibufs[idx][rank_idx]);
+                free(recv_vbufs[idx][rank_idx]);
+            }
+        }
     }
 
     virtual void prepare(size_t elem_count) override {
-        this->init_distributor({ 0, elem_count });
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            sparse_detail::fill_sparse_data(this->get_expected_recv_counts(elem_count),
-                                            *this->indices_distributor_impl,
-                                            elem_count,
-                                            send_ibufs[b_idx],
-                                            reinterpret_cast<VType*>(send_vbufs[b_idx]),
-                                            reinterpret_cast<VType*>(recv_vbufs[b_idx]),
-                                            fn_ctxs[b_idx].recv_vbuf_count,
-                                            recv_icount[b_idx],
-                                            recv_vcount[b_idx],
-                                            comm().rank());
+
+        auto& transport = transport_data::instance();
+        auto& comms = transport.get_comms();
+        size_t ranks_per_proc = base_coll::get_ranks_per_proc();
+
+        for (size_t rank_idx = 0; rank_idx < ranks_per_proc; rank_idx++) { // per local rank
+
+            auto& comm = comms[rank_idx];
+
+            this->init_distributor({ 0, elem_count }); // re-created on every rank iteration
+            for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
+                sparse_detail::fill_sparse_data(this->get_expected_recv_counts(elem_count),
+                                                *this->indices_distributor_impl,
+                                                elem_count,
+                                                send_ibufs[b_idx][rank_idx],
+                                                reinterpret_cast<VType*>(send_vbufs[b_idx][rank_idx]),
+                                                reinterpret_cast<VType*>(recv_vbufs[b_idx][rank_idx]),
+                                                fn_ctxs[b_idx][rank_idx].recv_vbuf_count,
+                                                recv_icount[b_idx], // NOTE(review): not per-rank; confirm
+                                                recv_vcount[b_idx], // NOTE(review): not per-rank; confirm
+                                                comm.rank());
+            }
         }
     }
 
     virtual void finalize(size_t elem_count) override {
-        for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
-            sparse_detail::check_sparse_result(this->get_expected_recv_counts(elem_count),
-                                               elem_count,
-                                               send_ibufs[b_idx],
-                                               static_cast<const VType*>(send_vbufs[b_idx]),
-                                               recv_ibufs[b_idx],
-                                               static_cast<const VType*>(recv_vbufs[b_idx]),
-                                               recv_icount[b_idx],
-                                               recv_vcount[b_idx],
-                                               comm().size(),
-                                               comm().rank());
+
+        auto& transport = transport_data::instance();
+        auto& comms = transport.get_comms();
+        size_t ranks_per_proc = base_coll::get_ranks_per_proc();
+
+        for (size_t rank_idx = 0; rank_idx < ranks_per_proc; rank_idx++) { // per local rank
+
+            auto& comm = comms[rank_idx];
+
+            for (size_t b_idx = 0; b_idx < base_coll::get_buf_count(); b_idx++) {
+                sparse_detail::check_sparse_result(this->get_expected_recv_counts(elem_count),
+                                                   elem_count,
+                                                   send_ibufs[b_idx][rank_idx],
+                                                   static_cast<const VType*>(send_vbufs[b_idx][rank_idx]),
+                                                   recv_ibufs[b_idx][rank_idx],
+                                                   static_cast<const VType*>(recv_vbufs[b_idx][rank_idx]),
+                                                   recv_icount[b_idx], // NOTE(review): not per-rank; confirm
+                                                   recv_vcount[b_idx], // NOTE(review): not per-rank; confirm
+                                                   comm.size(),
+                                                   comm.rank());
+            }
         }
     }
 
@@ -174,43 +156,30 @@ struct cpu_sparse_allreduce_coll
                        size_t buf_idx,
                        const bench_exec_attr& attr,
                        req_list_t& reqs) override {
-        coll_strategy::start_internal(comm(),
-                                      send_ibufs[buf_idx],
-                                      count,
-                                      send_vbufs[buf_idx],
-                                      count,
-                                      recv_ibufs[buf_idx],
-                                      recv_icount[buf_idx],
-                                      recv_vbufs[buf_idx],
-                                      recv_vcount[buf_idx],
-                                      attr,
-                                      reqs,
-                                      fn_ctxs[buf_idx],
-                                      coll_strategy::get_op_attr(attr));
-    }
 
-    virtual void start_single(size_t count,
-                              const bench_exec_attr& attr,
-                              req_list_t& reqs) override {
-        coll_strategy::start_internal(comm(),
-                                      single_send_ibuf,
-                                      count,
-                                      single_send_vbuf,
-                                      count,
-                                      static_cast<IType*>(single_recv_ibuf),
-                                      single_recv_icount,
-                                      reinterpret_cast<VType*>(single_recv_vbuf),
-                                      single_recv_vcount,
-                                      attr,
-                                      reqs,
-                                      single_fn_ctx,
-                                      coll_strategy::get_op_attr(attr));
-    }
-
-    /* global communicator for cpu collectives */
-    static ccl::communicator& comm() {
-        if (!host_data::comm_ptr) {
+        auto& transport = transport_data::instance();
+        auto& comms = transport.get_comms();
+        size_t ranks_per_proc = base_coll::get_ranks_per_proc();
+
+        for (size_t rank_idx = 0; rank_idx < ranks_per_proc; rank_idx++) {
+
+            auto& comm = comms[rank_idx];
+
+            coll_strategy::start_internal(comm,
+                                          send_ibufs[buf_idx][rank_idx],
+                                          count,
+                                          send_vbufs[buf_idx][rank_idx],
+                                          count,
+                                          recv_ibufs[buf_idx][rank_idx],
+                                          recv_icount[buf_idx],
+                                          recv_vbufs[buf_idx][rank_idx],
+                                          recv_vcount[buf_idx],
+                                          attr,
+                                          reqs,
+                                          fn_ctxs[buf_idx][rank_idx],
+                                          coll_strategy::get_op_attr(attr));
         }
-        return *host_data::comm_ptr;
     }
 };
+
+#endif
diff --git a/examples/benchmark/src/sparse_allreduce/sparse_allreduce_base.hpp b/examples/benchmark/src/sparse_allreduce/sparse_allreduce_base.hpp
index 25a0a90df..8f809f68b 100644
--- a/examples/benchmark/src/sparse_allreduce/sparse_allreduce_base.hpp
+++ b/examples/benchmark/src/sparse_allreduce/sparse_allreduce_base.hpp
@@ -27,24 +27,16 @@ struct base_sparse_allreduce_coll
     using coll_base = base_coll;
     using coll_strategy = sparse_allreduce_strategy_impl<IType, IndicesDistributorType>;
 
-    std::vector<ITypeNonMod*> send_ibufs;
-    std::vector<VTypeNonMod*> send_vbufs;
+    std::vector<std::vector<ITypeNonMod*>> send_ibufs;
+    std::vector<std::vector<VTypeNonMod*>> send_vbufs;
 
     /* buffers from these arrays will be reallocated inside completion callback */
-    std::vector<ITypeNonMod*> recv_ibufs;
-    std::vector<VTypeNonMod*> recv_vbufs;
+    std::vector<std::vector<ITypeNonMod*>> recv_ibufs;
+    std::vector<std::vector<VTypeNonMod*>> recv_vbufs;
 
     size_t* recv_icount = nullptr;
     size_t* recv_vcount = nullptr;
-    std::vector<sparse_allreduce_fn_ctx_t> fn_ctxs;
-
-    ITypeNonMod* single_send_ibuf = nullptr;
-    VTypeNonMod* single_send_vbuf = nullptr;
-    ITypeNonMod* single_recv_ibuf = nullptr;
-    VTypeNonMod* single_recv_vbuf = nullptr;
-    size_t single_recv_icount{};
-    size_t single_recv_vcount{};
-    sparse_allreduce_fn_ctx_t single_fn_ctx;
+    std::vector<std::vector<sparse_allreduce_fn_ctx_t>> fn_ctxs;
 
     base_sparse_allreduce_coll(bench_init_attr init_attr, size_t size)
             : base_coll(init_attr),
@@ -65,6 +57,14 @@ struct base_sparse_allreduce_coll
         send_vbufs.resize(init_attr.buf_count);
         recv_ibufs.resize(init_attr.buf_count);
         recv_vbufs.resize(init_attr.buf_count);
+
+        for (size_t idx = 0; idx < init_attr.buf_count; idx++) {
+            fn_ctxs[idx].resize(init_attr.ranks_per_proc);
+            send_ibufs[idx].resize(init_attr.ranks_per_proc);
+            send_vbufs[idx].resize(init_attr.ranks_per_proc);
+            recv_ibufs[idx].resize(init_attr.ranks_per_proc);
+            recv_vbufs[idx].resize(init_attr.ranks_per_proc);
+        }
     }
 
     virtual ~base_sparse_allreduce_coll() {
@@ -78,6 +78,20 @@ struct base_sparse_allreduce_coll
     }
 
     ccl::datatype get_dtype() const override final {
-        return ccl::native_type_info<typename std::remove_pointer<VType>::type>::ccl_datatype_value;
+        return ccl::native_type_info<typename std::remove_pointer<VType>::type>::dtype;
+    }
+
+    virtual void prepare_internal(size_t elem_count,
+                                  ccl::communicator& comm,
+                                  ccl::stream& stream,
+                                  size_t rank_idx) override {
+        ASSERT(0, "unexpected"); // always fails: this hook is not expected to be called
+    }
+
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        ASSERT(0, "unexpected"); // always fails: this hook is not expected to be called
     }
 };
diff --git a/examples/benchmark/src/sparse_allreduce/sparse_allreduce_strategy.hpp b/examples/benchmark/src/sparse_allreduce/sparse_allreduce_strategy.hpp
index dc6292475..c8d4a3c43 100644
--- a/examples/benchmark/src/sparse_allreduce/sparse_allreduce_strategy.hpp
+++ b/examples/benchmark/src/sparse_allreduce/sparse_allreduce_strategy.hpp
@@ -13,6 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#if 0
+
 #pragma once
 
 template <class type>
@@ -23,7 +25,7 @@ struct type_printer {
 };
 
 template <>
-struct type_printer<ccl::bf16> {
+struct type_printer<ccl::bfloat16> {
     static constexpr const char* sparse_class_name() {
         return "sparse_allreduce_bf16";
     }
@@ -130,7 +132,7 @@ struct sparse_allreduce_strategy_impl {
     using IndicesDistributor = IndicesDistributorType<remove_all_t<IType>>;
 
     size_t v2i_ratio;
-    size_t comm_size;
+    int comm_size;
     const size_t minimal_indices_count = 1;
 
     void init_distributor(const std::pair<size_t, size_t>& elem_range) {
@@ -138,7 +140,7 @@ struct sparse_allreduce_strategy_impl {
         indices_distributor_impl.reset(new IndicesDistributor(elem_range.first, indices_count));
     }
 
-    sparse_allreduce_strategy_impl(size_t v2i_ratio, size_t comm_size)
+    sparse_allreduce_strategy_impl(size_t v2i_ratio, int comm_size)
             : v2i_ratio(v2i_ratio),
               comm_size(comm_size) {}
 
@@ -153,8 +155,8 @@ struct sparse_allreduce_strategy_impl {
         return std::tuple<size_t, size_t>(indices_count, indices_count * vdim_count);
     }
 
-    template <class VType, class comm_t, class... Args>
-    void start_internal(comm_t& comm,
+    template <class VType, class... Args>
+    void start_internal(ccl::communicator& comm,
                         const IType send_ibuf,
                         size_t send_icount,
                         const VType send_vbuf,
@@ -197,3 +199,5 @@ struct sparse_allreduce_strategy_impl {
 
     std::unique_ptr<IndicesDistributor> indices_distributor_impl;
 };
+
+#endif
diff --git a/examples/benchmark/src/sparse_allreduce/sparse_detail.hpp b/examples/benchmark/src/sparse_allreduce/sparse_detail.hpp
index 4f07238ac..9a33c16e8 100644
--- a/examples/benchmark/src/sparse_allreduce/sparse_detail.hpp
+++ b/examples/benchmark/src/sparse_allreduce/sparse_detail.hpp
@@ -13,6 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#if 0
+
 #pragma once
 
 #include <algorithm>
@@ -89,14 +91,14 @@ void fill_sparse_data(const std::tuple<size_t, size_t>& expected_recv_counts,
     std::fill(recv_vbuf, recv_vbuf + recv_vbuf_count, ValueType{ 0 });
 }
 
-// override for ccl::bf16
+// override for ccl::bfloat16
 template <class IndexType, class IndicesDistributorType>
 void fill_sparse_data(const std::tuple<size_t, size_t>& expected_recv_counts,
                       IndicesDistributorType& generator,
                       size_t elem_count,
                       IndexType* send_ibuf,
-                      ccl::bf16* send_vbuf,
-                      ccl::bf16* recv_vbuf,
+                      ccl::bfloat16* send_vbuf,
+                      ccl::bfloat16* recv_vbuf,
                       size_t recv_vbuf_count,
                       size_t& recv_icount,
                       size_t& recv_vcount,
@@ -117,7 +119,7 @@ void fill_sparse_data(const std::tuple<size_t, size_t>& expected_recv_counts,
         }
     }
 
-    std::fill(recv_vbuf, recv_vbuf + recv_vbuf_count, ccl::bf16{ 0 });
+    std::fill(recv_vbuf, recv_vbuf + recv_vbuf_count, ccl::bfloat16{ 0 });
 
     // convert send_vbuf from float to send_vbuf in bf16
     convert_fp32_to_bf16_arrays(send_vbuf_from.data(), send_vbuf, elem_count);
@@ -132,8 +134,8 @@ void check_sparse_result(const std::tuple<size_t, size_t>& expected_recv_counts,
                          const ValueType* recv_vbuf,
                          size_t recv_icount,
                          size_t recv_vcount,
-                         size_t comm_size,
-                         size_t comm_rank) {
+                         int comm_size,
+                         int comm_rank) {
     size_t indices_count, vdim_count;
     std::tie(indices_count, vdim_count) = expected_recv_counts;
     vdim_count = vdim_count / indices_count;
@@ -150,7 +152,7 @@ void check_sparse_result(const std::tuple<size_t, size_t>& expected_recv_counts,
                    base_send_data.begin(),
                    std::bind(std::minus<ValueType>(), std::placeholders::_1, comm_rank));
 
-    for (size_t rank_index = 0; rank_index < comm_size; rank_index++) {
+    for (int rank_index = 0; rank_index < comm_size; rank_index++) {
         std::copy(send_ibuf, send_ibuf + indices_count, std::back_inserter(aggregated_indices));
 
         std::transform(base_send_data.begin(),
@@ -248,18 +250,18 @@ void check_sparse_result(const std::tuple<size_t, size_t>& expected_recv_counts,
     }
 }
 
-// override for ccl::bf16
+// override for ccl::bfloat16
 template <class IndexType>
 void check_sparse_result(const std::tuple<size_t, size_t>& expected_recv_counts,
                          size_t elem_count,
                          const IndexType* send_ibuf,
-                         const ccl::bf16* send_vbuf,
+                         const ccl::bfloat16* send_vbuf,
                          const IndexType* recv_ibuf,
-                         const ccl::bf16* recv_vbuf,
+                         const ccl::bfloat16* recv_vbuf,
                          size_t recv_icount,
                          size_t recv_vcount,
-                         size_t comm_size,
-                         size_t comm_rank) {
+                         int comm_size,
+                         int comm_rank) {
     size_t indices_count, vdim_count;
     std::tie(indices_count, vdim_count) = expected_recv_counts;
     vdim_count = vdim_count / indices_count;
@@ -270,7 +272,7 @@ void check_sparse_result(const std::tuple<size_t, size_t>& expected_recv_counts,
     std::vector<float> aggregated_values;
     aggregated_values.reserve(indices_count * vdim_count * comm_size);
 
-    for (size_t rank_index = 0; rank_index < comm_size; rank_index++) {
+    for (int rank_index = 0; rank_index < comm_size; rank_index++) {
         std::copy(send_ibuf, send_ibuf + indices_count, std::back_inserter(aggregated_indices));
 
         for (size_t i_idx = 0; i_idx < indices_count; i_idx++) {
@@ -312,7 +314,7 @@ void check_sparse_result(const std::tuple<size_t, size_t>& expected_recv_counts,
 
     // check received values
     std::vector<float> recv_vbuf_float(recv_vcount, float{ 0 });
-    convert_bf16_to_fp32_arrays(reinterpret_cast<void*>(const_cast<ccl::bf16*>(recv_vbuf)),
+    convert_bf16_to_fp32_arrays(reinterpret_cast<void*>(const_cast<ccl::bfloat16*>(recv_vbuf)),
                                  recv_vbuf_float.data(),
                                  recv_vcount);
 
@@ -387,3 +389,5 @@ void check_sparse_result(const std::tuple<size_t, size_t>& expected_recv_counts,
     }
 }
 } /* namespace sparse_detail */
+
+#endif
diff --git a/examples/benchmark/src/sparse_allreduce/sycl_sparse_allreduce_coll.hpp b/examples/benchmark/src/sparse_allreduce/sycl_sparse_allreduce_coll.hpp
index efd2c5a1c..ea7507a89 100644
--- a/examples/benchmark/src/sparse_allreduce/sycl_sparse_allreduce_coll.hpp
+++ b/examples/benchmark/src/sparse_allreduce/sycl_sparse_allreduce_coll.hpp
@@ -13,25 +13,21 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#if 0
+
 #pragma once
 
 #ifdef CCL_ENABLE_SYCL
 
 #include "sycl_coll.hpp"
 
-template <class kernel_value_type, class kernel_index_type>
-struct sparse_allreduce_kernel_name_bufs {};
-template <class kernel_value_type, class kernel_index_type>
-struct sparse_allreduce_kernel_name_single_bufs {};
-
 template <class VType,
           class IType,
           template <class> class IndicesDistributorType =
               sparse_detail::incremental_indices_distributor>
 struct sycl_sparse_allreduce_coll : base_sparse_allreduce_coll<cl::sycl::buffer<VType, 1>,
                                                                cl::sycl::buffer<IType, 1>,
-                                                               IndicesDistributorType>,
-                                    device_data {
+                                                               IndicesDistributorType> {
     using sycl_indices_t = cl::sycl::buffer<IType, 1>;
     using sycl_values_t = cl::sycl::buffer<VType, 1>;
     using coll_base =
@@ -46,94 +42,51 @@ struct sycl_sparse_allreduce_coll : base_sparse_allreduce_coll<cl::sycl::buffer<
     using coll_base::recv_vcount;
     using coll_base::fn_ctxs;
 
-    using coll_base::single_send_ibuf;
-    using coll_base::single_send_vbuf;
-    using coll_base::single_recv_ibuf;
-    using coll_base::single_recv_vbuf;
-    using coll_base::single_recv_icount;
-    using coll_base::single_recv_vcount;
-    using coll_base::single_fn_ctx;
-
     sycl_sparse_allreduce_coll(bench_init_attr init_attr,
                                size_t sbuf_size_modifier = 1,
                                size_t rbuf_size_modifier = 1)
-            : coll_base(init_attr, comm().size()) {
-        size_t max_elem_count = base_coll::get_max_elem_count();
-        size_t single_buf_max_elem_count = base_coll::get_single_buf_max_elem_count();
-
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            send_ibufs[idx] = new sycl_indices_t(max_elem_count * sbuf_size_modifier);
-            send_vbufs[idx] = new sycl_values_t(max_elem_count * sbuf_size_modifier);
-
-            recv_ibufs[idx] =
-                new sycl_indices_t(max_elem_count * rbuf_size_modifier * comm().size());
-            recv_vbufs[idx] =
-                new sycl_values_t(max_elem_count * rbuf_size_modifier * comm().size());
-
-            device_data::sycl_queue.submit([&](handler& cgh) {
-                auto send_ibuf = (static_cast<sycl_indices_t*>(send_ibufs[idx]));
-                auto send_vbuf = (static_cast<sycl_values_t*>(send_vbufs[idx]));
-
-                auto recv_ibuf = (static_cast<sycl_indices_t*>(recv_ibufs[idx]));
-                auto recv_vbuf = (static_cast<sycl_values_t*>(recv_vbufs[idx]));
-
-                auto send_ibuf_acc = send_ibuf->template get_access<mode::write>(cgh);
-                auto send_vbuf_acc = send_vbuf->template get_access<mode::write>(cgh);
-                auto recv_ibuf_acc = recv_ibuf->template get_access<mode::write>(cgh);
-                auto recv_vbuf_acc = recv_vbuf->template get_access<mode::write>(cgh);
-
-                cgh.parallel_for<struct sparse_allreduce_kernel_name_bufs<VType, IType>>
-                        (range<1>{max_elem_count*comm().size()}, [=](item<1> e_idx)
-                {
-                    if (e_idx.get_linear_id() < max_elem_count) {
-                        send_ibuf_acc[e_idx] = 0;
-                        send_vbuf_acc[e_idx] = 0;
-                    }
-                    recv_ibuf_acc[e_idx] = 0;
-                    recv_vbuf_acc[e_idx] = 0;
-                });
-            });
-        }
-
-        single_send_ibuf = new sycl_indices_t(single_buf_max_elem_count * sbuf_size_modifier);
-        single_send_vbuf = new sycl_values_t(single_buf_max_elem_count * sbuf_size_modifier);
-
-        single_recv_ibuf =
-            new sycl_indices_t(single_buf_max_elem_count * rbuf_size_modifier * comm().size());
-        single_recv_vbuf =
-            new sycl_values_t(single_buf_max_elem_count * rbuf_size_modifier * comm().size());
-
-        device_data::sycl_queue.submit([&](handler& cgh) {
-            auto send_ibuf = (static_cast<sycl_indices_t*>(single_send_ibuf));
-            auto send_vbuf = (static_cast<sycl_values_t*>(single_send_vbuf));
-
-            auto recv_ibuf = (static_cast<sycl_indices_t*>(single_recv_ibuf));
-            auto recv_vbuf = (static_cast<sycl_values_t*>(single_recv_vbuf));
-
-            auto send_ibuf_acc = send_ibuf->template get_access<mode::write>(cgh);
-            auto send_vbuf_acc = send_vbuf->template get_access<mode::write>(cgh);
-
-            auto recv_ibuf_acc = recv_ibuf->template get_access<mode::write>(cgh);
-            auto recv_vbuf_acc = recv_vbuf->template get_access<mode::write>(cgh);
-
-            cgh.parallel_for<struct sparse_allreduce_kernel_name_single_bufs<VType, IType>>
-                    (range<1>{ single_buf_max_elem_count * comm().size() }, [=](item<1> e_idx)
-            {
-                if (e_idx.get_linear_id() < single_buf_max_elem_count) {
-                    send_ibuf_acc[e_idx] = 0;
-                    send_vbuf_acc[e_idx] = 0;
-                }
-                recv_ibuf_acc[e_idx] = 0;
-                recv_vbuf_acc[e_idx] = 0;
-            });
-        });
-
-        for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
-            fn_ctxs[idx].recv_ibuf = (void**)(&(recv_ibufs[idx]));
-            fn_ctxs[idx].recv_vbuf = (void**)(&(recv_vbufs[idx]));
-        }
-        single_fn_ctx.recv_ibuf = (void**)(&single_recv_ibuf);
-        single_fn_ctx.recv_vbuf = (void**)(&single_recv_vbuf);
+            : coll_base(init_attr, transport_data::get_comm_size()) {
+        // size_t max_elem_count = base_coll::get_max_elem_count();
+
+        // int comm_size = transport_data::get_comm_size();
+
+        // for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+        //     send_ibufs[idx] = new sycl_indices_t(max_elem_count * sbuf_size_modifier);
+        //     send_vbufs[idx] = new sycl_values_t(max_elem_count * sbuf_size_modifier);
+
+        //     recv_ibufs[idx] =
+        //         new sycl_indices_t(max_elem_count * rbuf_size_modifier * comm_size);
+        //     recv_vbufs[idx] =
+        //         new sycl_values_t(max_elem_count * rbuf_size_modifier * comm_size);
+
+        //     stream.get_native().submit([&](handler& h) {
+        //         auto send_ibuf = (static_cast<sycl_indices_t*>(send_ibufs[idx]));
+        //         auto send_vbuf = (static_cast<sycl_values_t*>(send_vbufs[idx]));
+
+        //         auto recv_ibuf = (static_cast<sycl_indices_t*>(recv_ibufs[idx]));
+        //         auto recv_vbuf = (static_cast<sycl_values_t*>(recv_vbufs[idx]));
+
+        //         auto send_ibuf_acc = send_ibuf->template get_access<mode::write>(h);
+        //         auto send_vbuf_acc = send_vbuf->template get_access<mode::write>(h);
+        //         auto recv_ibuf_acc = recv_ibuf->template get_access<mode::write>(h);
+        //         auto recv_vbuf_acc = recv_vbuf->template get_access<mode::write>(h);
+
+        //         h.parallel_for(range<1>{max_elem_count*comm_size}, [=](item<1> e_idx)
+        //         {
+        //             if (e_idx.get_linear_id() < max_elem_count) {
+        //                 send_ibuf_acc[e_idx] = 0;
+        //                 send_vbuf_acc[e_idx] = 0;
+        //             }
+        //             recv_ibuf_acc[e_idx] = 0;
+        //             recv_vbuf_acc[e_idx] = 0;
+        //         });
+        //     }).wait();
+        // }
+
+        // for (size_t idx = 0; idx < base_coll::get_buf_count(); idx++) {
+        //     fn_ctxs[idx].recv_ibuf = (void**)(&(recv_ibufs[idx]));
+        //     fn_ctxs[idx].recv_vbuf = (void**)(&(recv_vbufs[idx]));
+        // }
     }
 
     virtual void prepare(size_t elem_count) override {
@@ -143,6 +96,22 @@ struct sycl_sparse_allreduce_coll : base_sparse_allreduce_coll<cl::sycl::buffer<
     virtual void finalize(size_t elem_count) override {
         // TODO not implemented yet
     }
+
+    virtual void prepare_internal(size_t elem_count,
+                                  ccl::communicator& comm,
+                                  ccl::stream& stream,
+                                  size_t rank_idx) override {
+        // no-op placeholder: nothing is prepared per rank for the SYCL sparse path
+        // TODO not implemented yet
+    }
+
+    virtual void finalize_internal(size_t elem_count,
+                                   ccl::communicator& comm,
+                                   ccl::stream& stream,
+                                   size_t rank_idx) override {
+        // no-op placeholder: nothing is finalized per rank for the SYCL sparse path
+        // TODO not implemented yet
+    }
     virtual void start(size_t count,
                        size_t buf_idx,
                        const bench_exec_attr& attr,
@@ -163,39 +132,8 @@ struct sycl_sparse_allreduce_coll : base_sparse_allreduce_coll<cl::sycl::buffer<
             stream(),
             coll_strategy::get_op_attr(attr));*/
     }
-
-    virtual void start_single(size_t count,
-                              const bench_exec_attr& attr,
-                              req_list_t& reqs) override {
-        /*coll_strategy::start_internal(
-            comm(),
-            *static_cast<const cl::sycl::buffer<IType>*>(single_send_ibuf),
-            count,
-            *reinterpret_cast<const cl::sycl::buffer<VType>*>(single_send_vbuf),
-            count,
-            *static_cast<cl::sycl::buffer<IType>*>(single_recv_ibuf),
-            single_recv_icount,
-            *reinterpret_cast<cl::sycl::buffer<VType>*>(single_recv_vbuf),
-            single_recv_vcount,
-            attr,
-            reqs,
-            single_fn_ctx,
-            stream(),
-            coll_strategy::get_op_attr(attr));*/
-    }
-
-    /* global communicator for cpu collectives */
-    static ccl::communicator& comm() {
-        if (!device_data::comm_ptr) {
-        }
-        return *device_data::comm_ptr;
-    }
-
-    static ccl::stream& stream() {
-        if (!device_data::stream_ptr) {
-        }
-        return *device_data::stream_ptr;
-    }
 };
 
 #endif /* CCL_ENABLE_SYCL */
+
+#endif
diff --git a/examples/benchmark/src/transport_impl.hpp b/examples/benchmark/src/transport_impl.hpp
index e63d525b1..3c6b81b9c 100644
--- a/examples/benchmark/src/transport_impl.hpp
+++ b/examples/benchmark/src/transport_impl.hpp
@@ -13,61 +13,138 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
 #include <mpi.h>
 
-#include "base.hpp"
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl.hpp>
+#include "sycl_coll.hpp"
+#endif /* CCL_ENABLE_SYCL */
+
 #include "transport.hpp"
 
-transport_settings::transport_settings() {
+transport_data::transport_data() {
     init_by_mpi();
+
+    service_comms.push_back(ccl::create_communicator(size, rank, kvs));
 }
 
-transport_settings::~transport_settings() {
+transport_data::~transport_data() {
     deinit_by_mpi();
 }
 
-transport_settings &transport_settings::instance() {
-    static transport_settings inst;
+transport_data& transport_data::instance() {
+    static transport_data inst;
     return inst;
 }
 
-int transport_settings::get_rank() const noexcept {
+size_t transport_data::get_comm_size() {
+    return transport_data::instance().get_comms()[0].size();
+}
+
+int transport_data::get_rank() const noexcept {
     return rank;
 }
 
-int transport_settings::get_size() const noexcept {
+int transport_data::get_size() const noexcept {
     return size;
 }
 
-ccl::shared_ptr_class<ccl::kvs> transport_settings::get_kvs() {
+ccl::shared_ptr_class<ccl::kvs> transport_data::get_kvs() {
     return kvs;
 }
 
-void transport_settings::init_by_mpi() {
-
+void transport_data::init_by_mpi() {
     ccl::init();
 
     MPI_Init(NULL, NULL);
     MPI_Comm_size(MPI_COMM_WORLD, &size);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
-    /* create CCL internal KVS */
     ccl::shared_ptr_class<ccl::kvs> kvs_candidate;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
         kvs_candidate = ccl::create_main_kvs();
         main_addr = kvs_candidate->get_address();
-        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
     }
     else {
-        MPI_Bcast((void *)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
         kvs_candidate = ccl::create_kvs(main_addr);
     }
     kvs = kvs_candidate;
 }
 
-void transport_settings::deinit_by_mpi() {
+void transport_data::deinit_by_mpi() {
     MPI_Finalize();
 }
+
+ccl::communicator& transport_data::get_service_comm() {
+    return service_comms[0];
+}
+
+std::vector<ccl::stream>& transport_data::get_streams() {
+    return streams;
+}
+
+std::vector<ccl::stream>& transport_data::get_bench_streams() {
+    return bench_streams;
+}
+
+void transport_data::init_comms(user_options_t& options) {
+    int ranks_per_proc = options.ranks_per_proc;
+
+    std::vector<int> local_ranks;
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        local_ranks.push_back(rank * ranks_per_proc + idx);
+    }
+
+    ccl::context context = ccl::create_context();
+    std::vector<ccl::device> devices;
+    std::map<int, ccl::device> r2d_map;
+
+    if (options.backend == BACKEND_HOST) {
+        for (int idx = 0; idx < ranks_per_proc; idx++) {
+            streams.push_back(ccl::create_stream());
+            bench_streams.push_back(ccl::create_stream());
+            devices.push_back(ccl::create_device());
+        }
+    }
+#ifdef CCL_ENABLE_SYCL
+    else if (options.backend == BACKEND_SYCL) {
+        auto sycl_queues = create_sycl_queues(sycl_dev_names[options.sycl_dev_type], local_ranks);
+        ASSERT(!sycl_queues.empty(), "queues should contain at least one queue");
+        ASSERT((int)sycl_queues.size() == ranks_per_proc, "ranks and queues sizes should match");
+
+        auto sycl_context = sycl_queues[0].get_context();
+        context = ccl::create_context(sycl_context);
+
+        for (int idx = 0; idx < ranks_per_proc; idx++) {
+            streams.push_back(ccl::create_stream(sycl_queues[idx]));
+            auto q = sycl::queue(sycl_queues[idx].get_context(), sycl_queues[idx].get_device());
+            bench_streams.push_back(ccl::create_stream(q));
+            devices.push_back(ccl::create_device(sycl_queues[idx].get_device()));
+            // TODO: multidevice unsupported yet
+            // ASSERT(sycl_context == sycl_queues[idx].get_context(),
+            //    "all sycl queues should be from the same sycl context");
+        }
+    }
+#endif /* CCL_ENABLE_SYCL */
+    else {
+        ASSERT(0, "unknown backend %d", (int)options.backend);
+    }
+
+    for (int idx = 0; idx < ranks_per_proc; idx++) {
+        r2d_map.emplace(local_ranks[idx], devices[idx]);
+    }
+
+    comms = ccl::create_communicators(size * ranks_per_proc, r2d_map, context, kvs);
+
+    ASSERT((int)comms.size() == ranks_per_proc,
+           "unexpected comms size %zu, expected %d",
+           comms.size(),
+           ranks_per_proc);
+}
+
+std::vector<ccl::communicator>& transport_data::get_comms() {
+    return comms;
+}
diff --git a/examples/common/CMakeLists.txt b/examples/common/CMakeLists.txt
index 9ff37b6e3..a5df2d71c 100644
--- a/examples/common/CMakeLists.txt
+++ b/examples/common/CMakeLists.txt
@@ -25,5 +25,6 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC rt)
     target_link_libraries(${executable} PUBLIC m)
     target_link_libraries(${executable} PUBLIC dl)
-    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/common)
+    target_link_libraries(${executable} PUBLIC mpi)
+    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/common OPTIONAL)
 endforeach()
diff --git a/examples/common/version.cpp b/examples/common/version.cpp
index 2cd8e4a19..bdbf47078 100644
--- a/examples/common/version.cpp
+++ b/examples/common/version.cpp
@@ -30,13 +30,14 @@ int main() {
            CCL_PRODUCT_FULL);
 
     printf("\nRuntime CCL library version:\nmajor: %d\nminor: %d\nupdate: %d\n"
-           "Product: %s\nBuild date: %s\nFull: %s\n",
+           "Product: %s\nBuild date: %s\nFull: %s\ncl_backend name: %s\n",
            version.major,
            version.minor,
            version.update,
            version.product_status,
            version.build_date,
-           version.full);
+           version.full,
+           version.cl_backend_name.c_str());
 
     printf("\noneCCL specification version: %s\n", ONECCL_SPEC_VERSION);
 
diff --git a/examples/cpu/CMakeLists.txt b/examples/cpu/CMakeLists.txt
index 07da2ea50..58099643c 100644
--- a/examples/cpu/CMakeLists.txt
+++ b/examples/cpu/CMakeLists.txt
@@ -27,6 +27,6 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC stdc++)
     target_link_libraries(${executable} PRIVATE m)
     target_link_libraries(${executable} PUBLIC mpi)
-    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/cpu)
+    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/cpu OPTIONAL)
 endforeach()
 
diff --git a/examples/cpu/allgatherv.cpp b/examples/cpu/allgatherv.cpp
index 5ddc50e8a..393710f9f 100644
--- a/examples/cpu/allgatherv.cpp
+++ b/examples/cpu/allgatherv.cpp
@@ -15,14 +15,12 @@
 */
 #include "base.hpp"
 
-void run_collective(
-    const char* cmd_name,
-    std::vector<float>& send_buf,
-    std::vector<float>& recv_buf,
-    std::vector<size_t>& recv_counts,
-    const ccl::communicator& comm,
-    const ccl::allgatherv_attr& attr) {
-
+void run_collective(const char* cmd_name,
+                    std::vector<float>& send_buf,
+                    std::vector<float>& recv_buf,
+                    std::vector<size_t>& recv_counts,
+                    const ccl::communicator& comm,
+                    const ccl::allgatherv_attr& attr) {
     std::chrono::system_clock::duration exec_time{ 0 };
     float expected = send_buf.size();
     float received;
@@ -31,13 +29,8 @@ void run_collective(
 
     for (size_t idx = 0; idx < ITERS; ++idx) {
         auto start = std::chrono::system_clock::now();
-        ccl::allgatherv(
-            send_buf.data(),
-            send_buf.size(),
-            recv_buf.data(),
-            recv_counts,
-            comm,
-            attr).wait();
+        ccl::allgatherv(send_buf.data(), send_buf.size(), recv_buf.data(), recv_counts, comm, attr)
+            .wait();
         exec_time += std::chrono::system_clock::now() - start;
     }
 
@@ -58,14 +51,12 @@ void run_collective(
               << ", us" << std::endl;
 }
 
-void run_collective_vector(
-    const char* cmd_name,
-    std::vector<float>& send_buf,
-    std::vector<float*>& recv_bufs,
-    std::vector<size_t>& recv_counts,
-    const ccl::communicator& comm,
-    const ccl::allgatherv_attr& attr) {
-
+void run_collective_vector(const char* cmd_name,
+                           std::vector<float>& send_buf,
+                           std::vector<float*>& recv_bufs,
+                           std::vector<size_t>& recv_counts,
+                           const ccl::communicator& comm,
+                           const ccl::allgatherv_attr& attr) {
     std::chrono::system_clock::duration exec_time{ 0 };
     float expected = send_buf.size();
     float received;
@@ -74,13 +65,8 @@ void run_collective_vector(
 
     for (size_t idx = 0; idx < ITERS; ++idx) {
         auto start = std::chrono::system_clock::now();
-        ccl::allgatherv(
-            send_buf.data(),
-            send_buf.size(),
-            recv_bufs,
-            recv_counts,
-            comm,
-            attr).wait();
+        ccl::allgatherv(send_buf.data(), send_buf.size(), recv_bufs, recv_counts, comm, attr)
+            .wait();
         exec_time += std::chrono::system_clock::now() - start;
     }
 
@@ -104,7 +90,6 @@ void run_collective_vector(
 }
 
 int main() {
-
     ccl::init();
 
     int size, rank;
@@ -114,50 +99,46 @@ int main() {
 
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
+    auto kvs_attr = ccl::create_kvs_attr();
     if (rank == 0) {
-        kvs = ccl::create_main_kvs();
+        kvs = ccl::create_main_kvs(kvs_attr);
         main_addr = kvs->get_address();
         MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
     }
     else {
         MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
-        kvs = ccl::create_kvs(main_addr);
+        kvs = ccl::create_kvs(main_addr, kvs_attr);
     }
 
     auto dev = ccl::create_device();
     auto ctx = ccl::create_context();
-    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs);
+    auto comm_attr = ccl::create_comm_attr();
+    auto comm = ccl::create_communicator(size, rank, dev, ctx, kvs, comm_attr);
     auto attr = ccl::create_operation_attr<ccl::allgatherv_attr>();
 
-    MSG_LOOP(
-        comm,
-
-        std::vector<float> send_buf(msg_count, static_cast<float>(msg_count));
-        std::vector<float> recv_buf(comm.size() * msg_count, 0);
-        std::vector<float*> recv_bufs(comm.size(), nullptr);
-        std::vector<size_t> recv_counts(comm.size(), msg_count);
-
-        for (size_t idx = 0; idx < comm.size(); idx++)
-            recv_bufs[idx] = new float[msg_count];
-
-        attr.set<ccl::operation_attr_id::to_cache>(false);
-        run_collective(
-            "warmup_allgatherv", send_buf, recv_buf, recv_counts, comm, attr);
-        run_collective_vector(
-            "warmup_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr);
-
-        attr.set<ccl::operation_attr_id::to_cache>(true);
-        run_collective(
-            "persistent_allgatherv", send_buf, recv_buf, recv_counts, comm, attr);
-        run_collective_vector(
-            "persistent_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr);
-
-        attr.set<ccl::operation_attr_id::to_cache>(false);
-        run_collective(
-            "regular_allgatherv", send_buf, recv_buf, recv_counts, comm, attr);
-        run_collective_vector(
-            "regular_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr);
-    );
+    MSG_LOOP(comm,
+
+             std::vector<float> send_buf(msg_count, static_cast<float>(msg_count));
+             std::vector<float> recv_buf(comm.size() * msg_count, 0);
+             // recv_data owns the per-rank buffers; recv_bufs only holds pointers into it
+             std::vector<std::vector<float>> recv_data(comm.size(), std::vector<float>(msg_count));
+             std::vector<float*> recv_bufs(comm.size(), nullptr);
+             std::vector<size_t> recv_counts(comm.size(), msg_count);
+             for (int idx = 0; idx < comm.size(); idx++) recv_bufs[idx] = recv_data[idx].data();
+             attr.set<ccl::operation_attr_id::to_cache>(false);
+             run_collective("warmup_allgatherv", send_buf, recv_buf, recv_counts, comm, attr);
+             run_collective_vector(
+                 "warmup_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr);
+
+             attr.set<ccl::operation_attr_id::to_cache>(true);
+             run_collective("persistent_allgatherv", send_buf, recv_buf, recv_counts, comm, attr);
+             run_collective_vector(
+                 "persistent_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr);
+
+             attr.set<ccl::operation_attr_id::to_cache>(false);
+             run_collective("regular_allgatherv", send_buf, recv_buf, recv_counts, comm, attr);
+             run_collective_vector(
+                 "regular_allgatherv_vector", send_buf, recv_bufs, recv_counts, comm, attr););
 
     MPI_Finalize();
 
diff --git a/examples/cpu/allreduce.cpp b/examples/cpu/allreduce.cpp
index 14fe29b72..944ae3afb 100644
--- a/examples/cpu/allreduce.cpp
+++ b/examples/cpu/allreduce.cpp
@@ -27,12 +27,9 @@ void run_collective(const char* cmd_name,
 
     for (size_t idx = 0; idx < ITERS; ++idx) {
         auto start = std::chrono::system_clock::now();
-        ccl::allreduce(send_buf.data(),
-                       recv_buf.data(),
-                       recv_buf.size(),
-                       ccl::reduction::sum,
-                       comm,
-                       attr).wait();
+        ccl::allreduce(
+            send_buf.data(), recv_buf.data(), recv_buf.size(), ccl::reduction::sum, comm, attr)
+            .wait();
         exec_time += std::chrono::system_clock::now() - start;
     }
 
@@ -53,7 +50,6 @@ void run_collective(const char* cmd_name,
 }
 
 int main() {
-
     ccl::init();
 
     int size, rank;
diff --git a/examples/cpu/alltoallv.cpp b/examples/cpu/alltoallv.cpp
index 639f06130..bca29d482 100644
--- a/examples/cpu/alltoallv.cpp
+++ b/examples/cpu/alltoallv.cpp
@@ -34,7 +34,7 @@ void run_collective(const char* cmd_name,
         std::fill(recv_buf.begin(), recv_buf.end(), 0);
         size_t elem_idx = 0;
 
-        for (size_t rank_idx = 0; rank_idx < comm.size(); rank_idx++) {
+        for (int rank_idx = 0; rank_idx < comm.size(); rank_idx++) {
             for (size_t idx = 0; idx < send_counts[rank_idx]; idx++) {
                 send_buf[elem_idx] = comm.rank();
                 elem_idx++;
@@ -42,19 +42,15 @@ void run_collective(const char* cmd_name,
         }
 
         auto start = std::chrono::system_clock::now();
-        ccl::alltoallv(send_buf.data(),
-                       send_counts,
-                       recv_buf.data(),
-                       recv_counts,
-                       comm,
-                       attr).wait();
+        ccl::alltoallv(send_buf.data(), send_counts, recv_buf.data(), recv_counts, comm, attr)
+            .wait();
         exec_time += std::chrono::system_clock::now() - start;
     }
 
     ccl::barrier(comm);
 
     size_t elem_idx = 0;
-    for (size_t rank_idx = 0; rank_idx < comm.size(); rank_idx++) {
+    for (int rank_idx = 0; rank_idx < comm.size(); rank_idx++) {
         int expected = rank_idx;
         for (size_t idx = 0; idx < recv_counts[rank_idx]; idx++) {
             if (recv_buf[elem_idx] != expected) {
@@ -77,7 +73,6 @@ void run_collective(const char* cmd_name,
 }
 
 int main() {
-
     ccl::init();
 
     int size, rank;
@@ -112,23 +107,20 @@ int main() {
     std::vector<size_t> send_counts(comm.size());
     std::vector<size_t> recv_counts(comm.size());
 
-    for (size_t idx = 0; idx < comm.size(); idx++) {
+    for (int idx = 0; idx < comm.size(); idx++) {
         int is_even_peer = (idx % 2 == 0) ? 1 : 0;
         send_counts[idx] = send_count;
         recv_counts[idx] = (is_even_peer) ? EVEN_RANK_SEND_COUNT : ODD_RANK_SEND_COUNT;
     }
 
-    MSG_LOOP(
-        comm,
-        attr.set<ccl::operation_attr_id::to_cache>(false);
-        run_collective(
-            "warmup alltoallv", send_buf, recv_buf, send_counts, recv_counts, comm, attr);
-        attr.set<ccl::operation_attr_id::to_cache>(true);
-        run_collective(
-            "persistent alltoallv", send_buf, recv_buf, send_counts, recv_counts, comm, attr);
-        attr.set<ccl::operation_attr_id::to_cache>(false);
-        run_collective(
-            "regular alltoallv", send_buf, recv_buf, send_counts, recv_counts, comm, attr););
+    MSG_LOOP(comm, attr.set<ccl::operation_attr_id::to_cache>(false); run_collective(
+                 "warmup alltoallv", send_buf, recv_buf, send_counts, recv_counts, comm, attr);
+             attr.set<ccl::operation_attr_id::to_cache>(true);
+             run_collective(
+                 "persistent alltoallv", send_buf, recv_buf, send_counts, recv_counts, comm, attr);
+             attr.set<ccl::operation_attr_id::to_cache>(false);
+             run_collective(
+                 "regular alltoallv", send_buf, recv_buf, send_counts, recv_counts, comm, attr););
 
     MPI_Finalize();
 
diff --git a/examples/cpu/broadcast.cpp b/examples/cpu/broadcast.cpp
index 22420e597..1bea2985d 100644
--- a/examples/cpu/broadcast.cpp
+++ b/examples/cpu/broadcast.cpp
@@ -31,11 +31,7 @@ void run_collective(const char* cmd_name,
 
     for (size_t idx = 0; idx < ITERS; ++idx) {
         auto start = std::chrono::system_clock::now();
-        ccl::broadcast(buf.data(),
-                       buf.size(),
-                       COLL_ROOT,
-                       comm,
-                       attr).wait();
+        ccl::broadcast(buf.data(), buf.size(), COLL_ROOT, comm, attr).wait();
         exec_time += std::chrono::system_clock::now() - start;
     }
 
@@ -61,7 +57,6 @@ void run_collective(const char* cmd_name,
 }
 
 int main() {
-
     ccl::init();
 
     int size, rank;
diff --git a/examples/cpu/communicator.cpp b/examples/cpu/communicator.cpp
index c12e1c31b..1fe2603db 100644
--- a/examples/cpu/communicator.cpp
+++ b/examples/cpu/communicator.cpp
@@ -61,86 +61,13 @@ void check_max_comm_number(const ccl::communicator& comm,
     } while (true);
 
     PRINT_BY_ROOT(comm, "created %zu communicators", user_comms);
-    // PRINT_BY_ROOT(comm, "try to create one more communicator, it should fail");
-
-    // try
-    // {
-    //     auto comm = ccl::environment::instance().create_communicator();
-    //     printf("FAILED\n");
-    //     throw std::runtime_error("extra communicator has been created");
-    // }
-    // catch(...)
-    // {}
-
-    // PRINT_BY_ROOT(comm, "free one comm, try to create again");
-    // size_t comm_idx = user_comms / 2;
-
-    // try
-    // {
-    //     communicators[comm_idx].reset();
-    // }
-    // catch (...)
-    // {
-    //     printf("FAILED\n");
-    //     throw std::runtime_error("can't free communicator");
-    // }
-
-    // try
-    // {
-    //     communicators[comm_idx] = ccl::environment::instance().create_communicator();
-    // }
-    // catch (...)
-    // {
-    //     printf("FAILED\n");
-    //     throw std::runtime_error("can't create communicator after free");
-    // }
 }
 
-// void check_comm_create_identical_color()
-// {
-//     size_t comm_size{};
-//     size_t comm_rank{};
-
-//     PRINT_BY_ROOT(global_comm,
-//         "create comm as a copy of the global one by settings identical colors");
-
-//     ccl::comm_attr_t comm_attr = ccl::environment::instance().create_host_comm_attr();
-//     comm_attr->set_value<ccl_host_color>(123);
-//     auto comm = ccl::environment::instance().create_communicator(comm_attr);
-
-//     comm_size = comm->size();
-//     comm_rank = comm->rank();
-
-//     if (comm_size != global_comm->size())
-//     {
-//         printf("FAILED\n");
-//         throw std::runtime_error("mismatch in size, expected " +
-//             to_string(global_comm->size()) +
-//             " received " + to_string(comm_size));
-//     }
-
-//     if (comm_rank != global_comm->rank())
-//     {
-//         printf("FAILED\n");
-//         throw std::runtime_error("mismatch in rank, expected " +
-//             to_string(global_comm->rank()) +
-//             " received " + to_string(comm_rank));
-//     }
-
-//     PRINT_BY_ROOT(global_comm,
-//         "global comm: rank = %zu, size = %zu; "
-//         "new comm: rank = %zu, size = %zu",
-//         global_comm->rank(), global_comm->size(),
-//         comm_rank, comm_size);
-
-//     check_allreduce_on_comm(comm);
-// }
-
 bool isPowerOfTwo(unsigned int x) {
     return x && !(x & (x - 1));
 }
 
-void check_comm_split_by_color(ccl::communicator& comm, int mpi_size, int mpi_rank) {
+void check_comm_split_by_color(ccl::communicator& comm) {
     if (!isPowerOfTwo(comm.size())) {
         PRINT_BY_ROOT(
             comm,
@@ -148,18 +75,18 @@ void check_comm_split_by_color(ccl::communicator& comm, int mpi_size, int mpi_ra
         return;
     }
 
-    for (size_t split_by = 2; split_by <= comm.size(); split_by *= 2) {
+    for (int split_by = 2; split_by <= comm.size(); split_by *= 2) {
         int color = comm.rank() % split_by;
-        auto attr =
-            ccl::create_comm_split_attr(ccl::attr_val<ccl::comm_split_attr_id::color>(color));
+        auto attr = ccl::preview::create_comm_split_attr(
+            ccl::attr_val<ccl::comm_split_attr_id::color>(color));
         auto new_comm = comm.split(attr);
 
-        size_t comm_size = comm.size();
-        size_t new_comm_size = new_comm.size();
-        size_t comm_rank = comm.rank();
-        size_t new_comm_rank = new_comm.rank();
+        int comm_size = comm.size();
+        int new_comm_size = new_comm.size();
+        int comm_rank = comm.rank();
+        int new_comm_rank = new_comm.rank();
 
-        size_t expected_new_comm_size = comm_size / split_by;
+        int expected_new_comm_size = comm_size / split_by;
 
         if (new_comm_size != expected_new_comm_size) {
             printf("FAILED (split)\n");
@@ -170,18 +97,97 @@ void check_comm_split_by_color(ccl::communicator& comm, int mpi_size, int mpi_ra
         }
 
         PRINT_BY_ROOT(comm,
-                      "base comm: rank = %zu, size = %zu; "
-                      "new comm: rank = %zu, size = %zu",
+                      "base comm: rank = %d, size = %d; "
+                      "new comm: rank = %d, size = %d",
                       comm_rank,
                       comm_size,
                       new_comm_rank,
                       new_comm_size);
 
+        PRINT_BY_ROOT(comm, " - allreduce test on a new communicator");
         check_allreduce_on_comm(new_comm);
     }
 }
 
+void check_comm_split_identical(ccl::communicator& comm) {
+    if (!isPowerOfTwo(comm.size())) {
+        PRINT_BY_ROOT(
+            comm,
+            "split comm by color: number of processes should be a power of 2 for test purpose");
+        return;
+    }
+    // For every power-of-two split factor, split twice with the same color and compare.
+    for (int split_by = 2; split_by <= comm.size(); split_by *= 2) {
+        int color = comm.rank() % split_by;
+        auto attr = ccl::preview::create_comm_split_attr(
+            ccl::attr_val<ccl::comm_split_attr_id::color>(color));
+        auto new_comm1 = comm.split(attr);
+        auto new_comm2 = comm.split(attr);
+        // Two splits made with the same attr are expected to agree in both size and rank.
+        if (new_comm1.size() != new_comm2.size()) {
+            printf("FAILED (split)\n");
+
+            throw std::runtime_error("the sizes of new communicators are not equal. Comm #1 size " +
+                                     std::to_string(new_comm1.size()) + " Comm #2 size " +
+                                     std::to_string(new_comm2.size()));
+        }
+
+        if (new_comm1.rank() != new_comm2.rank()) {
+            printf("FAILED (split)\n");
+
+            throw std::runtime_error("the ranks of new communicators are not equal. Comm #1 rank " +
+                                     std::to_string(new_comm1.rank()) + " Comm #2 rank " +
+                                     std::to_string(new_comm2.rank()));
+        }
+
+        PRINT_BY_ROOT(comm,
+                      "comm #1: rank = %d, size = %d; "
+                      "comm #2: rank = %d, size = %d",
+                      new_comm1.rank(),
+                      new_comm1.size(),
+                      new_comm2.rank(),
+                      new_comm2.size());
+    }
+}
+
+void check_comm_split_identical_color(ccl::communicator& comm) {
+    auto attr =
+        ccl::preview::create_comm_split_attr(ccl::attr_val<ccl::comm_split_attr_id::color>(123));
+    auto new_comm = comm.split(attr);
+    // The color is the same constant on every caller, so the split should mirror the base comm.
+    if (new_comm.size() != comm.size()) {
+        printf("FAILED (split)\n");
+
+        throw std::runtime_error(
+            "the sizes of new communicator and base communicator are not equal. New comm size " +
+            std::to_string(new_comm.size()) + " Base comm size " + std::to_string(comm.size()));
+    }
+
+    if (new_comm.rank() != comm.rank()) {
+        printf("FAILED (split)\n");
+
+        throw std::runtime_error(
+            "the ranks of new communicator and base communicator are not equal. New comm rank " +
+            std::to_string(new_comm.rank()) + " Base comm rank " + std::to_string(comm.rank()));
+    }
+
+    PRINT_BY_ROOT(comm,
+                  "base comm: rank = %d, size = %d; "
+                  "new comm: rank = %d, size = %d",
+                  comm.rank(),
+                  comm.size(),
+                  new_comm.rank(),
+                  new_comm.size());
+
+    PRINT_BY_ROOT(comm, " - allreduce test on a new communicator");
+    check_allreduce_on_comm(new_comm);
+}
+
 int main() {
+    /**
+     * The example only works with CCL_ATL_TRANSPORT=ofi
+     */
+    setenv("CCL_ATL_TRANSPORT", "ofi", 0);
 
     ccl::init();
 
@@ -213,10 +219,16 @@ int main() {
     // PRINT_BY_ROOT(comm, "PASSED");
 
     PRINT_BY_ROOT(comm, "\n- Communicator split test");
-    check_comm_split_by_color(comm, mpi_size, mpi_rank);
+    check_comm_split_by_color(comm);
+    PRINT_BY_ROOT(comm, "PASSED");
+
+    PRINT_BY_ROOT(comm, "\n- Communicator identical split test");
+    check_comm_split_identical(comm);
     PRINT_BY_ROOT(comm, "PASSED");
 
-    // check_comm_create_identical_color();
+    PRINT_BY_ROOT(comm, "\n- Communicator identical color split test");
+    check_comm_split_identical_color(comm);
+    PRINT_BY_ROOT(comm, "PASSED");
 
     MPI_Finalize();
 
diff --git a/examples/cpu/cpu_allgatherv_test.cpp b/examples/cpu/cpu_allgatherv_test.cpp
index 66afedcb9..5083655db 100644
--- a/examples/cpu/cpu_allgatherv_test.cpp
+++ b/examples/cpu/cpu_allgatherv_test.cpp
@@ -22,7 +22,6 @@
 using namespace std;
 
 int main() {
-
     const size_t count = 128;
 
     size_t i = 0;
@@ -64,11 +63,7 @@ int main() {
     }
 
     /* invoke allgatherv */
-    ccl::allgatherv(send_buf.data(),
-                    count,
-                    recv_buf.data(),
-                    recv_counts,
-                    comm).wait();
+    ccl::allgatherv(send_buf.data(), count, recv_buf.data(), recv_counts, comm).wait();
 
     /* check correctness of recv_buf */
     for (i = 0; i < count; i++) {
diff --git a/examples/cpu/cpu_allreduce_bf16_test.cpp b/examples/cpu/cpu_allreduce_bf16_test.cpp
index 38ce444cf..9ab1dc481 100644
--- a/examples/cpu/cpu_allreduce_bf16_test.cpp
+++ b/examples/cpu/cpu_allreduce_bf16_test.cpp
@@ -28,15 +28,15 @@
 #define CHECK_ERROR(send_buf, recv_buf, comm) \
     { \
         /* https://www.mcs.anl.gov/papers/P4093-0713_1.pdf */ \
-        size_t comm_size = comm.size(); \
+        int comm_size = comm.size(); \
         double log_base2 = log(comm_size) / log(2); \
         double g = (log_base2 * BF16_PRECISION) / (1 - (log_base2 * BF16_PRECISION)); \
         for (size_t i = 0; i < COUNT; i++) { \
-            double expected = ((comm_size * (comm_size - 1) / 2) + ((float)(i) * comm_size)); \
+            double expected = ((comm_size * (comm_size - 1) / 2) + ((float)(i)*comm_size)); \
             double max_error = g * expected; \
             if (fabs(max_error) < fabs(expected - recv_buf[i])) { \
                 printf( \
-                    "[%zu] got recv_buf[%zu] = %0.7f, but expected = %0.7f, max_error = %0.16f\n", \
+                    "[%d] got recv_buf[%zu] = %0.7f, but expected = %0.7f, max_error = %0.16f\n", \
                     comm.rank(), \
                     i, \
                     recv_buf[i], \
@@ -50,7 +50,6 @@
 using namespace std;
 
 int main() {
-
     const size_t count = 4096;
 
     size_t idx = 0;
@@ -93,12 +92,9 @@ int main() {
     else {
         cout << "BF16 is enabled\n";
         convert_fp32_to_bf16_arrays(send_buf, send_buf_bf16, count);
-        ccl::allreduce(send_buf_bf16,
-                       recv_buf_bf16,
-                       count,
-                       ccl::datatype::bfloat16,
-                       ccl::reduction::sum,
-                       comm).wait();
+        ccl::allreduce(
+            send_buf_bf16, recv_buf_bf16, count, ccl::datatype::bfloat16, ccl::reduction::sum, comm)
+            .wait();
         convert_bf16_to_fp32_arrays(recv_buf_bf16, recv_buf, count);
         CHECK_ERROR(send_buf, recv_buf, comm);
 
diff --git a/examples/cpu/cpu_allreduce_test.cpp b/examples/cpu/cpu_allreduce_test.cpp
index 67a623d3f..e80963812 100644
--- a/examples/cpu/cpu_allreduce_test.cpp
+++ b/examples/cpu/cpu_allreduce_test.cpp
@@ -21,7 +21,6 @@
 using namespace std;
 
 int main() {
-
     const size_t count = 4096;
 
     size_t i = 0;
@@ -64,11 +63,7 @@ int main() {
     }
 
     /* invoke allreduce */
-    ccl::allreduce(send_buf,
-                   recv_buf,
-                   count,
-                   ccl::reduction::sum,
-                   comm).wait();
+    ccl::allreduce(send_buf, recv_buf, count, ccl::reduction::sum, comm).wait();
 
     /* check correctness of recv_buf */
     for (i = 0; i < count; i++) {
diff --git a/examples/cpu/custom_allreduce.cpp b/examples/cpu/custom_allreduce.cpp
index 38495a1f1..5ccfc9829 100644
--- a/examples/cpu/custom_allreduce.cpp
+++ b/examples/cpu/custom_allreduce.cpp
@@ -24,7 +24,7 @@
 
 int size, rank;
 ccl::datatype custom_dtype;
-std::string global_match_id;
+ccl::string_class global_match_id;
 
 typedef void (*expected_fn_t)(void*, size_t);
 typedef void (*fill_fn_t)(void*, size_t, size_t);
@@ -49,28 +49,12 @@ typedef int (*check_fn_t)(void*, size_t, expected_fn_t);
     } while (0)
 
 /* primitive operations for custom datatype */
-void custom_2x(void* in_elem, void* out_elem) {
-    for (size_t idx = 0; idx < CUSTOM_REPEAT_COUNT; idx++) {
-        ((CUSTOM_BASE_DTYPE*)out_elem)[idx] = 2 * ((CUSTOM_BASE_DTYPE*)in_elem)[idx];
-    }
-}
-
 void custom_sum(void* in_elem, void* inout_elem) {
     for (size_t idx = 0; idx < CUSTOM_REPEAT_COUNT; idx++) {
         ((CUSTOM_BASE_DTYPE*)inout_elem)[idx] += ((CUSTOM_BASE_DTYPE*)in_elem)[idx];
     }
 }
 
-void custom_to_char(void* in_elem, char* out_elem) {
-    *out_elem = ((CUSTOM_BASE_DTYPE*)in_elem)[0];
-}
-
-void custom_from_char(char* in_elem, void* out_elem) {
-    for (size_t idx = 0; idx < CUSTOM_REPEAT_COUNT; idx++) {
-        ((CUSTOM_BASE_DTYPE*)out_elem)[idx] = (CUSTOM_BASE_DTYPE)(*in_elem);
-    }
-}
-
 void custom_set(void* elem, size_t base_value) {
     for (size_t idx = 0; idx < CUSTOM_REPEAT_COUNT; idx++) {
         ((CUSTOM_BASE_DTYPE*)elem)[idx] = (CUSTOM_BASE_DTYPE)(base_value);
@@ -200,140 +184,6 @@ void expected_custom_6(void* elem, size_t idx) {
     custom_set(elem, 2 * idx);
 }
 
-void do_prologue_2x(const void* in_buf,
-                    size_t in_count,
-                    ccl::datatype in_dtype,
-                    void** out_buf,
-                    size_t* out_count,
-                    ccl::datatype* out_dtype,
-                    const ccl::fn_context* context) {
-    ASSERT((in_dtype == ccl::datatype::float32) || (in_dtype == custom_dtype),
-           "unexpected in_dtype %d",
-           static_cast<int>(in_dtype));
-    ASSERT(out_buf, "null ptr");
-    ASSERT(context->offset == 0, "wrong offset for prologue func, should be 0");
-    ASSERT(!strcmp(context->match_id, global_match_id.c_str()), "wrong match_id");
-
-    if (out_buf)
-        *out_buf = (void*)in_buf;
-    if (out_count)
-        *out_count = in_count;
-    if (out_dtype)
-        *out_dtype = in_dtype;
-
-    for (size_t idx = 0; idx < in_count; idx++) {
-        if (in_dtype == ccl::datatype::float32) {
-            ((float*)(*out_buf))[idx] = ((float*)in_buf)[idx] * 2;
-        }
-        else if (in_dtype == custom_dtype) {
-            custom_2x((char*)in_buf + idx * CUSTOM_DTYPE_SIZE,
-                      (char*)(*out_buf) + idx * CUSTOM_DTYPE_SIZE);
-        }
-        else {
-            ASSERT(0, "unexpected dtype %d", static_cast<int>(in_dtype));
-        }
-    }
-}
-
-void do_epilogue_2x(const void* in_buf,
-                    size_t in_count,
-                    ccl::datatype in_dtype,
-                    void* out_buf,
-                    size_t* out_count,
-                    ccl::datatype* out_dtype,
-                    const ccl::fn_context* context) {
-    ASSERT((in_dtype == ccl::datatype::float32) || (in_dtype == custom_dtype),
-           "unexpected in_dtype %d",
-           static_cast<int>(in_dtype));
-    ASSERT(context->offset == 0, "wrong offset for epilogue func, should be 0");
-    ASSERT(!strcmp(context->match_id, global_match_id.c_str()), "wrong match_id");
-
-    if (out_count)
-        *out_count = in_count;
-
-    for (size_t idx = 0; idx < in_count; idx++) {
-        if (in_dtype == ccl::datatype::float32) {
-            ((float*)out_buf)[idx] = ((float*)in_buf)[idx] * 2;
-        }
-        else if (in_dtype == custom_dtype) {
-            custom_2x((char*)in_buf + idx * CUSTOM_DTYPE_SIZE,
-                      (char*)out_buf + idx * CUSTOM_DTYPE_SIZE);
-        }
-        else {
-            ASSERT(0, "unexpected dtype %d", static_cast<int>(in_dtype));
-        }
-    }
-}
-
-void do_prologue_dtype_to_char(const void* in_buf,
-                               size_t in_count,
-                               ccl::datatype in_dtype,
-                               void** out_buf,
-                               size_t* out_count,
-                               ccl::datatype* out_dtype,
-                               const ccl::fn_context* context) {
-    ASSERT((in_dtype == ccl::datatype::float32) || (in_dtype == custom_dtype),
-           "unexpected in_dtype %d",
-           static_cast<int>(in_dtype));
-    ASSERT(out_buf, "null ptr");
-    ASSERT(context->offset == 0, "wrong offset for prologue func, should be 0");
-    ASSERT(!strcmp(context->match_id, global_match_id.c_str()), "wrong match_id");
-
-    if (out_buf)
-        *out_buf = malloc(in_count); /* will be deallocated in do_epilogue_char_to_dtype */
-    if (out_count)
-        *out_count = in_count;
-    if (out_dtype)
-        *out_dtype = ccl::datatype::int8;
-
-    for (size_t idx = 0; idx < in_count; idx++) {
-        if (in_dtype == ccl::datatype::float32) {
-            float fval = ((float*)in_buf)[idx];
-            int ival = (int)fval;
-            ((char*)(*out_buf))[idx] = (char)(ival % 256);
-        }
-        else if (in_dtype == custom_dtype) {
-            custom_to_char((char*)in_buf + idx * CUSTOM_DTYPE_SIZE, (char*)(*out_buf) + idx);
-        }
-        else {
-            ASSERT(0, "unexpected dtype %d", static_cast<int>(in_dtype));
-        }
-    }
-}
-
-void do_epilogue_char_to_dtype(const void* in_buf,
-                               size_t in_count,
-                               ccl::datatype in_dtype,
-                               void* out_buf,
-                               size_t* out_count,
-                               ccl::datatype out_dtype,
-                               const ccl::fn_context* context) {
-    ASSERT(in_dtype == ccl::datatype::int8, "unexpected in_dtype %d", static_cast<int>(in_dtype));
-    ASSERT((out_dtype == ccl::datatype::float32) || (out_dtype == custom_dtype),
-           "unexpected out_dtype %d",
-           static_cast<int>(out_dtype));
-    ASSERT(context->offset == 0, "wrong offset for epilogue func, should be 0");
-    ASSERT(!strcmp(context->match_id, global_match_id.c_str()), "wrong match_id");
-
-    if (out_count)
-        *out_count = in_count;
-
-    for (size_t idx = 0; idx < in_count; idx++) {
-        if (out_dtype == ccl::datatype::float32) {
-            ((float*)out_buf)[idx] = (float)(((char*)in_buf)[idx]);
-        }
-        else if (out_dtype == custom_dtype) {
-            custom_from_char((char*)in_buf + idx, (char*)out_buf + idx * CUSTOM_DTYPE_SIZE);
-        }
-        else {
-            ASSERT(0, "unexpected dtype %d", static_cast<int>(out_dtype));
-        }
-    }
-
-    if (in_buf != out_buf)
-        free((void*)in_buf);
-}
-
 void do_reduction_sum(const void* in_buf,
                       size_t in_count,
                       void* inout_buf,
@@ -341,8 +191,7 @@ void do_reduction_sum(const void* in_buf,
                       ccl::datatype dtype,
                       const ccl::fn_context* context) {
     size_t dtype_size;
-    auto& env = ccl::environment::instance();
-    dtype_size = env.get_datatype_size(dtype);
+    dtype_size = ccl::get_datatype_size(dtype);
 
     ASSERT((dtype == ccl::datatype::int8) || (dtype == ccl::datatype::float32) ||
                (dtype == custom_dtype),
@@ -379,8 +228,7 @@ void do_reduction_null(const void* in_buf,
                        ccl::datatype dtype,
                        const ccl::fn_context* context) {
     size_t dtype_size;
-    auto& env = ccl::environment::instance();
-    dtype_size = env.get_datatype_size(dtype);
+    dtype_size = ccl::get_datatype_size(dtype);
 
     ASSERT((dtype == ccl::datatype::int8) || (dtype == ccl::datatype::float32) ||
                (dtype == custom_dtype),
@@ -416,8 +264,7 @@ void do_reduction_custom(const void* in_buf,
                          ccl::datatype dtype,
                          const ccl::fn_context* context) {
     size_t dtype_size;
-    auto& env = ccl::environment::instance();
-    dtype_size = env.get_datatype_size(dtype);
+    dtype_size = ccl::get_datatype_size(dtype);
 
     ASSERT((dtype == ccl::datatype::float32) || (dtype == custom_dtype),
            "unexpected in_dtype %d",
@@ -445,7 +292,6 @@ void do_reduction_custom(const void* in_buf,
 }
 
 int main() {
-
     setenv("CCL_ATL_TRANSPORT", "ofi", 1);
 
     ccl::init();
@@ -454,22 +300,20 @@ int main() {
     MPI_Comm_size(MPI_COMM_WORLD, &size);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
-    auto& env = ccl::environment::instance();
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
-        kvs = env.create_main_kvs();
+        kvs = ccl::create_main_kvs();
         main_addr = kvs->get_address();
         MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
     }
     else {
         MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
-        kvs = env.create_kvs(main_addr);
+        kvs = ccl::create_kvs(main_addr);
     }
 
-    auto comm = env.create_communicator(size, rank, kvs);
-    auto attr = env.create_operation_attr<ccl::allreduce_attr>();
+    auto comm = ccl::create_communicator(size, rank, kvs);
+    auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
 
     float float_send_buf[MSG_SIZE_COUNT];
     float float_recv_buf[MSG_SIZE_COUNT];
@@ -481,9 +325,9 @@ int main() {
     char custom_send_buf[MSG_SIZE_COUNT * CUSTOM_DTYPE_SIZE];
     char custom_recv_buf[MSG_SIZE_COUNT * CUSTOM_DTYPE_SIZE];
 
-    std::string base_match_id = attr.get<ccl::operation_attr_id::match_id>();
+    ccl::string_class base_match_id = attr.get<ccl::operation_attr_id::match_id>();
     attr.set<ccl::operation_attr_id::to_cache>(true);
-    std::string match_id;
+    ccl::string_class match_id;
 
     for (size_t idx = 0; idx < 2; idx++) {
         if (rank == 0)
@@ -508,55 +352,12 @@ int main() {
                 check_fn,
                 expected_fn,
                 "regular_allreduce");
-
-            /* prologue */
-            expected_fn = (idx == 0) ? expected_float_2 : expected_custom_2;
-            match_id = base_match_id + "_prologue_" + std::to_string(idx);
-            attr.set<ccl::operation_attr_id::match_id>(match_id);
-            attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn)do_prologue_2x);
-            RUN_COLLECTIVE(
-                ccl::allreduce(
-                    send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::sum, comm, attr),
-                fill_fn,
-                check_fn,
-                expected_fn,
-                "allreduce_with_prologue");
-
-            /* epilogue */
-            expected_fn = (idx == 0) ? expected_float_2 : expected_custom_2;
-            match_id = base_match_id + "_epilogue_" + std::to_string(idx);
-            attr.set<ccl::operation_attr_id::match_id>(match_id);
-            attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn) nullptr);
-            attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn)do_epilogue_2x);
-            RUN_COLLECTIVE(
-                ccl::allreduce(
-                    send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::sum, comm, attr),
-                fill_fn,
-                check_fn,
-                expected_fn,
-                "allreduce_with_epilogue");
-
-            /* prologue and epilogue */
-            expected_fn = (idx == 0) ? expected_float_4 : expected_custom_4;
-            match_id = base_match_id + "_prologue_and_epilogue_" + std::to_string(idx);
-            attr.set<ccl::operation_attr_id::match_id>(match_id);
-            attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn)do_prologue_2x);
-            attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn)do_epilogue_2x);
-            RUN_COLLECTIVE(
-                ccl::allreduce(
-                    send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::sum, comm, attr),
-                fill_fn,
-                check_fn,
-                expected_fn,
-                "allreduce_with_prologue_and_epilogue");
         }
 
         /* reduction_sum */
         expected_fn = (idx == 0) ? expected_float_1 : expected_custom_1;
         match_id = base_match_id + "_reduction_sum_" + std::to_string(idx);
         attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn) nullptr);
-        attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn) nullptr);
         attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_sum);
         RUN_COLLECTIVE(
             ccl::allreduce(
@@ -573,8 +374,6 @@ int main() {
             expected_fn = (idx == 0) ? expected_float_3 : expected_custom_3;
         match_id = base_match_id + "_reduction_null_" + std::to_string(idx);
         attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn) nullptr);
-        attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn) nullptr);
         attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_null);
         RUN_COLLECTIVE(
             ccl::allreduce(
@@ -591,8 +390,6 @@ int main() {
             expected_fn = (idx == 0) ? expected_float_5 : expected_custom_5;
         match_id = base_match_id + "_reduction_custom_" + std::to_string(idx);
         attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn) nullptr);
-        attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn) nullptr);
         attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_custom);
         RUN_COLLECTIVE(
             ccl::allreduce(
@@ -601,108 +398,6 @@ int main() {
             check_fn,
             expected_fn,
             "allreduce_with_reduction_custom");
-
-        /* prologue and reduction_sum */
-        expected_fn = (idx == 0) ? expected_float_2 : expected_custom_2;
-        match_id = base_match_id + "_prologue_and_reduction_sum_" + std::to_string(idx);
-        attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn)do_prologue_2x);
-        attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn) nullptr);
-        attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_sum);
-        RUN_COLLECTIVE(
-            ccl::allreduce(
-                send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::custom, comm, attr),
-            fill_fn,
-            check_fn,
-            expected_fn,
-            "allreduce_with_prologue_and_reduction_sum");
-
-        /* epilogue and reduction_sum */
-        expected_fn = (idx == 0) ? expected_float_2 : expected_custom_2;
-        match_id = base_match_id + "_epilogue_and_reduction_sum_" + std::to_string(idx);
-        attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn) nullptr);
-        attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn)do_epilogue_2x);
-        attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_sum);
-        RUN_COLLECTIVE(
-            ccl::allreduce(
-                send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::custom, comm, attr),
-            fill_fn,
-            check_fn,
-            expected_fn,
-            "allreduce_with_epilogue_and_reduction_sum");
-
-        /* prologue and epilogue and reduction_sum */
-        expected_fn = (idx == 0) ? expected_float_4 : expected_custom_4;
-        match_id =
-            base_match_id + "_prologue_and_epilogue_and_reduction_sum_" + std::to_string(idx);
-        attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn)do_prologue_2x);
-        attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn)do_epilogue_2x);
-        attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_sum);
-        RUN_COLLECTIVE(
-            ccl::allreduce(
-                send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::custom, comm, attr),
-            fill_fn,
-            check_fn,
-            expected_fn,
-            "allreduce_with_prologue_and_epilogue_and_reduction_sum");
-
-        /* prologue and epilogue and reduction_null */
-        if (size == 1)
-            expected_fn = (idx == 0) ? expected_float_4 : expected_custom_4;
-        else
-            expected_fn = (idx == 0) ? expected_float_3 : expected_custom_3;
-        match_id =
-            base_match_id + "_prologue_and_epilogue_and_reduction_null_" + std::to_string(idx);
-        attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn)do_prologue_2x);
-        attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn)do_epilogue_2x);
-        attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_null);
-        RUN_COLLECTIVE(
-            ccl::allreduce(
-                send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::custom, comm, attr),
-            fill_fn,
-            check_fn,
-            expected_fn,
-            "allreduce_with_prologue_and_epilogue_and_reduction_null");
-
-        /* prologue and epilogue and reduction_sum */
-        expected_fn = (idx == 0) ? expected_float_1 : expected_custom_1;
-        match_id =
-            base_match_id + "_prologue_and_epilogue_and_reduction_sum2_" + std::to_string(idx);
-        attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>(
-            (ccl::prologue_fn)do_prologue_dtype_to_char);
-        attr.set<ccl::operation_attr_id::epilogue_fn>(
-            (ccl::epilogue_fn)do_epilogue_char_to_dtype);
-        attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_sum);
-        RUN_COLLECTIVE(
-            ccl::allreduce(
-                send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::custom, comm, attr),
-            fill_fn,
-            check_fn,
-            expected_fn,
-            "allreduce_with_prologue_and_epilogue_and_reduction_sum2");
-
-        /* epilogue and reduction_custom */
-        if (size == 1)
-            expected_fn = (idx == 0) ? expected_float_1 : expected_custom_1;
-        else
-            expected_fn = (idx == 0) ? expected_float_6 : expected_custom_6;
-        match_id =
-            base_match_id + "_prologue_and_epilogue_and_reduction_custom_" + std::to_string(idx);
-        attr.set<ccl::operation_attr_id::match_id>(match_id);
-        attr.set<ccl::operation_attr_id::prologue_fn>((ccl::prologue_fn) nullptr);
-        attr.set<ccl::operation_attr_id::epilogue_fn>((ccl::epilogue_fn)do_epilogue_2x);
-        attr.set<ccl::allreduce_attr_id::reduction_fn>((ccl::reduction_fn)do_reduction_custom);
-        RUN_COLLECTIVE(
-            ccl::allreduce(
-                send_buf, recv_buf, MSG_SIZE_COUNT, dtype, ccl::reduction::custom, comm, attr),
-            fill_fn,
-            check_fn,
-            expected_fn,
-            "allreduce_with_epilogue_and_reduction_custom");
     }
 
     if (rank == 0)
diff --git a/examples/cpu/datatype.cpp b/examples/cpu/datatype.cpp
index 88863db66..40fdaf659 100644
--- a/examples/cpu/datatype.cpp
+++ b/examples/cpu/datatype.cpp
@@ -63,12 +63,12 @@ void check_allreduce(const ccl::communicator &comm) {
 
     for (size_t idx = 0; idx < max_dtype_count; idx++) {
         reqs[idx] = ccl::allreduce(send_bufs[idx].data(),
-                              recv_bufs[idx].data(),
-                              COUNT,
-                              dtypes[idx],
-                              ccl::reduction::custom,
-                              comm,
-                              attr);
+                                   recv_bufs[idx].data(),
+                                   COUNT,
+                                   dtypes[idx],
+                                   ccl::reduction::custom,
+                                   comm,
+                                   attr);
     }
 
     for (size_t idx = 0; idx < max_dtype_count; idx++) {
@@ -125,6 +125,10 @@ void check_create_and_free() {
 }
 
 int main() {
+    /**
+     * The example only works with CCL_ATL_TRANSPORT=ofi
+     */
+    setenv("CCL_ATL_TRANSPORT", "ofi", 0);
 
     ccl::init();
 
diff --git a/examples/cpu/external_kvs.cpp b/examples/cpu/external_kvs.cpp
new file mode 100644
index 000000000..f2c272a5c
--- /dev/null
+++ b/examples/cpu/external_kvs.cpp
@@ -0,0 +1,120 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include <cstdlib>
+
+#include "base.hpp"
+
+/**
+ * Example of a user-defined key-value store. It implements ccl::kvs_interface by
+ * forwarding every request to the ccl::kvs instance passed to the constructor.
+ */
+class external_kvs : public ccl::kvs_interface {
+public:
+    explicit external_kvs(ccl::shared_ptr_class<ccl::kvs> kvs) : kvs(kvs) {}
+
+    /* returns the data stored under the key in the wrapped KVS */
+    ccl::vector_class<char> get(const ccl::string_class& key) override {
+        return kvs->get(key);
+    }
+
+    /* stores the data under the key in the wrapped KVS */
+    void set(const ccl::string_class& key, const ccl::vector_class<char>& data) override {
+        kvs->set(key, data);
+    }
+
+private:
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+};
+
+/**
+ * Runs ITERS sum-allreduce operations from send_buf into recv_buf, checks the result
+ * and prints the average duration of one operation in microseconds.
+ *
+ * Each element of recv_buf is compared with (size - 1) * size / 2, which is the sum of
+ * all ranks, so every rank is expected to fill send_buf with its own rank (as main() does).
+ * On a mismatch the function prints the element and calls std::terminate().
+ */
+void run_collective(const char* cmd_name,
+                    const std::vector<float>& send_buf,
+                    std::vector<float>& recv_buf,
+                    const ccl::communicator& comm,
+                    const ccl::allreduce_attr& attr) {
+    std::chrono::system_clock::duration exec_time{ 0 };
+    float expected = (comm.size() - 1) * (static_cast<float>(comm.size()) / 2);
+
+    ccl::barrier(comm);
+
+    for (size_t idx = 0; idx < ITERS; ++idx) {
+        auto start = std::chrono::system_clock::now();
+        ccl::allreduce(
+            send_buf.data(), recv_buf.data(), recv_buf.size(), ccl::reduction::sum, comm, attr)
+            .wait();
+        exec_time += std::chrono::system_clock::now() - start;
+    }
+
+    for (size_t idx = 0; idx < recv_buf.size(); idx++) {
+        if (recv_buf[idx] != expected) {
+            fprintf(stderr, "idx %zu, expected %4.4f, got %4.4f\n", idx, expected, recv_buf[idx]);
+
+            std::cout << "FAILED" << std::endl;
+            std::terminate();
+        }
+    }
+
+    ccl::barrier(comm);
+
+    std::cout << "avg time of " << cmd_name << ": "
+              << std::chrono::duration_cast<std::chrono::microseconds>(exec_time).count() / ITERS
+              << ", us" << std::endl;
+}
+
+int main() {
+    ccl::init_attr init_attr = ccl::create_init_attr();
+    ccl::init(init_attr);
+
+    int size, rank;
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    /* finalize MPI at exit, i.e. after comm and kvs declared below are destroyed */
+    std::atexit(mpi_finalize);
+
+    /* rank 0 creates the main KVS and broadcasts its address to the other ranks */
+    ccl::shared_ptr_class<ccl::kvs> kvs;
+    ccl::kvs::address_type main_addr;
+    if (rank == 0) {
+        kvs = ccl::create_main_kvs();
+        main_addr = kvs->get_address();
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+    }
+    else {
+        MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+        kvs = ccl::create_kvs(main_addr);
+    }
+
+    /* CCL accesses the KVS through the user-defined wrapper */
+    auto ext_kvs = std::make_shared<external_kvs>(kvs);
+
+    auto comm = ccl::create_communicator(size, rank, ext_kvs);
+    auto attr = ccl::create_operation_attr<ccl::allreduce_attr>();
+
+    MSG_LOOP(comm, std::vector<float> send_buf(msg_count, static_cast<float>(comm.rank()));
+             std::vector<float> recv_buf(msg_count);
+             run_collective("regular allreduce", send_buf, recv_buf, comm, attr););
+
+    return 0;
+}
diff --git a/examples/cpu/priority_allreduce.cpp b/examples/cpu/priority_allreduce.cpp
index af4d56043..6299b22a8 100644
--- a/examples/cpu/priority_allreduce.cpp
+++ b/examples/cpu/priority_allreduce.cpp
@@ -84,8 +84,7 @@ double msg_timers_stddev[MSG_COUNT];
 
 size_t comp_delay_ms;
 
-void do_iter(size_t iter_idx,
-             ccl::communicator& comm) {
+void do_iter(size_t iter_idx, ccl::communicator& comm) {
     if (comm.rank() == 0) {
         printf("started iter %zu\n", iter_idx);
         fflush(stdout);
@@ -104,7 +103,7 @@ void do_iter(size_t iter_idx,
         for (idx = 0; idx < MSG_COUNT; idx++) {
             sprintf(match_id, "%zu", idx);
 
-            attr.set<ccl::operation_attr_id::match_id>(std::string(match_id));
+            attr.set<ccl::operation_attr_id::match_id>(ccl::string_class(match_id));
 
             tmp_start_timer = when();
             ccl::allreduce(msg_buffers[idx],
@@ -113,7 +112,8 @@ void do_iter(size_t iter_idx,
                            ccl::datatype::float32,
                            ccl::reduction::sum,
                            comm,
-                           attr).wait();
+                           attr)
+                .wait();
             tmp_stop_timer = when();
             msg_iso_timers[idx] += (tmp_stop_timer - tmp_start_timer);
         }
@@ -137,7 +137,7 @@ void do_iter(size_t iter_idx,
 
         sprintf(match_id, "%zu", idx);
 
-        attr.set<ccl::operation_attr_id::match_id>(std::string(match_id));
+        attr.set<ccl::operation_attr_id::match_id>(ccl::string_class(match_id));
 
         msg_starts[idx] = when();
         tmp_start_timer = when();
@@ -192,7 +192,6 @@ void do_iter(size_t iter_idx,
 }
 
 int main() {
-    
     setenv("CCL_PRIORITY", "direct", 0);
 
     ccl::init();
@@ -285,17 +284,10 @@ int main() {
     std::vector<double> recv_iter_timers(size);
     std::vector<size_t> recv_iter_timers_counts(size, 1);
 
-    ccl::allgatherv(msg_timers,
-                    MSG_COUNT,
-                    recv_msg_timers.data(),
-                    recv_msg_timers_counts,
-                    comm).wait();
-
-    ccl::allgatherv(&iter_timer,
-                    1,
-                    recv_iter_timers.data(),
-                    recv_iter_timers_counts,
-                    comm).wait();
+    ccl::allgatherv(msg_timers, MSG_COUNT, recv_msg_timers.data(), recv_msg_timers_counts, comm)
+        .wait();
+
+    ccl::allgatherv(&iter_timer, 1, recv_iter_timers.data(), recv_iter_timers_counts, comm).wait();
 
     if (rank == 0) {
         size_t rank_idx;
diff --git a/examples/cpu/reduce.cpp b/examples/cpu/reduce.cpp
index f49dad3dd..fca41d025 100644
--- a/examples/cpu/reduce.cpp
+++ b/examples/cpu/reduce.cpp
@@ -34,7 +34,8 @@ void run_collective(const char* cmd_name,
                     ccl::reduction::sum,
                     COLL_ROOT,
                     comm,
-                    attr).wait();
+                    attr)
+            .wait();
         exec_time += std::chrono::system_clock::now() - start;
     }
 
@@ -56,7 +57,6 @@ void run_collective(const char* cmd_name,
 }
 
 int main() {
-
     ccl::init();
 
     int size, rank;
diff --git a/examples/cpu/reduce_scatter.cpp b/examples/cpu/reduce_scatter.cpp
index 59d3c7c06..29accda5d 100644
--- a/examples/cpu/reduce_scatter.cpp
+++ b/examples/cpu/reduce_scatter.cpp
@@ -27,12 +27,9 @@ void run_collective(const char* cmd_name,
 
     for (size_t idx = 0; idx < ITERS; ++idx) {
         auto start = std::chrono::system_clock::now();
-        ccl::reduce_scatter(send_buf.data(),
-                            recv_buf.data(),
-                            recv_buf.size(),
-                            ccl::reduction::sum,
-                            comm,
-                            attr).wait();
+        ccl::reduce_scatter(
+            send_buf.data(), recv_buf.data(), recv_buf.size(), ccl::reduction::sum, comm, attr)
+            .wait();
         exec_time += std::chrono::system_clock::now() - start;
     }
 
@@ -53,7 +50,6 @@ void run_collective(const char* cmd_name,
 }
 
 int main() {
-
     ccl::init();
 
     int size, rank;
@@ -76,7 +72,8 @@ int main() {
     auto comm = ccl::create_communicator(size, rank, kvs);
     auto attr = ccl::create_operation_attr<ccl::reduce_scatter_attr>();
 
-    MSG_LOOP(comm, std::vector<float> send_buf(msg_count * comm.size(), static_cast<float>(comm.rank()));
+    MSG_LOOP(comm,
+             std::vector<float> send_buf(msg_count * comm.size(), static_cast<float>(comm.rank()));
              std::vector<float> recv_buf(msg_count);
              attr.set<ccl::operation_attr_id::to_cache>(false);
              run_collective("warmup reduce_scatter", send_buf, recv_buf, comm, attr);
diff --git a/examples/cpu/unordered_allreduce.cpp b/examples/cpu/unordered_allreduce.cpp
index aa6ce39b6..45c44a3ec 100644
--- a/examples/cpu/unordered_allreduce.cpp
+++ b/examples/cpu/unordered_allreduce.cpp
@@ -23,13 +23,17 @@
 #include "base.hpp"
 
 int main() {
+    /**
+     * The example only works with CCL_ATL_TRANSPORT=ofi
+     */
+    setenv("CCL_ATL_TRANSPORT", "ofi", 0);
 
     setenv("CCL_UNORDERED_COLL", "1", 1);
 
     const size_t buf_size = 1024;
     const size_t iter_count = 64;
 
-    std::vector<std::string> match_ids;
+    std::vector<ccl::string_class> match_ids;
 
     /* event, operation idx */
     std::list<std::pair<ccl::event, size_t>> active_ops;
@@ -76,36 +80,31 @@ int main() {
     }
 
     for (size_t iter = 0; iter < iter_count; ++iter) {
-
         std::cout << "starting iter " << iter << std::endl;
 
         size_t start_idx = distribution(rand_dev);
         size_t rank_idx = start_idx;
 
         for (auto idx = 0; idx < size; idx++) {
-
-            std::cout << "submit allreduce " << rank_idx
-                      << " for match_id " << match_ids[rank_idx] << std::endl;
+            std::cout << "submit allreduce " << rank_idx << " for match_id " << match_ids[rank_idx]
+                      << std::endl;
 
             attr.set<ccl::operation_attr_id::match_id>(match_ids[rank_idx]);
 
-            active_ops.emplace_back(
-                ccl::allreduce(send_bufs[rank_idx].data(),
-                               recv_bufs[rank_idx].data(),
-                               buf_size,
-                               ccl::reduction::sum,
-                               comm,
-                               attr),
-                rank_idx);
+            active_ops.emplace_back(ccl::allreduce(send_bufs[rank_idx].data(),
+                                                   recv_bufs[rank_idx].data(),
+                                                   buf_size,
+                                                   ccl::reduction::sum,
+                                                   comm,
+                                                   attr),
+                                    rank_idx);
 
             rank_idx = (rank_idx + 1) % size;
         }
 
         while (!active_ops.empty()) {
             for (auto it = active_ops.begin(); it != active_ops.end();) {
-
                 if (it->first.test()) {
-
                     float expected = (it->second + 1) * size;
                     printf(
                         "completed allreduce %zu for match_id %s. Actual %3.2f, expected %3.2f\n",
diff --git a/examples/include/base.hpp b/examples/include/base.hpp
index 0b3152778..fc4239f5b 100644
--- a/examples/include/base.hpp
+++ b/examples/include/base.hpp
@@ -67,7 +67,7 @@ using namespace cl::sycl::access;
                       START_MSG_SIZE_POWER, \
                       COLL_ROOT); \
         std::vector<size_t> msg_counts(MSG_SIZE_COUNT); \
-        std::vector<std::string> msg_match_ids(MSG_SIZE_COUNT); \
+        std::vector<ccl::string_class> msg_match_ids(MSG_SIZE_COUNT); \
         for (size_t idx = 0; idx < MSG_SIZE_COUNT; ++idx) { \
             msg_counts[idx] = 1u << (START_MSG_SIZE_POWER + idx); \
             msg_match_ids[idx] = std::to_string(msg_counts[idx]); \
@@ -114,4 +114,19 @@ double when(void) {
     return (double)(tv.tv_sec - tv_base.tv_sec) * 1.0e6 + (double)(tv.tv_usec - tv_base.tv_usec);
 }
 
+/**
+ * Finalizes MPI unless it has not been initialized or has already been finalized.
+ * MPI_Initialized() and MPI_Finalized() may be called at any time, so the helper is
+ * safe to call repeatedly or to register as an exit handler.
+ */
+inline void mpi_finalize() {
+    int is_initialized = 0;
+    int is_finalized = 0;
+    MPI_Initialized(&is_initialized);
+    MPI_Finalized(&is_finalized);
+
+    if (is_initialized && !is_finalized)
+        MPI_Finalize();
+}
+
 #endif /* BASE_HPP */
diff --git a/examples/include/base_utils.hpp b/examples/include/base_utils.hpp
index 539b76ae1..80330334d 100644
--- a/examples/include/base_utils.hpp
+++ b/examples/include/base_utils.hpp
@@ -179,6 +179,99 @@ void str_to_mset(const char* input, std::multiset<ccl::device_index_type>& outpu
         output.insert(ccl::from_string(processes_input));
     }
 }
+
+/**
+ * Creates the KVS of the calling process. Rank 0 creates the main KVS and broadcasts
+ * its address over MPI_COMM_WORLD; every other rank creates its KVS from that address.
+ */
+inline std::shared_ptr<ccl::kvs> build_kvs(int mpi_rank) {
+    std::shared_ptr<ccl::kvs> kvs_instance;
+    ccl::kvs::address_type main_addr;
+    if (mpi_rank == 0) {
+        kvs_instance = ccl::create_main_kvs();
+        main_addr = kvs_instance->get_address();
+    }
+
+    /* collective call: the root sends the address, the other ranks receive it */
+    MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD);
+
+    if (mpi_rank != 0) {
+        kvs_instance = ccl::create_kvs(main_addr);
+    }
+    return kvs_instance;
+}
+
+/**
+ * Returns total_device_in_cluster as the rank id offset. Only one or two processes are
+ * supported: std::runtime_error is thrown when mpi_size is greater than 2.
+ */
+inline size_t take_mpi_rank_id_offest(const size_t mpi_rank_in_cluster,
+                                      const int mpi_size,
+                                      const size_t total_device_in_cluster) {
+    if (mpi_size > 2) {
+        throw std::runtime_error(std::string(__FUNCTION__) +
+                                 " - Only TWO processes support case !\n");
+    }
+    return total_device_in_cluster;
+}
+
+/**
+ * Parses the GPU affinity strings of the threads of process mpi_rank_in_cluster.
+ *
+ * thread_gpu_affinity[i] is a comma-separated list of device indices for thread i.
+ * The number of parsed indices is added to total_device_in_cluster and to
+ * total_devices_in_process[mpi_rank_in_cluster]. When mpi_rank_in_cluster equals
+ * current_mpi_rank, the devices are also created from the indices and appended to
+ * devices_for_current_mpi_rank[i].
+ *
+ * Returns the parsed indices per thread index. Throws std::out_of_range when
+ * total_devices_in_process has no element for mpi_rank_in_cluster.
+ */
+inline ccl::process_device_indices_type extract_indices_for_threads(
+    const size_t mpi_rank_in_cluster,
+    const int current_mpi_rank,
+    std::vector<std::string> thread_gpu_affinity,
+    size_t& total_device_in_cluster,
+    std::vector<size_t>& total_devices_in_process,
+    std::map<size_t, std::vector<ccl::communicator::device_type>>& devices_for_current_mpi_rank) {
+    ccl::process_device_indices_type thread_group_affinity;
+
+    for (size_t thread_index = 0; thread_index < thread_gpu_affinity.size(); thread_index++) {
+        ccl::device_indices_type device_group_affinity;
+        str_to_mset<ccl::device_index_type>(
+            thread_gpu_affinity[thread_index].c_str(), device_group_affinity, ',');
+
+        std::cout << " Extracted GPU indices for thread by id: " << thread_index
+                  << ", devices in threads count: " << device_group_affinity.size() << std::endl;
+        total_device_in_cluster += device_group_affinity.size();
+        total_devices_in_process.at(mpi_rank_in_cluster) += device_group_affinity.size();
+        thread_group_affinity[thread_index] = device_group_affinity;
+
+        if (mpi_rank_in_cluster == static_cast<size_t>(current_mpi_rank)) {
+            for (const auto& device_vendor_id : device_group_affinity) {
+                devices_for_current_mpi_rank[thread_index].push_back(
+                    ccl::create_from_index(device_vendor_id).device);
+            }
+        }
+    }
+    return thread_group_affinity;
+}
+
+/**
+ * Concatenates the device lists of all threads into one list, following the order of
+ * the map keys. Devices that occur in several lists are kept as duplicates.
+ */
+inline std::vector<ccl::communicator::device_type> set_union_devices_in_current_process(
+    const std::map<size_t, std::vector<ccl::communicator::device_type>>& devices_for_mpi_rank) {
+    std::vector<ccl::communicator::device_type> devices_in_process;
+    for (const auto& thread_devices : devices_for_mpi_rank) {
+        devices_in_process.insert(
+            devices_in_process.end(), thread_devices.second.begin(), thread_devices.second.end());
+    }
+    return devices_in_process;
+}
+
 #endif //MULTI_GPU_SUPPORT
 } // namespace utils
 #endif /* BASE_UTILS_HPP */
diff --git a/examples/include/sycl_base.hpp b/examples/include/sycl_base.hpp
index 027c60291..bad23be3a 100644
--- a/examples/include/sycl_base.hpp
+++ b/examples/include/sycl_base.hpp
@@ -18,13 +18,12 @@
 
 #include <CL/sycl.hpp>
 #include <iostream>
-#include <string>
-#include <iostream>
 #include <map>
 #include <mpi.h>
 #include <set>
 #include <string>
 
+#include "base.hpp"
 #include "base_utils.hpp"
 
 #include "oneapi/ccl.hpp"
@@ -55,7 +54,6 @@ inline bool has_accelerator() {
 }
 
 inline bool check_sycl_usm(queue& q, usm::alloc alloc_type) {
-
     bool ret = true;
 
     device d = q.get_device();
@@ -73,67 +71,310 @@ inline bool check_sycl_usm(queue& q, usm::alloc alloc_type) {
     return ret;
 }
 
-inline bool create_sycl_queue(int argc,
-                              char* argv[],
-                              queue& q) {
+/**
+ * Returns the name of the SYCL platform to take GPU devices from.
+ *
+ * The backend is chosen by the SYCL_BE environment variable: PI_LEVEL_ZERO selects
+ * "Level-Zero", PI_OPENCL or an unset variable selects "OpenCL". The result is the name of
+ * the last listed platform that has a GPU device and whose name contains the backend.
+ * Throws std::runtime_error for any other SYCL_BE value or if no such platform exists.
+ */
+inline std::string get_preferred_gpu_platform_name() {
+    std::string backend;
+    std::string result;
 
-    auto exception_handler = [&](exception_list elist) {
-        for (exception_ptr const& e : elist) {
-            try {
-                rethrow_exception(e);
+    /* read the variable once instead of calling getenv() for every comparison */
+    const char* sycl_be = getenv("SYCL_BE");
+
+    if (sycl_be == nullptr) {
+        backend = "OpenCL";
+    }
+    else if (std::strcmp(sycl_be, "PI_LEVEL_ZERO") == 0) {
+        backend = "Level-Zero";
+    }
+    else if (std::strcmp(sycl_be, "PI_OPENCL") == 0) {
+        backend = "OpenCL";
+    }
+    else {
+        throw std::runtime_error("invalid backend: " + std::string(sycl_be));
+    }
+
+    auto platform_list = sycl::platform::get_platforms();
+
+    for (const auto& platform : platform_list) {
+        auto platform_name = platform.get_info<sycl::info::platform::name>();
+
+        auto devices = platform.get_devices();
+        auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
+            return d.is_gpu();
+        });
+
+        if (gpu_dev == devices.end()) {
+            // cout << "platform [" << platform_name
+            //      << "] does not contain GPU devices, skipping\n";
+            continue;
+        }
+
+        if (platform_name.find(backend) == std::string::npos) {
+            // cout << "platform [" << platform_name
+            //      << "] does not match with requested "
+            //      << backend << ", skipping\n";
+            continue;
+        }
+
+        result = platform_name;
+    }
+
+    if (result.empty())
+        throw std::runtime_error("can not find preferred GPU platform");
+
+    return result;
+}
+
+/**
+ * Returns the GPU devices of the platform named by get_preferred_gpu_platform_name().
+ * A device that supports partitioning by the next_partitionable affinity domain and
+ * yields sub-devices is replaced with them; otherwise the root device is used.
+ * The selection is logged to stdout. Throws std::runtime_error if no GPU device is found.
+ */
+inline std::vector<sycl::device> create_sycl_gpu_devices() {
+    constexpr char dev_prefix[] = "-- ";
+    constexpr char sub_dev_prefix[] = "---- ";
+
+    std::vector<sycl::device> result;
+    auto platform_list = sycl::platform::get_platforms();
+    auto preferred_platform_name = get_preferred_gpu_platform_name();
+
+    cout << "preferred platform: [" << preferred_platform_name << "]\n";
+
+    for (const auto& platform : platform_list) {
+        auto platform_name = platform.get_info<sycl::info::platform::name>();
+
+        if (platform_name.compare(preferred_platform_name) != 0)
+            continue;
+
+        cout << "platform: [" << platform_name << "]\n";
+
+        auto device_list = platform.get_devices();
+
+        for (const auto& device : device_list) {
+            auto device_name = device.get_info<cl::sycl::info::device::name>();
+
+            if (!device.is_gpu()) {
+                cout << dev_prefix << "device [" << device_name << "] is not GPU, skipping\n";
+                continue;
             }
-            catch (std::exception const& e) {
-                cout << "failure\n";
+
+            auto part_props = device.get_info<info::device::partition_properties>();
+
+            if (std::find(part_props.begin(),
+                          part_props.end(),
+                          info::partition_property::partition_by_affinity_domain) ==
+                part_props.end()) {
+                cout << dev_prefix << "device [" << device_name
+                     << "] does not support partition by affinity domain"
+                     << ", use root device\n";
+                result.push_back(device);
+                continue;
+            }
+
+            auto part_affinity_domains =
+                device.get_info<info::device::partition_affinity_domains>();
+
+            if (std::find(part_affinity_domains.begin(),
+                          part_affinity_domains.end(),
+                          info::partition_affinity_domain::next_partitionable) ==
+                part_affinity_domains.end()) {
+                cout << dev_prefix << "device [" << device_name
+                     << "] does not support next_partitionable affinity domain"
+                     << ", use root device\n";
+                result.push_back(device);
+                continue;
+            }
+
+            cout << dev_prefix << "device [" << device_name << "] should provide "
+                 << device.template get_info<info::device::partition_max_sub_devices>()
+                 << " sub-devices\n";
+
+            auto sub_devices =
+                device.create_sub_devices<info::partition_property::partition_by_affinity_domain>(
+                    info::partition_affinity_domain::next_partitionable);
+
+            if (sub_devices.empty()) {
+                /* TODO: remove when SYCL/L0 sub-devices will be supported */
+                cout << dev_prefix << "device [" << device_name << "] does not provide sub-devices"
+                     << ", use root device\n";
+                result.push_back(device);
+                continue;
+            }
+
+            cout << dev_prefix << "device [" << device_name << "] provides " << sub_devices.size()
+                 << " sub-devices\n";
+            result.insert(result.end(), sub_devices.begin(), sub_devices.end());
+
+            for (size_t idx = 0; idx < sub_devices.size(); idx++) {
+                cout << sub_dev_prefix << "sub-device " << idx << ": ["
+                     << sub_devices[idx].get_info<cl::sycl::info::device::name>() << "]\n";
             }
         }
-    };
+    }
 
-    unique_ptr<device_selector> selector;
-    if (argc >= 2) {
-        if (strcmp(argv[1], "cpu") == 0) {
-            selector.reset(new cpu_selector());
+    if (result.empty()) {
+        throw std::runtime_error("no GPU devices found");
+    }
+
+    cout << "found: " << result.size() << " GPU device(s)\n";
+
+    return result;
+}
+
+/**
+ * Creates SYCL queues for the given ranks on devices of the requested type.
+ *
+ * device_type is one of: cpu | gpu | host | default. For "gpu" with a GPU present all
+ * devices from create_sycl_gpu_devices() are candidates, otherwise the single device
+ * picked by the corresponding selector is used. Rank r is mapped to candidate
+ * r % <candidate count>. One context is created from the devices of all ranks (or from a
+ * single device if that fails with sycl::runtime_error); one queue is returned per device.
+ *
+ * Throws std::runtime_error if the device type is unknown or no device is available.
+ */
+inline std::vector<sycl::queue> create_sycl_queues(const std::string& device_type,
+                                                   const std::vector<int>& ranks) {
+    std::vector<sycl::device> devices;
+
+    try {
+        if ((device_type.compare("gpu") == 0) && has_gpu()) {
+            /* special handling to cover multi-tile case */
+            devices = create_sycl_gpu_devices();
         }
-        else if (strcmp(argv[1], "gpu") == 0) {
-            if (has_gpu()) {
-                selector.reset(new gpu_selector());
+        else {
+            unique_ptr<device_selector> selector;
+
+            if (device_type.compare("cpu") == 0) {
+                selector.reset(new cpu_selector());
             }
-            else if (has_accelerator()) {
+            else if (device_type.compare("gpu") == 0) {
+                if (has_accelerator()) {
+                    selector.reset(new host_selector());
+                    cout
+                        << "Accelerator is the first in device list, but unavailable for multiprocessing "
+                        << "host_selector has been created instead of default_selector.\n";
+                }
+                else {
+                    selector.reset(new default_selector());
+                    cout
+                        << "GPU is unavailable, default_selector has been created instead of gpu_selector.\n";
+                }
+            }
+            else if (device_type.compare("host") == 0) {
                 selector.reset(new host_selector());
-                cout
-                    << "Accelerator is the first in device list, but unavailable for multiprocessing, host_selector has been created instead of default_selector.\n";
+            }
+            else if (device_type.compare("default") == 0) {
+                if (!has_accelerator()) {
+                    selector.reset(new default_selector());
+                }
+                else {
+                    selector.reset(new host_selector());
+                    cout
+                        << "Accelerator is the first in device list, but unavailable for multiprocessing "
+                        << " host_selector has been created instead of default_selector.\n";
+                }
             }
             else {
-                selector.reset(new default_selector());
-                cout
-                    << "GPU is unavailable, default_selector has been created instead of gpu_selector.\n";
+                throw std::runtime_error("Please provide device type: cpu | gpu | host | default");
             }
+            devices.push_back(sycl::device(*selector));
         }
-        else if (strcmp(argv[1], "host") == 0) {
-            selector.reset(new host_selector());
-        }
-        else if (strcmp(argv[1], "default") == 0) {
-            if (!has_accelerator()) {
-                selector.reset(new default_selector());
+    }
+    catch (const std::exception& e) {
+        /* keep the original reason, e.g. the usage hint for an unknown device type */
+        throw std::runtime_error(std::string("No devices of requested type available: ") +
+                                 e.what());
+    }
+    catch (...) {
+        throw std::runtime_error("No devices of requested type available");
+    }
+
+    if (devices.empty()) {
+        throw std::runtime_error("No devices of requested type available");
+    }
+
+    std::vector<sycl::device> rank_devices;
+
+    for (size_t idx = 0; idx < ranks.size(); idx++) {
+        rank_devices.push_back(devices[ranks[idx] % devices.size()]);
+    }
+
+    if (rank_devices.empty()) {
+        throw std::runtime_error("No devices of requested type available for specified ranks");
+    }
+
+    sycl::context ctx;
+
+    try {
+        ctx = sycl::context(rank_devices);
+    }
+    catch (sycl::runtime_error&) {
+        size_t preferred_idx = (ranks.back() / ranks.size()) % devices.size();
+        cout << "Can not create context from all rank devices of type: " << device_type
+             << ", create context from single device, idx " << preferred_idx << "\n";
+        ctx = sycl::context(devices[preferred_idx]);
+    }
+
+    /* captures nothing: the handler is handed to the queues, which outlive this call */
+    auto exception_handler = [](exception_list elist) {
+        for (exception_ptr const& e : elist) {
+            try {
+                rethrow_exception(e);
             }
-            else {
-                selector.reset(new host_selector());
-                cout
-                    << "Accelerator is the first in device list, but unavailable for multiprocessing, host_selector has been created instead of default_selector.\n";
+            catch (std::exception const& e) {
+                cout << "failure\n";
             }
         }
-        else {
-            cerr << "Please provide device type: cpu | gpu | host | default\n";
+    };
+
+    auto ctx_devices = ctx.get_devices();
+
+    if (ctx_devices.empty()) {
+        throw std::runtime_error("No devices of requested type available in context");
+    }
+
+    std::vector<sycl::queue> queues;
+
+    cout << "Created context from devices of type: " << device_type << "\n";
+    cout << "Devices [" << ctx_devices.size() << "]:\n";
+
+    for (size_t idx = 0; idx < ctx_devices.size(); idx++) {
+        cout << "[" << idx << "]: [" << ctx_devices[idx].get_info<info::device::name>() << "]\n";
+        queues.push_back(sycl::queue(ctx_devices[idx], exception_handler));
+    }
+
+    return queues;
+}
+
+/**
+ * Creates the queue for the given rank on the device type passed in argv[1]
+ * (cpu | gpu | host | default) and stores it in q.
+ * Returns false and prints the reason to stderr if the type is missing or no queue
+ * can be created.
+ */
+inline bool create_sycl_queue(int argc, char* argv[], int rank, queue& q) {
+    if (argc >= 2) {
+        try {
+            std::vector<int> ranks = { rank };
+            q = create_sycl_queues(argv[1], ranks)[0];
+            return true;
+        }
+        catch (const std::exception& e) {
+            cerr << e.what() << "\n";
             return false;
         }
-        q = queue(*selector, exception_handler);
-        cout << "Requested device type: " << argv[1] << "\nRunning on "
-                  << q.get_device().get_info<info::device::name>() << "\n";
     }
     else {
         cerr << "Please provide device type: cpu | gpu | host | default\n";
         return false;
     }
-    return true;
 }
 
 bool handle_exception(queue& q) {
@@ -166,18 +369,27 @@ usm::alloc usm_alloc_type_from_string(const string& str) {
     return it->second;
 }
 
-template <typename  T>
-struct buf_allocator {
+std::pair<usm::alloc, std::string> take_usm_type(const int argc, char* str_type) {
+    /* Returns {alloc kind, its name}; stays "shared" unless a non-null str_type is given. */
+    auto usm_alloc_type = usm::alloc::shared;
+    std::string str_usm_alloc_type = "shared";
+    if (argc > 1 && str_type != nullptr) { /* argv[argc] is null, so never build a string from it */
+        str_usm_alloc_type = str_type;
+        usm_alloc_type = usm_alloc_type_from_string(str_usm_alloc_type);
+    }
+
+    return std::make_pair(usm_alloc_type, str_usm_alloc_type);
+}
 
+template <typename T>
+struct buf_allocator {
     const size_t alignment = 64;
 
-    buf_allocator(queue& q)
-        : q(q)
-    {}
+    buf_allocator(queue& q) : q(q) {}
 
     ~buf_allocator() {
         for (auto& ptr : memory_storage) {
-            cl::sycl::free(ptr, q);
+            sycl::free(ptr, q);
         }
     }
 
@@ -186,7 +398,7 @@ struct buf_allocator {
         if (alloc_type == usm::alloc::host)
             ptr = aligned_alloc_host<T>(alignment, count, q);
         else if (alloc_type == usm::alloc::device)
-            ptr =  aligned_alloc_device<T>(alignment, count, q);
+            ptr = aligned_alloc_device<T>(alignment, count, q);
         else if (alloc_type == usm::alloc::shared)
             ptr = aligned_alloc_shared<T>(alignment, count, q);
         else
@@ -195,10 +407,16 @@ struct buf_allocator {
         auto it = memory_storage.find(ptr);
         if (it != memory_storage.end()) {
             throw std::runtime_error(string(__PRETTY_FUNCTION__) +
-                                        " - allocator already owns this pointer");
+                                     " - allocator already owns this pointer");
         }
         memory_storage.insert(ptr);
 
+        auto pointer_type = sycl::get_pointer_type(ptr, q.get_context());
+        if (pointer_type != alloc_type)
+            throw std::runtime_error(
+                string(__PRETTY_FUNCTION__) + "pointer_type " + std::to_string((int)pointer_type) +
+                " doesn't match with requested " + std::to_string((int)alloc_type));
+
         return ptr;
     }
 
@@ -206,7 +424,7 @@ struct buf_allocator {
         auto it = memory_storage.find(ptr);
         if (it == memory_storage.end()) {
             throw std::runtime_error(string(__PRETTY_FUNCTION__) +
-                                        " - allocator doesn't own this pointer");
+                                     " - allocator doesn't own this pointer");
         }
         free(ptr, q);
         memory_storage.erase(it);
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index f2653b28a..412234d68 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -28,5 +28,5 @@ foreach(src ${sources})
     target_link_libraries(${executable} PRIVATE m)
     target_link_libraries(${executable} PUBLIC mpi)
     target_link_libraries(${executable} PRIVATE ${COMPUTE_RUNTIME_TARGET_NAME})
-    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/sycl)
+    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/sycl OPTIONAL)
 endforeach()
diff --git a/examples/sycl/sycl_allgatherv_custom_usm_test.cpp b/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
index af707eb0a..9380a064f 100644
--- a/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
+++ b/examples/sycl/sycl_allgatherv_custom_usm_test.cpp
@@ -24,7 +24,6 @@ struct custom_data_type {
 } __attribute__((packed));
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -33,8 +32,14 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
@@ -50,10 +55,6 @@ int main(int argc, char *argv[]) {
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -89,17 +90,17 @@ int main(int argc, char *argv[]) {
     auto e = q.submit([&](auto &h) {
         accessor expected_buf_acc(expected_buf, h, write_only);
         h.parallel_for(send_count, [=](auto id) {
-                static_cast<native_dtype *>(send_buf)[id] = rank + 1;
-                for (int i = 0; i < size; i++) {
-                    static_cast<native_dtype *>(recv_buf)[id] = -1;
-                    expected_buf_acc[i * send_count + id] = i + 1;
-                }
-            });
+            static_cast<native_dtype *>(send_buf)[id] = rank + 1;
+            for (int i = 0; i < size; i++) {
+                static_cast<native_dtype *>(recv_buf)[id] = -1;
+                expected_buf_acc[i * send_count + id] = i + 1;
+            }
+        });
     });
 
     /* create dependency vector */
     vector<ccl::event> events;
-    events.push_back(ccl::create_event(e));
+    // events.push_back(ccl::create_event(e));
 
     if (!handle_exception(q))
         return -1;
@@ -122,10 +123,10 @@ int main(int argc, char *argv[]) {
         accessor expected_buf_acc(expected_buf, h, read_only);
         accessor check_buf_acc(check_buf, h, write_only);
         h.parallel_for(size * send_count, [=](auto id) {
-                if (static_cast<native_dtype *>(recv_buf)[id] != expected_buf_acc[id]) {
-                    check_buf_acc[id] = -1;
-                }
-            });
+            if (static_cast<native_dtype *>(recv_buf)[id] != expected_buf_acc[id]) {
+                check_buf_acc[id] = -1;
+            }
+        });
     });
 
     if (!handle_exception(q))
@@ -145,7 +146,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_allgatherv_inplace_test.cpp b/examples/sycl/sycl_allgatherv_inplace_test.cpp
index 18afb02b9..6fd7d7bc2 100644
--- a/examples/sycl/sycl_allgatherv_inplace_test.cpp
+++ b/examples/sycl/sycl_allgatherv_inplace_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -31,16 +30,18 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -103,8 +104,8 @@ int main(int argc, char *argv[]) {
         accessor send_buf_acc(send_buf, h, read_only);
         accessor recv_buf_acc(recv_buf, h, write_only);
         h.parallel_for(send_buf_count, [=](auto id) {
-                recv_buf_acc[rbuf_idx + id] = send_buf_acc[id] + 1;
-            });
+            recv_buf_acc[rbuf_idx + id] = send_buf_acc[id] + 1;
+        });
     });
 
     if (!handle_exception(q))
@@ -119,10 +120,10 @@ int main(int argc, char *argv[]) {
         accessor recv_buf_acc(recv_buf, h, write_only);
         accessor expected_buf_acc(expected_buf, h, read_only);
         h.parallel_for(recv_buf_count, [=](auto id) {
-                if (recv_buf_acc[id] != expected_buf_acc[id]) {
-                    recv_buf_acc[id] = -1;
-                }
-            });
+            if (recv_buf_acc[id] != expected_buf_acc[id]) {
+                recv_buf_acc[id] = -1;
+            }
+        });
     });
 
     if (!handle_exception(q))
@@ -142,7 +143,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_allgatherv_test.cpp b/examples/sycl/sycl_allgatherv_test.cpp
index 7854a3a9d..7ac3a48b0 100644
--- a/examples/sycl/sycl_allgatherv_test.cpp
+++ b/examples/sycl/sycl_allgatherv_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -29,16 +28,18 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -109,10 +110,10 @@ int main(int argc, char *argv[]) {
         accessor recv_buf_acc(recv_buf, h, write_only);
         accessor expected_buf_acc(expected_buf, h, read_only);
         h.parallel_for(size * count, [=](auto id) {
-                if (recv_buf_acc[id] != expected_buf_acc[id]) {
-                    recv_buf_acc[id] = -1;
-                }
-            });
+            if (recv_buf_acc[id] != expected_buf_acc[id]) {
+                recv_buf_acc[id] = -1;
+            }
+        });
     });
 
     if (!handle_exception(q))
@@ -132,7 +133,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_allgatherv_usm_test.cpp b/examples/sycl/sycl_allgatherv_usm_test.cpp
index 0160deab6..a6013485a 100644
--- a/examples/sycl/sycl_allgatherv_usm_test.cpp
+++ b/examples/sycl/sycl_allgatherv_usm_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -28,8 +27,14 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
@@ -45,10 +50,6 @@ int main(int argc, char *argv[]) {
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -91,7 +92,7 @@ int main(int argc, char *argv[]) {
 
     /* create dependency vector */
     vector<ccl::event> events;
-    events.push_back(ccl::create_event(e));
+    // events.push_back(ccl::create_event(e));
 
     if (!handle_exception(q))
         return -1;
@@ -105,10 +106,10 @@ int main(int argc, char *argv[]) {
         accessor expected_buf_acc(expected_buf, h, read_only);
         accessor check_buf_acc(check_buf, h, write_only);
         h.parallel_for(size * count, [=](auto id) {
-                if (recv_buf[id] != expected_buf_acc[id]) {
-                    check_buf_acc[id] = -1;
-                }
-            });
+            if (recv_buf[id] != expected_buf_acc[id]) {
+                check_buf_acc[id] = -1;
+            }
+        });
     });
 
     if (!handle_exception(q))
@@ -128,7 +129,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
index 086eefc20..4c0605ba2 100644
--- a/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
+++ b/examples/sycl/sycl_allreduce_inplace_usm_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -28,8 +27,14 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
@@ -45,10 +50,6 @@ int main(int argc, char *argv[]) {
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -113,7 +114,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_allreduce_test.cpp b/examples/sycl/sycl_allreduce_test.cpp
index c42ab5043..6200b3c33 100644
--- a/examples/sycl/sycl_allreduce_test.cpp
+++ b/examples/sycl/sycl_allreduce_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -28,16 +27,18 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -113,7 +114,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_allreduce_usm_test.cpp b/examples/sycl/sycl_allreduce_usm_test.cpp
index fe87a8ff1..e2fceb44a 100644
--- a/examples/sycl/sycl_allreduce_usm_test.cpp
+++ b/examples/sycl/sycl_allreduce_usm_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -28,8 +27,14 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
@@ -45,10 +50,6 @@ int main(int argc, char *argv[]) {
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -115,7 +116,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_alltoall_test.cpp b/examples/sycl/sycl_alltoall_test.cpp
index 3e8aede51..23e20629a 100644
--- a/examples/sycl/sycl_alltoall_test.cpp
+++ b/examples/sycl/sycl_alltoall_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -29,16 +28,18 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -79,10 +80,9 @@ int main(int argc, char *argv[]) {
     /* open send_buf and modify it on the device side */
     q.submit([&](auto &h) {
         accessor send_buf_acc(send_buf, h, write_only);
-        h.parallel_for(count * size,
-                                                          [=](auto id) {
-                                                              send_buf_acc[id] += 1;
-                                                          });
+        h.parallel_for(count * size, [=](auto id) {
+            send_buf_acc[id] += 1;
+        });
     });
 
     if (!handle_exception(q))
@@ -118,7 +118,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_alltoall_usm_test.cpp b/examples/sycl/sycl_alltoall_usm_test.cpp
index 3a5c8c595..8fa744a97 100644
--- a/examples/sycl/sycl_alltoall_usm_test.cpp
+++ b/examples/sycl/sycl_alltoall_usm_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -28,8 +27,14 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
@@ -45,10 +50,6 @@ int main(int argc, char *argv[]) {
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -75,11 +76,10 @@ int main(int argc, char *argv[]) {
 
     /* open buffers and modify them on the device side */
     q.submit([&](auto &h) {
-        h.parallel_for(count * size,
-                                                          [=](auto id) {
-                                                              send_buf[id] = id / count + 1;
-                                                              recv_buf[id] = -1;
-                                                          });
+        h.parallel_for(count * size, [=](auto id) {
+            send_buf[id] = id / count + 1;
+            recv_buf[id] = -1;
+        });
     });
 
     if (!handle_exception(q))
@@ -116,7 +116,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_alltoallv_test.cpp b/examples/sycl/sycl_alltoallv_test.cpp
index cc9cf5181..fd9bd7810 100644
--- a/examples/sycl/sycl_alltoallv_test.cpp
+++ b/examples/sycl/sycl_alltoallv_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -29,16 +28,18 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -82,10 +83,9 @@ int main(int argc, char *argv[]) {
     /* open send_buf and modify it on the device side */
     q.submit([&](auto &h) {
         accessor send_buf_acc(send_buf, h, write_only);
-        h.parallel_for(count * size,
-                                                           [=](auto id) {
-                                                               send_buf_acc[id] += 1;
-                                                           });
+        h.parallel_for(count * size, [=](auto id) {
+            send_buf_acc[id] += 1;
+        });
     });
 
     if (!handle_exception(q))
@@ -97,12 +97,11 @@ int main(int argc, char *argv[]) {
     /* open recv_buf and check its correctness on the device side */
     q.submit([&](auto &h) {
         accessor recv_buf_acc(recv_buf, h, write_only);
-        h.parallel_for(count * size,
-                                                          [=](auto id) {
-                                                              if (recv_buf_acc[id] != rank + 1) {
-                                                                  recv_buf_acc[id] = -1;
-                                                              }
-                                                          });
+        h.parallel_for(count * size, [=](auto id) {
+            if (recv_buf_acc[id] != rank + 1) {
+                recv_buf_acc[id] = -1;
+            }
+        });
     });
 
     if (!handle_exception(q))
@@ -122,7 +121,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_alltoallv_usm_test.cpp b/examples/sycl/sycl_alltoallv_usm_test.cpp
index f9f0519c4..5f23ad973 100644
--- a/examples/sycl/sycl_alltoallv_usm_test.cpp
+++ b/examples/sycl/sycl_alltoallv_usm_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
 
     int i = 0;
@@ -28,8 +27,14 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
@@ -45,10 +50,6 @@ int main(int argc, char *argv[]) {
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -78,11 +79,10 @@ int main(int argc, char *argv[]) {
 
     /* open buffers and modify them on the device side */
     q.submit([&](auto &h) {
-        h.parallel_for(count * size,
-                                                          [=](auto id) {
-                                                              send_buf[id] = id / count + 1;
-                                                              recv_buf[id] = -1;
-                                                          });
+        h.parallel_for(count * size, [=](auto id) {
+            send_buf[id] = id / count + 1;
+            recv_buf[id] = -1;
+        });
     });
 
     if (!handle_exception(q))
@@ -119,7 +119,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/examples/sycl/sycl_broadcast_test.cpp b/examples/sycl/sycl_broadcast_test.cpp
index 8594731a8..1976afdd5 100644
--- a/examples/sycl/sycl_broadcast_test.cpp
+++ b/examples/sycl/sycl_broadcast_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
     const size_t root_rank = 0;
 
@@ -29,16 +28,18 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -67,7 +68,7 @@ int main(int argc, char *argv[]) {
         host_accessor send_buf_acc(buf, write_only);
         for (i = 0; i < count; i++) {
             if (rank == root_rank)
-                send_buf_acc[i] = rank;
+                send_buf_acc[i] = rank + 10;
             else
                 send_buf_acc[i] = 0;
         }
@@ -91,7 +92,7 @@ int main(int argc, char *argv[]) {
     q.submit([&](auto &h) {
         accessor recv_buf_acc(buf, h, write_only);
         h.parallel_for(count, [=](auto id) {
-            if (recv_buf_acc[id] != root_rank + 1) {
+            if (recv_buf_acc[id] != root_rank + 11) {
                 recv_buf_acc[id] = -1;
             }
         });
@@ -101,20 +102,16 @@ int main(int argc, char *argv[]) {
         return -1;
 
     /* print out the result of the test on the host side */
-    if (rank == root_rank) {
-        host_accessor recv_buf_acc(buf, read_only);
-        for (i = 0; i < count; i++) {
-            if (recv_buf_acc[i] == -1) {
-                cout << "FAILED\n";
-                break;
-            }
-        }
-        if (i == count) {
-            cout << "PASSED\n";
+    host_accessor recv_buf_acc(buf, read_only);
+    for (i = 0; i < count; i++) {
+        if (recv_buf_acc[i] == -1) {
+            cout << "FAILED\n";
+            break;
         }
     }
-
-    MPI_Finalize();
+    if (i == count) {
+        cout << "PASSED\n";
+    }
 
     return 0;
 }
diff --git a/examples/sycl/sycl_broadcast_usm_test.cpp b/examples/sycl/sycl_broadcast_usm_test.cpp
index cf64997d4..78b95af82 100644
--- a/examples/sycl/sycl_broadcast_usm_test.cpp
+++ b/examples/sycl/sycl_broadcast_usm_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
     const size_t root_rank = 0;
 
@@ -29,8 +28,14 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
@@ -46,10 +51,6 @@ int main(int argc, char *argv[]) {
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -76,8 +77,11 @@ int main(int argc, char *argv[]) {
     /* open buffers and modify them on the device side */
     q.submit([&](auto &h) {
         h.parallel_for(count, [=](auto id) {
-            if (id == root_rank) {
-                buf[id] = root_rank;
+            if (rank == root_rank) {
+                buf[id] = root_rank + 10;
+            }
+            else {
+                buf[id] = 0;
             }
             buf[id] += 1;
         });
@@ -94,7 +98,7 @@ int main(int argc, char *argv[]) {
     q.submit([&](auto &h) {
         accessor check_buf_acc(check_buf, h, write_only);
         h.parallel_for(count, [=](auto id) {
-            if (buf[id] != root_rank + 1) {
+            if (buf[id] != root_rank + 11) {
                 check_buf_acc[id] = -1;
             }
         });
@@ -104,20 +108,16 @@ int main(int argc, char *argv[]) {
         return -1;
 
     /* print out the result of the test on the host side */
-    if (rank == root_rank) {
-        host_accessor check_buf_acc(check_buf, read_only);
-        for (i = 0; i < count; i++) {
-            if (check_buf_acc[i] == -1) {
-                cout << "FAILED\n";
-                break;
-            }
-        }
-        if (i == count) {
-            cout << "PASSED\n";
+    host_accessor check_buf_acc(check_buf, read_only);
+    for (i = 0; i < count; i++) {
+        if (check_buf_acc[i] == -1) {
+            cout << "FAILED\n";
+            break;
         }
     }
-
-    MPI_Finalize();
+    if (i == count) {
+        cout << "PASSED\n";
+    }
 
     return 0;
 }
diff --git a/examples/sycl/sycl_reduce_test.cpp b/examples/sycl/sycl_reduce_test.cpp
index ae2739c5c..8d3230a2a 100644
--- a/examples/sycl/sycl_reduce_test.cpp
+++ b/examples/sycl/sycl_reduce_test.cpp
@@ -19,7 +19,6 @@ using namespace std;
 using namespace sycl;
 
 int main(int argc, char *argv[]) {
-
     const size_t count = 10 * 1024 * 1024;
     const size_t root_rank = 0;
 
@@ -29,16 +28,18 @@ int main(int argc, char *argv[]) {
 
     ccl::init();
 
+    MPI_Init(NULL, NULL);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    atexit(mpi_finalize);
+
     queue q;
-    if (!create_sycl_queue(argc, argv, q)) {
+    if (!create_sycl_queue(argc, argv, rank, q)) {
         return -1;
     }
 
     /* create kvs */
-    MPI_Init(NULL, NULL);
-    MPI_Comm_size(MPI_COMM_WORLD, &size);
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
     ccl::shared_ptr_class<ccl::kvs> kvs;
     ccl::kvs::address_type main_addr;
     if (rank == 0) {
@@ -124,7 +125,5 @@ int main(int argc, char *argv[]) {
         }
     }
 
-    MPI_Finalize();
-
     return 0;
 }
diff --git a/include/oneapi/ccl.hpp b/include/oneapi/ccl.hpp
index 8a0d3356b..cb60ff453 100644
--- a/include/oneapi/ccl.hpp
+++ b/include/oneapi/ccl.hpp
@@ -15,9 +15,9 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_environment.hpp"
-
-#include "oneapi/ccl/ccl_api_functions.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/environment.hpp"
+#include "oneapi/ccl/api_functions.hpp"
 
 namespace ccl {}
 namespace oneapi {
diff --git a/include/oneapi/ccl/ccl_aliases.hpp b/include/oneapi/ccl/aliases.hpp
similarity index 96%
rename from include/oneapi/ccl/ccl_aliases.hpp
rename to include/oneapi/ccl/aliases.hpp
index 9cb28d212..74031d7d8 100644
--- a/include/oneapi/ccl/ccl_aliases.hpp
+++ b/include/oneapi/ccl/aliases.hpp
@@ -26,6 +26,8 @@
 #include <utility>
 #include <vector>
 
+#include "oneapi/ccl/string.hpp"
+
 namespace ccl {
 template <class T, class Alloc = std::allocator<T>>
 using vector_class = std::vector<T, Alloc>;
@@ -33,7 +35,7 @@ using vector_class = std::vector<T, Alloc>;
 template <class T, std::size_t N>
 using array_class = std::array<T, N>;
 
-using string_class = std::string;
+using string_class = ccl::string;
 
 template <class R, class... ArgTypes>
 using function_class = std::function<R(ArgTypes...)>;
diff --git a/include/oneapi/ccl/ccl_api_functions.hpp b/include/oneapi/ccl/api_functions.hpp
similarity index 55%
rename from include/oneapi/ccl/ccl_api_functions.hpp
rename to include/oneapi/ccl/api_functions.hpp
index f2bedb1dc..6a1c8c961 100644
--- a/include/oneapi/ccl/ccl_api_functions.hpp
+++ b/include/oneapi/ccl/api_functions.hpp
@@ -13,1191 +13,1201 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#ifndef CCL_PRODUCT_FULL
-#error "Do not include this file directly. Please include 'ccl.hpp'"
-#endif
-
-namespace ccl {
-
-/******************** INIT ********************/
-
-/**
- * Initializes the library. Optional for invocation.
- */
-void init();
-
-/**
- * Retrieves the library version
- */
-library_version get_library_version();
-
-
-/******************** DATATYPE ********************/
-
-/**
- * Creates a datatype attribute object, which may used to register custom datatype
- * @return an attribute object
- */
-template <class... attr_value_pair_t>
-datatype_attr CCL_API create_datatype_attr(attr_value_pair_t&&... avps) {
-    return environment::instance().create_datatype_attr(std::forward<attr_value_pair_t>(avps)...);
-}
-
-/**
- * Registers custom datatype to be used in communication operations
- * @param attr datatype attributes
- * @return datatype handle
- */
-datatype register_datatype(const datatype_attr& attr);
-
-/**
- * Deregisters custom datatype
- * @param dtype custom datatype handle
- */
-void deregister_datatype(datatype dtype);
-
-/**
- * Retrieves a datatype size in bytes
- * @param dtype datatype handle
- * @return datatype size
- */
-size_t get_datatype_size(datatype dtype);
-
-
-/******************** KVS ********************/
-
-/**
- * Creates a main key-value store.
- * It's address should be distributed using out of band communication mechanism
- * and be used to create key-value stores on other ranks.
- * @return kvs object
- */
-shared_ptr_class<kvs> create_main_kvs();
-
-/**
- * Creates a new key-value store from main kvs address
- * @param addr address of main kvs
- * @return kvs object
- */
-shared_ptr_class<kvs> create_kvs(const kvs::address_type& addr);
-
-
-/******************** DEVICE ********************/
-
-/**
- * Creates a new device from @native_device_type
- * @param native_device the existing handle of device
- * @return device object
- */
-device create_device();
-
-template <class native_device_type,
-          class = typename std::enable_if<is_device_supported<native_device_type>()>::type>
-device create_device(native_device_type&& native_device) {
-    return environment::instance().create_device(std::forward<native_device_type>(native_device));
-}
-
-template <class... attr_value_pair_t>
-device create_device_from_attr(typename unified_device_type::ccl_native_t dev,
-                               attr_value_pair_t&&... avps) {
-    return environment::instance().create_device_from_attr(
-        dev, std::forward<attr_value_pair_t>(avps)...);
-}
-
-
-/******************** CONTEXT ********************/
-
-/**
- * Creates a new context from @native_device_contex_type
- * @param native_device_context the existing handle of context
- * @return context object
- */
-context create_context();
-
-template <class native_device_context_type,
-          class = typename std::enable_if<is_context_supported<native_device_context_type>()>::type>
-context create_context(native_device_context_type&& native_device_context) {
-    return environment::instance().create_context(std::forward<native_device_context_type>(native_device_context));
-}
-
-template <class... attr_value_pair_t>
-context create_context_from_attr(typename unified_device_context_type::ccl_native_t ctx,
-                               attr_value_pair_t&&... avps) {
-    return environment::instance().create_context_from_attr(
-        ctx, std::forward<attr_value_pair_t>(avps)...);
-}
-
-/******************** EVENT ********************/
-
-/**
- * Creates a new event from @native_event_type
- * @param native_event the existing handle of event
- * @return event object
- */
-template <class event_type,
-          class = typename std::enable_if<is_event_supported<event_type>()>::type>
-event create_event(event_type& native_event) {
-    return environment::instance().create_event(native_event);
-}
-
-
-/******************** STREAM ********************/
-
-/**
- * Creates a new stream from @native_stream_type
- * @param native_stream the existing handle of stream
- * @return stream object
- */
-stream create_stream();
-
-template <class native_stream_type,
-          class = typename std::enable_if<is_stream_supported<native_stream_type>()>::type>
-stream create_stream(native_stream_type& native_stream) {
-    return environment::instance().create_stream(native_stream);
-}
-
-template <class native_stream_type, class native_context_type,
-          class = typename std::enable_if<is_stream_supported<native_stream_type>()>::type>
-stream create_stream(native_stream_type& native_stream, native_context_type& native_ctx) {
-    return environment::instance().create_stream(native_stream, native_ctx);
-}
-
-template <class... attr_value_pair_t>
-stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
-                               attr_value_pair_t&&... avps) {
-    return environment::instance().create_stream_from_attr(
-        device, std::forward<attr_value_pair_t>(avps)...);
-}
-
-template <class... attr_value_pair_t>
-stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
-                               typename unified_device_context_type::ccl_native_t context,
-                               attr_value_pair_t&&... avps) {
-    return environment::instance().create_stream_from_attr(
-        device, context, std::forward<attr_value_pair_t>(avps)...);
-}
-
-
-/******************** COMMUNICATOR ********************/
-
-template <class... attr_value_pair_t>
-comm_split_attr create_comm_split_attr(attr_value_pair_t&&... avps) {
-    return environment::instance().create_comm_split_attr(
-    std::forward<attr_value_pair_t>(avps)...);
-}
-
-namespace preview {
-
-/**
- * Splits device communicators according to attributes.
- * @param attrs split attributes for local communicators
- * @return vector of device communicators
- */
-vector_class<communicator> split_device_communicators(
-    const vector_class<pair_class<communicator, comm_split_attr>>& attrs);
-
-
-/**
- * Creates a new communicator with externally provided size, rank and kvs.
- * Implementation is platform specific and non portable.
- * @return communicator
- */
-communicator create_communicator();
-
-/**
- * Creates a new communicator with user supplied size and kvs.
- * Rank will be assigned automatically.
- * @param size user-supplied total number of ranks
- * @param kvs key-value store for ranks wire-up
- * @return communicator
- */
-communicator create_communicator(size_t size, shared_ptr_class<kvs_interface> kvs);
-
-} // namespace preview
-
-
-/**
- * Creates a new communicator with user supplied size, rank and kvs.
- * @param size user-supplied total number of ranks
- * @param rank user-supplied rank
- * @param kvs key-value store for ranks wire-up
- * @return communicator
- */
-communicator create_communicator(size_t size,
-                                 size_t rank,
-                                 shared_ptr_class<kvs_interface> kvs);
-
-/**
- * Creates a new communicators with user supplied size, locao devices and kvs.
- * Ranks will be assigned automatically.
- * @param size user-supplied total number of ranks
- * @param local_devices user-supplied device objects for local ranks
- * @param context context containing the devices
- * @param kvs key-value store for ranks wire-up
- * @return vector of communicators
- */
-template <class DeviceType, class ContextType>
-vector_class<communicator> create_communicators(
-    size_t size,
-    const vector_class<DeviceType>& local_devices,
-    ContextType& context,
-    shared_ptr_class<kvs_interface> kvs) {
-    return environment::instance().create_communicators(
-        size, local_devices, context, kvs);
-}
-
-/**
- * Creates a new communicators with user supplied size, ranks, local device-rank mapping and kvs.
- * @param size user-supplied total number of ranks
- * @param local_rank_device_map user-supplied mapping of local ranks on devices
- * @param context context containing the devices
- * @param kvs key-value store for ranks wire-up
- * @return vector of communicators
- */
-template <class DeviceType, class ContextType>
-vector_class<communicator> create_communicators(
-    size_t size,
-    const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-    ContextType& context,
-    shared_ptr_class<kvs_interface> kvs) {
-    return environment::instance().create_communicators(
-        size, local_rank_device_map, context, kvs);
-}
-
-template <class DeviceType, class ContextType>
-vector_class<communicator> create_communicators(
-    size_t size,
-    const map_class<rank_t, DeviceType>& local_rank_device_map,
-    ContextType& context,
-    shared_ptr_class<kvs_interface> kvs) {
-    return environment::instance().create_communicators(
-        size, local_rank_device_map, context, kvs);
-}
-
-template <class DeviceType, class ContextType>
-communicator create_communicator(
-    size_t size,
-    rank_t rank,
-    DeviceType& device,
-    ContextType& context,
-    shared_ptr_class<kvs_interface> kvs) {
-
-    auto comms = environment::instance().create_communicators(
-        size, ccl::vector_class<ccl::pair_class<ccl::rank_t, ccl::device>>{{rank,device}}, context, kvs);
-
-    if (comms.size() != 1)
-      throw ccl::exception("unexpected comm vector size");
-
-    return std::move(comms[0]);
-}
-
-
-/******************** OPERATION ********************/
-
-/**
- * Creates an operation attribute object, which may used to customize communication operation
- * @return an attribute object
- */
-template <class coll_attribute_type, class... attr_value_pair_t>
-coll_attribute_type CCL_API create_operation_attr(attr_value_pair_t&&... avps) {
-    return environment::instance().create_operation_attr<coll_attribute_type>(
-        std::forward<attr_value_pair_t>(avps)...);
-}
-
-/**
- * Allgatherv is a collective communication operation that collects data
- * from all the ranks within a communicator into a single buffer.
- * Different ranks may contribute segments of different sizes.
- * The resulting data in the output buffer must be the same for each rank.
- */
-
-/**
- * @param send_buf the buffer with @c send_count elements of @c dtype that stores local data to be gathered
- * @param send_count the number of elements of type @c dtype in @c send_buf
- * @param recv_buf [out] the buffer to store gathered result, should be large enough to hold values from all ranks
- * @param recv_bufs [out] array of buffers to store gathered result, one buffer per each rank
- * @param recv_counts array with the number of elements of type @c dtype to be received from each rank
- * @param dtype the datatype of elements in @c send_buf and @c recv_buf
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-event allgatherv(const void* send_buf,
-                   size_t send_count,
-                   void* recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   datatype dtype,
-                   const communicator& comm,
-                   const stream& stream,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-event allgatherv(const void* send_buf,
-                   size_t send_count,
-                   void* recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   datatype dtype,
-                   const communicator& comm,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-event allgatherv(const void* send_buf,
-                   size_t send_count,
-                   const vector_class<void*>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   datatype dtype,
-                   const communicator& comm,
-                   const stream& stream,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-event allgatherv(const void* send_buf,
-                   size_t send_count,
-                   const vector_class<void*>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   datatype dtype,
-                   const communicator& comm,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event allgatherv(const BufferType* send_buf,
-                   size_t send_count,
-                   BufferType* recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const stream& stream,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event allgatherv(const BufferType* send_buf,
-                   size_t send_count,
-                   BufferType* recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event allgatherv(const BufferType* send_buf,
-                   size_t send_count,
-                   vector_class<BufferType*>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const stream& stream,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event allgatherv(const BufferType* send_buf,
-                   size_t send_count,
-                   vector_class<BufferType*>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event allgatherv(const BufferObjectType& send_buf,
-                   size_t send_count,
-                   BufferObjectType& recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const stream& stream,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event allgatherv(const BufferObjectType& send_buf,
-                   size_t send_count,
-                   BufferObjectType& recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event allgatherv(const BufferObjectType& send_buf,
-                   size_t send_count,
-                   vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const stream& stream,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event allgatherv(const BufferObjectType& send_buf,
-                   size_t send_count,
-                   vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const allgatherv_attr& attr = default_allgatherv_attr,
-                   const vector_class<event>& deps = {});
-
-/**
- * Allreduce is a collective communication operation that performs the global reduction operation
- * on values from all ranks of communicator and distributes the result back to all ranks.
- */
-
-/**
- * @param send_buf the buffer with @c count elements of @c dtype that stores local data to be reduced
- * @param recv_buf [out] the buffer to store reduced result, must have the same dimension as @c send_buf
- * @param count the number of elements of type @c dtype in @c send_buf and @c recv_buf
- * @param dtype the datatype of elements in @c send_buf and @c recv_buf
- * @param rtype the type of the reduction operation to be applied
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-event allreduce(const void* send_buf,
-                  void* recv_buf,
-                  size_t count,
-                  datatype dtype,
-                  reduction rtype,
-                  const communicator& comm,
-                  const stream& stream,
-                  const allreduce_attr& attr = default_allreduce_attr,
-                  const vector_class<event>& deps = {});
-
-event allreduce(const void* send_buf,
-                  void* recv_buf,
-                  size_t count,
-                  datatype dtype,
-                  reduction rtype,
-                  const communicator& comm,
-                  const allreduce_attr& attr = default_allreduce_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event allreduce(const BufferType* send_buf,
-                  BufferType* recv_buf,
-                  size_t count,
-                  reduction rtype,
-                  const communicator& comm,
-                  const stream& stream,
-                  const allreduce_attr& attr = default_allreduce_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event allreduce(const BufferType* send_buf,
-                  BufferType* recv_buf,
-                  size_t count,
-                  reduction rtype,
-                  const communicator& comm,
-                  const allreduce_attr& attr = default_allreduce_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event allreduce(const BufferObjectType& send_buf,
-                  BufferObjectType& recv_buf,
-                  size_t count,
-                  reduction rtype,
-                  const communicator& comm,
-                  const stream& stream,
-                  const allreduce_attr& attr = default_allreduce_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event allreduce(const BufferObjectType& send_buf,
-                  BufferObjectType& recv_buf,
-                  size_t count,
-                  reduction rtype,
-                  const communicator& comm,
-                  const allreduce_attr& attr = default_allreduce_attr,
-                  const vector_class<event>& deps = {});
-
-/**
- * Alltoall is a collective communication operation in which each rank
- * sends distinct equal-sized blocks of data to each rank.
- * The j-th block of @c send_buf sent from the i-th rank is received by the j-th rank
- * and is placed in the i-th block of @c recvbuf.
- */
-
-/**
- * @param send_buf the buffer with @c count elements of @c dtype that stores local data to be sent
- * @param recv_buf [out] the buffer to store received result, should be large enough
- * to hold values from all ranks, i.e. at least @c comm_size * @c count
- * @param send_bufs array of buffers with local data to be sent, one buffer per each rank
- * @param recv_bufs [out] array of buffers to store received result, one buffer per each rank
- * @param count the number of elements of type @c dtype to be send to or to received from each rank
- * @param dtype the datatype of elements in @c send_buf and @c recv_buf
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-event alltoall(const void* send_buf,
-                 void* recv_buf,
-                 size_t count,
-                 datatype dtype,
-                 const communicator& comm,
-                 const stream& stream,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-event alltoall(const void* send_buf,
-                 void* recv_buf,
-                 size_t count,
-                 datatype dtype,
-                 const communicator& comm,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-event alltoall(const vector_class<void*>& send_buf,
-                 const vector_class<void*>& recv_buf,
-                 size_t count,
-                 datatype dtype,
-                 const communicator& comm,
-                 const stream& stream,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-event alltoall(const vector_class<void*>& send_buf,
-                 const vector_class<void*>& recv_buf,
-                 size_t count,
-                 datatype dtype,
-                 const communicator& comm,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event alltoall(const BufferType* send_buf,
-                 BufferType* recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const stream& stream,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event alltoall(const BufferType* send_buf,
-                 BufferType* recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event alltoall(const vector_class<BufferType*>& send_buf,
-                 const vector_class<BufferType*>& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const stream& stream,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event alltoall(const vector_class<BufferType*>& send_buf,
-                 const vector_class<BufferType*>& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event alltoall(const BufferObjectType& send_buf,
-                 BufferObjectType& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const stream& stream,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event alltoall(const BufferObjectType& send_buf,
-                 BufferObjectType& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf,
-                 const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const stream& stream,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf,
-                 const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const alltoall_attr& attr = default_alltoall_attr,
-                 const vector_class<event>& deps = {});
-
-/**
- * Alltoallv is a collective communication operation in which each rank
- * sends distinct blocks of data to each rank. Block sizes may differ.
- * The j-th block of @c send_buf sent from the i-th rank is received by the j-th rank
- * and is placed in the i-th block of @c recvbuf.
- */
-
-/**
- * @param send_buf the buffer with elements of @c dtype that stores local blocks to be sent to each rank
- * @param send_bufs array of buffers to store send blocks, one buffer per each rank
- * @param recv_buf [out] the buffer to store received result, should be large enough to hold blocks from all ranks
- * @param recv_bufs [out] array of buffers to store receive blocks, one buffer per each rank
- * @param send_counts array with the number of elements of type @c dtype in send blocks for each rank
- * @param recv_counts array with the number of elements of type @c dtype in receive blocks from each rank
- * @param dtype the datatype of elements in @c send_buf and @c recv_buf
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-event alltoallv(const void* send_buf,
-                  const vector_class<size_t>& send_counts,
-                  void* recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  datatype dtype,
-                  const communicator& comm,
-                  const stream& stream,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-event alltoallv(const void* send_buf,
-                  const vector_class<size_t>& send_counts,
-                  void* recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  datatype dtype,
-                  const communicator& comm,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-event alltoallv(const vector_class<void*>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<void*>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  datatype dtype,
-                  const communicator& comm,
-                  const stream& stream,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-event alltoallv(const vector_class<void*>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<void*>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  datatype dtype,
-                  const communicator& comm,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event alltoallv(const BufferType* send_buf,
-                  const vector_class<size_t>& send_counts,
-                  BufferType* recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const stream& stream,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event alltoallv(const BufferType* send_buf,
-                  const vector_class<size_t>& send_counts,
-                  BufferType* recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event alltoallv(const vector_class<BufferType*>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<BufferType*>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const stream& stream,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event alltoallv(const vector_class<BufferType*>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<BufferType*>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event alltoallv(const BufferObjectType& send_buf,
-                  const vector_class<size_t>& send_counts,
-                  BufferObjectType& recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const stream& stream,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event alltoallv(const BufferObjectType& send_buf,
-                  const vector_class<size_t>& send_counts,
-                  BufferObjectType& recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const stream& stream,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const alltoallv_attr& attr = default_alltoallv_attr,
-                  const vector_class<event>& deps = {});
-
-/**
- * Barrier synchronization is performed across all ranks of the communicator
- * and it is completed only after all the ranks in the communicator have called it.
- */
-
-/**
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-event barrier(const communicator& comm,
-                const stream& stream,
-                const barrier_attr& attr = default_barrier_attr,
-                const vector_class<event>& deps = {});
-
-event barrier(const communicator& comm,
-                const barrier_attr& attr = default_barrier_attr,
-                const vector_class<event>& deps = {});
-
-/**
- * Broadcast is a collective communication operation that broadcasts data
- * from one rank of communicator (denoted as root) to all other ranks.
- */
-
-/**
- * @param buf [in,out] the buffer with @c count elements of @c dtype
- * serves as send buffer for root and as receive buffer for other ranks
- * @param count the number of elements of type @c dtype in @c buf
- * @param dtype the datatype of elements in @c buf
- * @param root the rank that broadcasts @c buf
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-event broadcast(void* buf,
-                  size_t count,
-                  datatype dtype,
-                  size_t root,
-                  const communicator& comm,
-                  const stream& stream,
-                  const broadcast_attr& attr = default_broadcast_attr,
-                  const vector_class<event>& deps = {});
-
-event broadcast(void* buf,
-                  size_t count,
-                  datatype dtype,
-                  size_t root,
-                  const communicator& comm,
-                  const broadcast_attr& attr = default_broadcast_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event broadcast(BufferType* buf,
-                  size_t count,
-                  size_t root,
-                  const communicator& comm,
-                  const stream& stream,
-                  const broadcast_attr& attr = default_broadcast_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event broadcast(BufferType* buf,
-                  size_t count,
-                  size_t root,
-                  const communicator& comm,
-                  const broadcast_attr& attr = default_broadcast_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event broadcast(BufferObjectType& buf,
-                  size_t count,
-                  size_t root,
-                  const communicator& comm,
-                  const stream& stream,
-                  const broadcast_attr& attr = default_broadcast_attr,
-                  const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event broadcast(BufferObjectType& buf,
-                  size_t count,
-                  size_t root,
-                  const communicator& comm,
-                  const broadcast_attr& attr = default_broadcast_attr,
-                  const vector_class<event>& deps = {});
-
-/**
- * Reduce is a collective communication operation that performs the global reduction operation
- * on values from all ranks of the communicator and returns the result to the root rank.
- */
-
-/**
- * @param send_buf the buffer with @c count elements of @c dtype that stores local data to be reduced
- * @param recv_buf [out] the buffer to store reduced result, must have the same dimension as @c send_buf.
- * Used by the @c root rank only, ignored by other ranks.
- * @param count the number of elements of type @c dtype in @c send_buf and @c recv_buf
- * @param dtype the datatype of elements in @c send_buf and @c recv_buf
- * @param rtype the type of the reduction operation to be applied
- * @param root the rank that gets the result of reduction
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-event reduce(const void* send_buf,
-               void* recv_buf,
-               size_t count,
-               datatype dtype,
-               reduction rtype,
-               size_t root,
-               const communicator& comm,
-               const stream& stream,
-               const reduce_attr& attr = default_reduce_attr,
-               const vector_class<event>& deps = {});
-
-event reduce(const void* send_buf,
-               void* recv_buf,
-               size_t count,
-               datatype dtype,
-               reduction rtype,
-               size_t root,
-               const communicator& comm,
-               const reduce_attr& attr = default_reduce_attr,
-               const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event reduce(const BufferType* send_buf,
-               BufferType* recv_buf,
-               size_t count,
-               reduction rtype,
-               size_t root,
-               const communicator& comm,
-               const stream& stream,
-               const reduce_attr& attr = default_reduce_attr,
-               const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event reduce(const BufferType* send_buf,
-               BufferType* recv_buf,
-               size_t count,
-               reduction rtype,
-               size_t root,
-               const communicator& comm,
-               const reduce_attr& attr = default_reduce_attr,
-               const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event reduce(const BufferObjectType& send_buf,
-               BufferObjectType& recv_buf,
-               size_t count,
-               reduction rtype,
-               size_t root,
-               const communicator& comm,
-               const stream& stream,
-               const reduce_attr& attr = default_reduce_attr,
-               const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event reduce(const BufferObjectType& send_buf,
-               BufferObjectType& recv_buf,
-               size_t count,
-               reduction rtype,
-               size_t root,
-               const communicator& comm,
-               const reduce_attr& attr = default_reduce_attr,
-               const vector_class<event>& deps = {});
-
-/**
- * Reduce-scatter is a collective communication operation that performs the global reduction operation
- * on values from all ranks of the communicator and scatters the result in blocks back to all ranks.
- */
-
-/**
- * @param send_buf the buffer with @c comm_size * @c count elements of @c dtype that stores local data to be reduced
- * @param recv_buf [out] the buffer to store result block containing @c recv_count elements of type @c dtype
- * @param recv_count the number of elements of type @c dtype in receive block
- * @param dtype the datatype of elements in @c send_buf and @c recv_buf
- * @param rtype the type of the reduction operation to be applied
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-event reduce_scatter(const void* send_buf,
-                       void* recv_buf,
-                       size_t recv_count,
-                       datatype dtype,
-                       reduction rtype,
-                       const communicator& comm,
-                       const stream& stream,
-                       const reduce_scatter_attr& attr = default_reduce_scatter_attr,
-                       const vector_class<event>& deps = {});
-
-event reduce_scatter(const void* send_buf,
-                       void* recv_buf,
-                       size_t recv_count,
-                       datatype dtype,
-                       reduction rtype,
-                       const communicator& comm,
-                       const reduce_scatter_attr& attr = default_reduce_scatter_attr,
-                       const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event reduce_scatter(const BufferType* send_buf,
-                       BufferType* recv_buf,
-                       size_t recv_count,
-                       reduction rtype,
-                       const communicator& comm,
-                       const stream& stream,
-                       const reduce_scatter_attr& attr = default_reduce_scatter_attr,
-                       const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferType,
-          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
-event reduce_scatter(const BufferType* send_buf,
-                       BufferType* recv_buf,
-                       size_t recv_count,
-                       reduction rtype,
-                       const communicator& comm,
-                       const reduce_scatter_attr& attr = default_reduce_scatter_attr,
-                       const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event reduce_scatter(const BufferObjectType& send_buf,
-                       BufferObjectType& recv_buf,
-                       size_t recv_count,
-                       reduction rtype,
-                       const communicator& comm,
-                       const stream& stream,
-                       const reduce_scatter_attr& attr = default_reduce_scatter_attr,
-                       const vector_class<event>& deps = {});
-
-/* Type safety version */
-template <class BufferObjectType,
-          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
-event reduce_scatter(const BufferObjectType& send_buf,
-                       BufferObjectType& recv_buf,
-                       size_t recv_count,
-                       reduction rtype,
-                       const communicator& comm,
-                       const reduce_scatter_attr& attr = default_reduce_scatter_attr,
-                       const vector_class<event>& deps = {});
-
-namespace preview {
-
-/**
- * Sparse allreduce is a collective communication operation that makes global reduction operation
- * on sparse buffers from all ranks of communicator and distributes result back to all ranks.
- * Sparse buffers are defined by separate index and value buffers.
- */
-
-/**
- * @param send_ind_buf the buffer of indices with @c send_ind_count elements of type @c ind_dtype
- * @param send_ind_count the number of elements of type @c ind_type @c send_ind_buf
- * @param send_val_buf the buffer of values with @c send_val_count elements of type @c val_dtype
- * @param send_val_count the number of elements of type @c val_type @c send_val_buf
- * @param recv_ind_buf [out] the buffer to store reduced indices, unused
- * @param recv_ind_count [out] the number of elements in @c recv_ind_buf, unused
- * @param recv_val_buf [out] the buffer to store reduced values, unused
- * @param recv_val_count [out] the number of elements in @c recv_val_buf, unused
- * @param ind_dtype the datatype of elements in @c send_ind_buf and @c recv_ind_buf
- * @param val_dtype the datatype of elements in @c send_val_buf and @c recv_val_buf
- * @param rtype the type of the reduction operation to be applied
- * @param comm the communicator for which the operation will be performed
- * @param stream an optional stream associated with the operation
- * @param attr optional attributes to customize operation
- * @param deps an optional vector of the events that the operation should depend on
- * @return @ref ccl::event an object to track the progress of the operation
- */
-
-ccl::event sparse_allreduce(
-    const void* send_ind_buf,
-    size_t send_ind_count,
-    const void* send_val_buf,
-    size_t send_val_count,
-    void* recv_ind_buf,
-    size_t recv_ind_count,
-    void* recv_val_buf,
-    size_t recv_val_count,
-    ccl::datatype ind_dtype,
-    ccl::datatype val_dtype,
-    ccl::reduction rtype,
-    const ccl::communicator& comm,
-    const ccl::stream& stream,
-    const ccl::sparse_allreduce_attr& attr = ccl::default_sparse_allreduce_attr,
-    const ccl::vector_class<ccl::event>& deps = {});
-
-ccl::event sparse_allreduce(
-    const void* send_ind_buf,
-    size_t send_ind_count,
-    const void* send_val_buf,
-    size_t send_val_count,
-    void* recv_ind_buf,
-    size_t recv_ind_count,
-    void* recv_val_buf,
-    size_t recv_val_count,
-    ccl::datatype ind_dtype,
-    ccl::datatype val_dtype,
-    ccl::reduction rtype,
-    const ccl::communicator& comm,
-    const ccl::sparse_allreduce_attr& attr = ccl::default_sparse_allreduce_attr,
-    const ccl::vector_class<ccl::event>& deps = {});
-
-/* Type safety version */
-template <class IndexBufferType,
-          class ValueBufferType,
-          class = typename std::enable_if<ccl::is_native_type_supported<ValueBufferType>(),
-                                          ccl::event>::type>
-ccl::event sparse_allreduce(
-    const IndexBufferType* send_ind_buf,
-    size_t send_ind_count,
-    const ValueBufferType* send_val_buf,
-    size_t send_val_count,
-    IndexBufferType* recv_ind_buf,
-    size_t recv_ind_count,
-    ValueBufferType* recv_val_buf,
-    size_t recv_val_count,
-    ccl::reduction rtype,
-    const ccl::communicator& comm,
-    const ccl::stream& stream,
-    const ccl::sparse_allreduce_attr& attr = default_sparse_allreduce_attr,
-    const ccl::vector_class<ccl::event>& deps = {});
-
-/* Type safety version */
-template <class IndexBufferType,
-          class ValueBufferType,
-          class = typename std::enable_if<ccl::is_native_type_supported<ValueBufferType>(),
-                                          ccl::event>::type>
-ccl::event sparse_allreduce(
-    const IndexBufferType* send_ind_buf,
-    size_t send_ind_count,
-    const ValueBufferType* send_val_buf,
-    size_t send_val_count,
-    IndexBufferType* recv_ind_buf,
-    size_t recv_ind_count,
-    ValueBufferType* recv_val_buf,
-    size_t recv_val_count,
-    ccl::reduction rtype,
-    const ccl::communicator& comm,
-    const ccl::sparse_allreduce_attr& attr = default_sparse_allreduce_attr,
-    const ccl::vector_class<ccl::event>& deps = {});
-
-} // namespace preview
-
-} // namespace ccl
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace v1 {
+
+/******************** INIT ********************/
+
+/**
+ * Creates an attribute object, which may be used to control the init operation
+ * @return an attribute object
+ */
+template <class... attr_value_pair_t>
+init_attr create_init_attr(attr_value_pair_t&&... avps) {
+    return detail::environment::create_init_attr(std::forward<attr_value_pair_t>(avps)...);
+}
+
+/**
+ * Initializes the library. Optional for invocation.
+ * @param attr optional init attributes
+ */
+void init(const init_attr& attr = default_init_attr);
+
+/**
+ * Retrieves the library version
+ */
+library_version get_library_version();
+
+/******************** DATATYPE ********************/
+
+/**
+ * Creates an attribute object, which may be used to register a custom datatype
+ * @return an attribute object
+ */
+template <class... attr_value_pair_t>
+datatype_attr create_datatype_attr(attr_value_pair_t&&... avps) {
+    return detail::environment::create_datatype_attr(std::forward<attr_value_pair_t>(avps)...);
+}
+
+/**
+ * Registers custom datatype to be used in communication operations
+ * @param attr datatype attributes
+ * @return datatype handle
+ */
+datatype register_datatype(const datatype_attr& attr);
+
+/**
+ * Deregisters custom datatype
+ * @param dtype custom datatype handle
+ */
+void deregister_datatype(datatype dtype);
+
+/**
+ * Retrieves a datatype size in bytes
+ * @param dtype datatype handle
+ * @return datatype size
+ */
+size_t get_datatype_size(datatype dtype);
+
+/******************** KVS ********************/
+
+template <class... attr_value_pair_t>
+kvs_attr create_kvs_attr(attr_value_pair_t&&... avps) {
+    return detail::environment::create_kvs_attr(std::forward<attr_value_pair_t>(avps)...);
+}
+
+/**
+ * Creates a main key-value store.
+ * Its address should be distributed using an out-of-band communication mechanism
+ * and be used to create key-value stores on other processes.
+ * @param attr optional kvs attributes
+ * @return kvs object
+ */
+shared_ptr_class<kvs> create_main_kvs(const kvs_attr& attr = default_kvs_attr);
+
+/**
+ * Creates a new key-value store from main kvs address
+ * @param addr address of main kvs
+ * @param attr optional kvs attributes
+ * @return kvs object
+ */
+shared_ptr_class<kvs> create_kvs(const kvs::address_type& addr,
+                                 const kvs_attr& attr = default_kvs_attr);
+
+/******************** DEVICE ********************/
+
+/**
+ * Creates a new device from @c native_device_type
+ * @param native_device the existing handle of device
+ * @return device object
+ */
+template <class native_device_type,
+          class = typename std::enable_if<is_device_supported<native_device_type>()>::type>
+device create_device(native_device_type&& native_device) {
+    return detail::environment::instance().create_device(
+        std::forward<native_device_type>(native_device));
+}
+
+device create_device();
+
+/******************** CONTEXT ********************/
+
+/**
+ * Creates a new context from @c native_context_type
+ * @param native_context the existing handle of context
+ * @return context object
+ */
+template <class native_context_type,
+          class = typename std::enable_if<is_context_supported<native_context_type>()>::type>
+context create_context(native_context_type&& native_context) {
+    return detail::environment::instance().create_context(
+        std::forward<native_context_type>(native_context));
+}
+
+context create_context();
+
+/******************** EVENT ********************/
+
+/**
+ * Creates a new event from @c event_type
+ * @param native_event the existing event
+ * @return event object
+ */
+template <class event_type, class = typename std::enable_if<is_event_supported<event_type>()>::type>
+event create_event(event_type& native_event) {
+    return detail::environment::instance().create_event(native_event);
+}
+
+/******************** STREAM ********************/
+
+/**
+ * Creates a new stream from @c native_stream_type
+ * @param native_stream the existing handle of stream
+ * @return stream object
+ */
+template <class native_stream_type,
+          class = typename std::enable_if<is_stream_supported<native_stream_type>()>::type>
+stream create_stream(native_stream_type& native_stream) {
+    return detail::environment::instance().create_stream(native_stream);
+}
+
+stream create_stream();
+
+/******************** COMMUNICATOR ********************/
+
+/**
+ * Creates an attribute object, which may be used to control the create communicator operation
+ * @return an attribute object
+ */
+template <class... attr_value_pair_t>
+comm_attr create_comm_attr(attr_value_pair_t&&... avps) {
+    return detail::environment::create_comm_attr(std::forward<attr_value_pair_t>(avps)...);
+}
+
+} // namespace v1
+
+namespace preview {
+
+/**
+ * Creates an attribute object, which may be used to control the split communicator operation
+ * @return an attribute object
+ */
+template <class... attr_value_pair_t>
+comm_split_attr create_comm_split_attr(attr_value_pair_t&&... avps) {
+    return detail::environment::create_comm_split_attr(std::forward<attr_value_pair_t>(avps)...);
+}
+
+} // namespace preview
+
+namespace v1 {
+
+/**
+ * Creates a new communicator with user supplied size, rank and kvs.
+ * @param size user-supplied total number of ranks
+ * @param rank user-supplied rank
+ * @param kvs key-value store for ranks wire-up
+ * @return communicator
+ */
+communicator create_communicator(int size,
+                                 int rank,
+                                 shared_ptr_class<kvs_interface> kvs,
+                                 const comm_attr& attr = default_comm_attr);
+
+/**
+ * Creates new communicators with user-supplied size, ranks, local device-rank mapping and kvs.
+ * @param size user-supplied total number of ranks
+ * @param device local device
+ * @param devices user-supplied mapping of local ranks on devices
+ * @param context context containing the devices
+ * @param kvs key-value store for ranks wire-up
+ * @return vector of communicators
+ */
+template <class DeviceType, class ContextType>
+vector_class<communicator> create_communicators(
+    int size,
+    const vector_class<pair_class<int, DeviceType>>& devices,
+    const ContextType& context,
+    shared_ptr_class<kvs_interface> kvs,
+    const comm_attr& attr = default_comm_attr) {
+    return detail::environment::instance().create_communicators(size, devices, context, kvs, attr);
+}
+
+template <class DeviceType, class ContextType>
+vector_class<communicator> create_communicators(int size,
+                                                const map_class<int, DeviceType>& devices,
+                                                const ContextType& context,
+                                                shared_ptr_class<kvs_interface> kvs,
+                                                const comm_attr& attr = default_comm_attr) {
+    return detail::environment::instance().create_communicators(size, devices, context, kvs, attr);
+}
+
+template <class DeviceType, class ContextType>
+communicator create_communicator(int size,
+                                 int rank,
+                                 DeviceType& device,
+                                 const ContextType& context,
+                                 shared_ptr_class<kvs_interface> kvs,
+                                 const comm_attr& attr = default_comm_attr) {
+    auto comms = detail::environment::instance().create_communicators(
+        size,
+        ccl::vector_class<ccl::pair_class<int, ccl::device>>{ { rank, device } },
+        context,
+        kvs,
+        attr);
+
+    if (comms.size() != 1)
+        throw ccl::exception("unexpected comm vector size");
+
+    return std::move(comms[0]);
+}
+
+} // namespace v1
+
+namespace preview {
+
+/**
+ * Splits communicators according to attributes.
+ * @param attrs split attributes for local communicators
+ * @return vector of communicators
+ */
+vector_class<communicator> split_communicators(
+    const vector_class<pair_class<communicator, comm_split_attr>>& attrs);
+
+/**
+ * Creates a new communicator with externally provided size, rank and kvs.
+ * Implementation is platform specific and non portable.
+ * @return communicator
+ */
+communicator create_communicator(const comm_attr& attr = default_comm_attr);
+
+/**
+ * Creates a new communicator with user supplied size and kvs.
+ * Rank will be assigned automatically.
+ * @param size user-supplied total number of ranks
+ * @param kvs key-value store for ranks wire-up
+ * @return communicator
+ */
+communicator create_communicator(int size,
+                                 shared_ptr_class<kvs_interface> kvs,
+                                 const comm_attr& attr = default_comm_attr);
+
+/**
+ * Creates new communicators with user-supplied size, local devices and kvs.
+ * Ranks will be assigned automatically.
+ * @param size user-supplied total number of ranks
+ * @param devices user-supplied device objects for local ranks
+ * @param context context containing the devices
+ * @param kvs key-value store for ranks wire-up
+ * @return vector of communicators
+ */
+template <class DeviceType, class ContextType>
+vector_class<communicator> create_communicators(int size,
+                                                const vector_class<DeviceType>& devices,
+                                                const ContextType& context,
+                                                shared_ptr_class<kvs_interface> kvs,
+                                                const comm_attr& attr = default_comm_attr) {
+    return detail::environment::instance().create_communicators(size, devices, context, kvs, attr);
+}
+
+} // namespace preview
+
+namespace v1 {
+
+/******************** OPERATION ********************/
+
+/**
+ * Creates an attribute object, which may be used to customize a communication operation
+ * @return an attribute object
+ */
+template <class coll_attribute_type, class... attr_value_pair_t>
+coll_attribute_type CCL_API create_operation_attr(attr_value_pair_t&&... avps) {
+    return detail::environment::create_operation_attr<coll_attribute_type>(
+        std::forward<attr_value_pair_t>(avps)...);
+}
+
+/**
+ * Allgatherv is a collective communication operation that collects data
+ * from all the ranks within a communicator into a single buffer.
+ * Different ranks may contribute segments of different sizes.
+ * The resulting data in the output buffer must be the same for each rank.
+ */
+
+/**
+ * @param send_buf the buffer with @c send_count elements of @c dtype that stores local data to be gathered
+ * @param send_count the number of elements of type @c dtype in @c send_buf
+ * @param recv_buf [out] the buffer to store gathered result, should be large enough to hold values from all ranks
+ * @param recv_bufs [out] array of buffers to store gathered result, one buffer per each rank
+ * @param recv_counts array with the number of elements of type @c dtype to be received from each rank
+ * @param dtype the datatype of elements in @c send_buf and @c recv_buf
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+event allgatherv(const void* send_buf,
+                 size_t send_count,
+                 void* recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 datatype dtype,
+                 const communicator& comm,
+                 const stream& stream,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+event allgatherv(const void* send_buf,
+                 size_t send_count,
+                 void* recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 datatype dtype,
+                 const communicator& comm,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+event allgatherv(const void* send_buf,
+                 size_t send_count,
+                 const vector_class<void*>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 datatype dtype,
+                 const communicator& comm,
+                 const stream& stream,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+event allgatherv(const void* send_buf,
+                 size_t send_count,
+                 const vector_class<void*>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 datatype dtype,
+                 const communicator& comm,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event allgatherv(const BufferType* send_buf,
+                 size_t send_count,
+                 BufferType* recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const stream& stream,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event allgatherv(const BufferType* send_buf,
+                 size_t send_count,
+                 BufferType* recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event allgatherv(const BufferType* send_buf,
+                 size_t send_count,
+                 vector_class<BufferType*>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const stream& stream,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event allgatherv(const BufferType* send_buf,
+                 size_t send_count,
+                 vector_class<BufferType*>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event allgatherv(const BufferObjectType& send_buf,
+                 size_t send_count,
+                 BufferObjectType& recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const stream& stream,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event allgatherv(const BufferObjectType& send_buf,
+                 size_t send_count,
+                 BufferObjectType& recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event allgatherv(const BufferObjectType& send_buf,
+                 size_t send_count,
+                 vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const stream& stream,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event allgatherv(const BufferObjectType& send_buf,
+                 size_t send_count,
+                 vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const allgatherv_attr& attr = default_allgatherv_attr,
+                 const vector_class<event>& deps = {});
+
+/**
+ * Allreduce is a collective communication operation that performs the global reduction operation
+ * on values from all ranks of communicator and distributes the result back to all ranks.
+ */
+
+/**
+ * @param send_buf the buffer with @c count elements of @c dtype that stores local data to be reduced
+ * @param recv_buf [out] the buffer to store reduced result, must have the same dimension as @c send_buf
+ * @param count the number of elements of type @c dtype in @c send_buf and @c recv_buf
+ * @param dtype the datatype of elements in @c send_buf and @c recv_buf
+ * @param rtype the type of the reduction operation to be applied
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+event allreduce(const void* send_buf,
+                void* recv_buf,
+                size_t count,
+                datatype dtype,
+                reduction rtype,
+                const communicator& comm,
+                const stream& stream,
+                const allreduce_attr& attr = default_allreduce_attr,
+                const vector_class<event>& deps = {});
+
+event allreduce(const void* send_buf,
+                void* recv_buf,
+                size_t count,
+                datatype dtype,
+                reduction rtype,
+                const communicator& comm,
+                const allreduce_attr& attr = default_allreduce_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event allreduce(const BufferType* send_buf,
+                BufferType* recv_buf,
+                size_t count,
+                reduction rtype,
+                const communicator& comm,
+                const stream& stream,
+                const allreduce_attr& attr = default_allreduce_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event allreduce(const BufferType* send_buf,
+                BufferType* recv_buf,
+                size_t count,
+                reduction rtype,
+                const communicator& comm,
+                const allreduce_attr& attr = default_allreduce_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event allreduce(const BufferObjectType& send_buf,
+                BufferObjectType& recv_buf,
+                size_t count,
+                reduction rtype,
+                const communicator& comm,
+                const stream& stream,
+                const allreduce_attr& attr = default_allreduce_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event allreduce(const BufferObjectType& send_buf,
+                BufferObjectType& recv_buf,
+                size_t count,
+                reduction rtype,
+                const communicator& comm,
+                const allreduce_attr& attr = default_allreduce_attr,
+                const vector_class<event>& deps = {});
+
+/**
+ * Alltoall is a collective communication operation in which each rank
+ * sends distinct equal-sized blocks of data to each rank.
+ * The j-th block of @c send_buf sent from the i-th rank is received by the j-th rank
+ * and is placed in the i-th block of @c recv_buf.
+ */
+
+/**
+ * @param send_buf the buffer with @c count elements of @c dtype that stores local data to be sent
+ * @param recv_buf [out] the buffer to store received result, should be large enough
+ * to hold values from all ranks, i.e. at least @c comm_size * @c count
+ * @param send_bufs array of buffers with local data to be sent, one buffer per each rank
+ * @param recv_bufs [out] array of buffers to store received result, one buffer per each rank
+ * @param count the number of elements of type @c dtype to be sent to or received from each rank
+ * @param dtype the datatype of elements in @c send_buf and @c recv_buf
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+event alltoall(const void* send_buf,
+               void* recv_buf,
+               size_t count,
+               datatype dtype,
+               const communicator& comm,
+               const stream& stream,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+event alltoall(const void* send_buf,
+               void* recv_buf,
+               size_t count,
+               datatype dtype,
+               const communicator& comm,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+event alltoall(const vector_class<void*>& send_bufs, // array of send buffers, one buffer per rank
+               const vector_class<void*>& recv_bufs, // [out] array of receive buffers, one buffer per rank
+               size_t count,
+               datatype dtype,
+               const communicator& comm,
+               const stream& stream,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+event alltoall(const vector_class<void*>& send_bufs, // array of send buffers, one buffer per rank
+               const vector_class<void*>& recv_bufs, // [out] array of receive buffers, one buffer per rank
+               size_t count,
+               datatype dtype,
+               const communicator& comm,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event alltoall(const BufferType* send_buf,
+               BufferType* recv_buf,
+               size_t count,
+               const communicator& comm,
+               const stream& stream,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event alltoall(const BufferType* send_buf,
+               BufferType* recv_buf,
+               size_t count,
+               const communicator& comm,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/* Type-safe version: per-rank buffer arrays typed by BufferType, no explicit dtype argument */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event alltoall(const vector_class<BufferType*>& send_bufs,
+               const vector_class<BufferType*>& recv_bufs,
+               size_t count,
+               const communicator& comm,
+               const stream& stream,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/* Type-safe version: per-rank buffer arrays typed by BufferType, no dtype and no stream argument */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event alltoall(const vector_class<BufferType*>& send_bufs,
+               const vector_class<BufferType*>& recv_bufs,
+               size_t count,
+               const communicator& comm,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event alltoall(const BufferObjectType& send_buf,
+               BufferObjectType& recv_buf,
+               size_t count,
+               const communicator& comm,
+               const stream& stream,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event alltoall(const BufferObjectType& send_buf,
+               BufferObjectType& recv_buf,
+               size_t count,
+               const communicator& comm,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/* Type-safe version: per-rank arrays of BufferObjectType references, no explicit dtype argument */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
+               const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
+               size_t count,
+               const communicator& comm,
+               const stream& stream,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/* Type-safe version: per-rank arrays of BufferObjectType references, no dtype and no stream argument */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
+               const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
+               size_t count,
+               const communicator& comm,
+               const alltoall_attr& attr = default_alltoall_attr,
+               const vector_class<event>& deps = {});
+
+/**
+ * Alltoallv is a collective communication operation in which each rank
+ * sends distinct blocks of data to each rank. Block sizes may differ.
+ * The j-th block of @c send_buf sent from the i-th rank is received by the j-th rank
+ * and is placed in the i-th block of @c recv_buf.
+ */
+
+/**
+ * @param send_buf the buffer with elements of @c dtype that stores local blocks to be sent to each rank
+ * @param send_bufs array of buffers to store send blocks, one buffer per each rank
+ * @param recv_buf [out] the buffer to store received result, should be large enough to hold blocks from all ranks
+ * @param recv_bufs [out] array of buffers to store receive blocks, one buffer per each rank
+ * @param send_counts array with the number of elements of type @c dtype in send blocks for each rank
+ * @param recv_counts array with the number of elements of type @c dtype in receive blocks from each rank
+ * @param dtype the datatype of elements in @c send_buf and @c recv_buf
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+event alltoallv(const void* send_buf,
+                const vector_class<size_t>& send_counts,
+                void* recv_buf,
+                const vector_class<size_t>& recv_counts,
+                datatype dtype,
+                const communicator& comm,
+                const stream& stream,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+event alltoallv(const void* send_buf,
+                const vector_class<size_t>& send_counts,
+                void* recv_buf,
+                const vector_class<size_t>& recv_counts,
+                datatype dtype,
+                const communicator& comm,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Version with per-rank send and receive buffers (untyped; element type is given by dtype) */
+event alltoallv(const vector_class<void*>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<void*>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                datatype dtype,
+                const communicator& comm,
+                const stream& stream,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Version with per-rank send and receive buffers (untyped; element type is given by dtype), no stream */
+event alltoallv(const vector_class<void*>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<void*>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                datatype dtype,
+                const communicator& comm,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event alltoallv(const BufferType* send_buf,
+                const vector_class<size_t>& send_counts,
+                BufferType* recv_buf,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const stream& stream,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event alltoallv(const BufferType* send_buf,
+                const vector_class<size_t>& send_counts,
+                BufferType* recv_buf,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event alltoallv(const vector_class<BufferType*>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<BufferType*>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const stream& stream,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event alltoallv(const vector_class<BufferType*>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<BufferType*>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event alltoallv(const BufferObjectType& send_buf,
+                const vector_class<size_t>& send_counts,
+                BufferObjectType& recv_buf,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const stream& stream,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event alltoallv(const BufferObjectType& send_buf,
+                const vector_class<size_t>& send_counts,
+                BufferObjectType& recv_buf,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const stream& stream,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const alltoallv_attr& attr = default_alltoallv_attr,
+                const vector_class<event>& deps = {});
+
+/**
+ * Barrier synchronization is performed across all ranks of the communicator
+ * and it is completed only after all the ranks in the communicator have called it.
+ */
+
+/**
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+event barrier(const communicator& comm,
+              const stream& stream,
+              const barrier_attr& attr = default_barrier_attr,
+              const vector_class<event>& deps = {});
+
+event barrier(const communicator& comm,
+              const barrier_attr& attr = default_barrier_attr,
+              const vector_class<event>& deps = {});
+
+/**
+ * Broadcast is a collective communication operation that broadcasts data
+ * from one rank of communicator (denoted as root) to all other ranks.
+ */
+
+/**
+ * @param buf [in,out] the buffer with @c count elements of @c dtype
+ * that serves as the send buffer for root and as the receive buffer for other ranks
+ * @param count the number of elements of type @c dtype in @c buf
+ * @param dtype the datatype of elements in @c buf
+ * @param root the rank that broadcasts @c buf
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+event broadcast(void* buf,
+                size_t count,
+                datatype dtype,
+                int root,
+                const communicator& comm,
+                const stream& stream,
+                const broadcast_attr& attr = default_broadcast_attr,
+                const vector_class<event>& deps = {});
+
+event broadcast(void* buf,
+                size_t count,
+                datatype dtype,
+                int root,
+                const communicator& comm,
+                const broadcast_attr& attr = default_broadcast_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event broadcast(BufferType* buf,
+                size_t count,
+                int root,
+                const communicator& comm,
+                const stream& stream,
+                const broadcast_attr& attr = default_broadcast_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event broadcast(BufferType* buf,
+                size_t count,
+                int root,
+                const communicator& comm,
+                const broadcast_attr& attr = default_broadcast_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event broadcast(BufferObjectType& buf,
+                size_t count,
+                int root,
+                const communicator& comm,
+                const stream& stream,
+                const broadcast_attr& attr = default_broadcast_attr,
+                const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event broadcast(BufferObjectType& buf,
+                size_t count,
+                int root,
+                const communicator& comm,
+                const broadcast_attr& attr = default_broadcast_attr,
+                const vector_class<event>& deps = {});
+
+/**
+ * Reduce is a collective communication operation that performs the global reduction operation
+ * on values from all ranks of the communicator and returns the result to the root rank.
+ */
+
+/**
+ * @param send_buf the buffer with @c count elements of @c dtype that stores local data to be reduced
+ * @param recv_buf [out] the buffer to store reduced result, must have the same dimension as @c send_buf.
+ * Used by the @c root rank only, ignored by other ranks.
+ * @param count the number of elements of type @c dtype in @c send_buf and @c recv_buf
+ * @param dtype the datatype of elements in @c send_buf and @c recv_buf
+ * @param rtype the type of the reduction operation to be applied
+ * @param root the rank that gets the result of reduction
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+event reduce(const void* send_buf,
+             void* recv_buf,
+             size_t count,
+             datatype dtype,
+             reduction rtype,
+             int root,
+             const communicator& comm,
+             const stream& stream,
+             const reduce_attr& attr = default_reduce_attr,
+             const vector_class<event>& deps = {});
+
+event reduce(const void* send_buf,
+             void* recv_buf,
+             size_t count,
+             datatype dtype,
+             reduction rtype,
+             int root,
+             const communicator& comm,
+             const reduce_attr& attr = default_reduce_attr,
+             const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event reduce(const BufferType* send_buf,
+             BufferType* recv_buf,
+             size_t count,
+             reduction rtype,
+             int root,
+             const communicator& comm,
+             const stream& stream,
+             const reduce_attr& attr = default_reduce_attr,
+             const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event reduce(const BufferType* send_buf,
+             BufferType* recv_buf,
+             size_t count,
+             reduction rtype,
+             int root,
+             const communicator& comm,
+             const reduce_attr& attr = default_reduce_attr,
+             const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event reduce(const BufferObjectType& send_buf,
+             BufferObjectType& recv_buf,
+             size_t count,
+             reduction rtype,
+             int root,
+             const communicator& comm,
+             const stream& stream,
+             const reduce_attr& attr = default_reduce_attr,
+             const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event reduce(const BufferObjectType& send_buf,
+             BufferObjectType& recv_buf,
+             size_t count,
+             reduction rtype,
+             int root,
+             const communicator& comm,
+             const reduce_attr& attr = default_reduce_attr,
+             const vector_class<event>& deps = {});
+
+/**
+ * Reduce-scatter is a collective communication operation that performs the global reduction operation
+ * on values from all ranks of the communicator and scatters the result in blocks back to all ranks.
+ */
+
+/**
+ * @param send_buf the buffer with @c comm_size * @c count elements of @c dtype that stores local data to be reduced
+ * @param recv_buf [out] the buffer to store result block containing @c recv_count elements of type @c dtype
+ * @param recv_count the number of elements of type @c dtype in receive block
+ * @param dtype the datatype of elements in @c send_buf and @c recv_buf
+ * @param rtype the type of the reduction operation to be applied
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+event reduce_scatter(const void* send_buf,
+                     void* recv_buf,
+                     size_t recv_count,
+                     datatype dtype,
+                     reduction rtype,
+                     const communicator& comm,
+                     const stream& stream,
+                     const reduce_scatter_attr& attr = default_reduce_scatter_attr,
+                     const vector_class<event>& deps = {});
+
+event reduce_scatter(const void* send_buf,
+                     void* recv_buf,
+                     size_t recv_count,
+                     datatype dtype,
+                     reduction rtype,
+                     const communicator& comm,
+                     const reduce_scatter_attr& attr = default_reduce_scatter_attr,
+                     const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event reduce_scatter(const BufferType* send_buf,
+                     BufferType* recv_buf,
+                     size_t recv_count,
+                     reduction rtype,
+                     const communicator& comm,
+                     const stream& stream,
+                     const reduce_scatter_attr& attr = default_reduce_scatter_attr,
+                     const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferType,
+          class = typename std::enable_if<is_native_type_supported<BufferType>(), event>::type>
+event reduce_scatter(const BufferType* send_buf,
+                     BufferType* recv_buf,
+                     size_t recv_count,
+                     reduction rtype,
+                     const communicator& comm,
+                     const reduce_scatter_attr& attr = default_reduce_scatter_attr,
+                     const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event reduce_scatter(const BufferObjectType& send_buf,
+                     BufferObjectType& recv_buf,
+                     size_t recv_count,
+                     reduction rtype,
+                     const communicator& comm,
+                     const stream& stream,
+                     const reduce_scatter_attr& attr = default_reduce_scatter_attr,
+                     const vector_class<event>& deps = {});
+
+/* Type safety version */
+template <class BufferObjectType,
+          class = typename std::enable_if<is_class_supported<BufferObjectType>(), event>::type>
+event reduce_scatter(const BufferObjectType& send_buf,
+                     BufferObjectType& recv_buf,
+                     size_t recv_count,
+                     reduction rtype,
+                     const communicator& comm,
+                     const reduce_scatter_attr& attr = default_reduce_scatter_attr,
+                     const vector_class<event>& deps = {});
+
+} // namespace v1
+
+namespace preview {
+
+/**
+ * Sparse allreduce is a collective communication operation that performs a global reduction
+ * on sparse buffers from all ranks of the communicator and distributes the result back to
+ * all ranks. Sparse buffers are defined by separate index and value buffers.
+ */
+
+/**
+ * @param send_ind_buf the buffer of indices with @c send_ind_count elements of type @c ind_dtype
+ * @param send_ind_count the number of elements of type @c ind_dtype in @c send_ind_buf
+ * @param send_val_buf the buffer of values with @c send_val_count elements of type @c val_dtype
+ * @param send_val_count the number of elements of type @c val_dtype in @c send_val_buf
+ * @param recv_ind_buf [out] the buffer to store reduced indices, unused
+ * @param recv_ind_count [out] the number of elements in @c recv_ind_buf, unused
+ * @param recv_val_buf [out] the buffer to store reduced values, unused
+ * @param recv_val_count [out] the number of elements in @c recv_val_buf, unused
+ * @param ind_dtype the datatype of elements in @c send_ind_buf and @c recv_ind_buf
+ * @param val_dtype the datatype of elements in @c send_val_buf and @c recv_val_buf
+ * @param rtype the type of the reduction operation to be applied
+ * @param comm the communicator for which the operation will be performed
+ * @param stream a stream associated with the operation
+ * @param attr optional attributes to customize operation
+ * @param deps an optional vector of the events that the operation should depend on
+ * @return @ref ccl::event an object to track the progress of the operation
+ */
+
+ccl::event sparse_allreduce(
+    const void* send_ind_buf,
+    size_t send_ind_count,
+    const void* send_val_buf,
+    size_t send_val_count,
+    void* recv_ind_buf,
+    size_t recv_ind_count,
+    void* recv_val_buf,
+    size_t recv_val_count,
+    ccl::datatype ind_dtype,
+    ccl::datatype val_dtype,
+    ccl::reduction rtype,
+    const ccl::communicator& comm,
+    const ccl::stream& stream,
+    const ccl::sparse_allreduce_attr& attr = ccl::default_sparse_allreduce_attr,
+    const ccl::vector_class<ccl::event>& deps = {});
+
+ccl::event sparse_allreduce(
+    const void* send_ind_buf,
+    size_t send_ind_count,
+    const void* send_val_buf,
+    size_t send_val_count,
+    void* recv_ind_buf,
+    size_t recv_ind_count,
+    void* recv_val_buf,
+    size_t recv_val_count,
+    ccl::datatype ind_dtype,
+    ccl::datatype val_dtype,
+    ccl::reduction rtype,
+    const ccl::communicator& comm,
+    const ccl::sparse_allreduce_attr& attr = ccl::default_sparse_allreduce_attr,
+    const ccl::vector_class<ccl::event>& deps = {});
+
+/* Type safety version */
+template <class IndexBufferType,
+          class ValueBufferType,
+          class = typename std::enable_if<ccl::is_native_type_supported<ValueBufferType>(),
+                                          ccl::event>::type>
+ccl::event sparse_allreduce(
+    const IndexBufferType* send_ind_buf,
+    size_t send_ind_count,
+    const ValueBufferType* send_val_buf,
+    size_t send_val_count,
+    IndexBufferType* recv_ind_buf,
+    size_t recv_ind_count,
+    ValueBufferType* recv_val_buf,
+    size_t recv_val_count,
+    ccl::reduction rtype,
+    const ccl::communicator& comm,
+    const ccl::stream& stream,
+    const ccl::sparse_allreduce_attr& attr = ccl::default_sparse_allreduce_attr,
+    const ccl::vector_class<ccl::event>& deps = {});
+
+/* Type safety version */
+template <class IndexBufferType,
+          class ValueBufferType,
+          class = typename std::enable_if<ccl::is_native_type_supported<ValueBufferType>(),
+                                          ccl::event>::type>
+ccl::event sparse_allreduce(
+    const IndexBufferType* send_ind_buf,
+    size_t send_ind_count,
+    const ValueBufferType* send_val_buf,
+    size_t send_val_count,
+    IndexBufferType* recv_ind_buf,
+    size_t recv_ind_count,
+    ValueBufferType* recv_val_buf,
+    size_t recv_val_count,
+    ccl::reduction rtype,
+    const ccl::communicator& comm,
+    const ccl::sparse_allreduce_attr& attr = ccl::default_sparse_allreduce_attr,
+    const ccl::vector_class<ccl::event>& deps = {});
+
+} // namespace preview
+
+using namespace v1;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_environment.hpp b/include/oneapi/ccl/ccl_environment.hpp
deleted file mode 100644
index 657625550..000000000
--- a/include/oneapi/ccl/ccl_environment.hpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <memory>
-#include <ostream>
-#include <utility>
-#include <vector>
-
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
-
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
-
-#include "oneapi/ccl/ccl_context_attr_ids.hpp"
-#include "oneapi/ccl/ccl_context_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_context.hpp"
-
-#include "oneapi/ccl/ccl_datatype_attr_ids.hpp"
-#include "oneapi/ccl/ccl_datatype_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_datatype_attr.hpp"
-
-#include "oneapi/ccl/ccl_device_attr_ids.hpp"
-#include "oneapi/ccl/ccl_device_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_device.hpp"
-
-#include "oneapi/ccl/ccl_kvs.hpp"
-
-#include "oneapi/ccl/ccl_event.hpp"
-
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_stream.hpp"
-
-#include "oneapi/ccl/ccl_communicator.hpp"
-
-#include "oneapi/ccl/ccl_exception.hpp"
-
-namespace ccl {
-
-/**
- * CCL environment singleton
- */
-class environment {
-public:
-    ~environment();
-
-    /**
-     * Retrieves the unique environment object
-     * and makes the first-time initialization of CCL library
-     */
-    static environment& instance();
-
-    ccl::library_version get_library_version() const;
-
-    template <class... attr_value_pair_t>
-    datatype_attr create_datatype_attr(attr_value_pair_t&&... avps) const {
-        static_assert(sizeof...(avps) > 0, "At least one argument must be specified");
-        auto attr = create_postponed_api_type<datatype_attr>();
-        int expander[]{ (attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
-        (void)expander;
-        return attr;
-    }
-
-    ccl::datatype register_datatype(const ccl::datatype_attr& attr);
-    void deregister_datatype(ccl::datatype dtype);
-    size_t get_datatype_size(ccl::datatype dtype) const;
-
-    shared_ptr_class<kvs> create_main_kvs() const;
-    shared_ptr_class<kvs> create_kvs(const kvs::address_type& addr) const;
-
-    device create_device(empty_t empty) const;
-
-    template <class native_device_type,
-              class = typename std::enable_if<is_device_supported<native_device_type>()>::type>
-    device create_device(native_device_type&& native_device) const;
-
-    template <class... attr_value_pair_t>
-    device create_device_from_attr(typename unified_device_type::ccl_native_t dev,
-                                   attr_value_pair_t&&... avps) const {
-        device str = create_postponed_api_type<device>(dev);
-        int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
-        (void)expander;
-        str.build_from_params();
-        return str;
-    }
-
-    context create_context(empty_t empty) const;
-
-    template <class native_device_contex_type,
-              class = typename std::enable_if<is_device_supported<native_device_contex_type>()>::type>
-    context create_context(native_device_contex_type&& native_device_context) const;
-
-    template <class... attr_value_pair_t>
-    context create_context_from_attr(typename unified_device_context_type::ccl_native_t ctx,
-                                   attr_value_pair_t&&... avps) const {
-        context str = create_postponed_api_type<context>(ctx);
-        int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
-        (void)expander;
-        str.build_from_params();
-        return str;
-    }
-
-    template <class coll_attribute_type, class... attr_value_pair_t>
-    coll_attribute_type create_operation_attr(attr_value_pair_t&&... avps) const {
-        auto op_attr = create_postponed_api_type<coll_attribute_type>();
-        int expander[]{ (op_attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
-        (void)expander;
-        return op_attr;
-    }
-
-    template <class event_type,
-            class = typename std::enable_if<is_event_supported<event_type>()>::type>
-    event create_event(event_type& native_event) {
-        return event::create_from_native(native_event);
-    }
-
-    template <class native_stream_type,
-              class = typename std::enable_if<is_stream_supported<native_stream_type>()>::type>
-    stream create_stream(native_stream_type& native_stream);
-
-    template <class native_stream_type,
-              class native_context_type,
-              class = typename std::enable_if<is_stream_supported<native_stream_type>()>::type>
-    stream create_stream(native_stream_type& native_stream, native_context_type& native_ctx);
-
-    template <class... attr_value_pair_t>
-    stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
-                                   attr_value_pair_t&&... avps) {
-        stream str = create_postponed_api_type<stream>(device);
-        int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
-        (void)expander;
-        str.build_from_params();
-        return str;
-    }
-
-    template <class... attr_value_pair_t>
-    stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
-                                   typename unified_device_context_type::ccl_native_t context,
-                                   attr_value_pair_t&&... avps) {
-        stream str = create_postponed_api_type<stream>(device, context);
-        int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
-        (void)expander;
-        str.build_from_params();
-        return str;
-    }
-
-
-#ifdef CCL_ENABLE_SYCL
-    communicator create_single_device_communicator(
-        size_t comm_size,
-        size_t rank,
-        const cl::sycl::device& device,
-        const cl::sycl::context& context,
-        shared_ptr_class<kvs_interface> kvs) const;
-#endif
-
-    template <class... attr_value_pair_t>
-    comm_split_attr create_comm_split_attr(attr_value_pair_t&&... avps) const {
-        auto split_attr = create_postponed_api_type<comm_split_attr>();
-        int expander[]{ (split_attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
-        (void)expander;
-        return split_attr;
-    }
-
-    communicator create_communicator() const;
-    communicator create_communicator(size_t size, shared_ptr_class<kvs_interface> kvs) const;
-    communicator create_communicator(size_t size,
-                                     size_t rank,
-                                     shared_ptr_class<kvs_interface> kvs) const;
-
-    template <class DeviceType, class ContextType>
-    vector_class<communicator> create_communicators(
-        size_t comm_size,
-        const vector_class<DeviceType>& local_devices,
-        ContextType& context,
-        shared_ptr_class<kvs_interface> kvs) const;
-
-    template <class DeviceType, class ContextType>
-    vector_class<communicator> create_communicators(
-        size_t comm_size,
-        const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-        ContextType& context,
-        shared_ptr_class<kvs_interface> kvs) const;
-
-    template <class DeviceType, class ContextType>
-    vector_class<communicator> create_communicators(
-        size_t comm_size,
-        const map_class<rank_t, DeviceType>& local_rank_device_map,
-        ContextType& context,
-        shared_ptr_class<kvs_interface> kvs) const;
-
-    vector_class<communicator> split_device_communicators(
-        const vector_class<pair_class<communicator, comm_split_attr>>& attrs) const;
-
-private:
-    environment();
-
-    template <class ccl_api_type, class... args_type>
-    ccl_api_type create_postponed_api_type(args_type... args) const;
-};
-
-} /* ccl */
diff --git a/include/oneapi/ccl/ccl_type_traits.hpp b/include/oneapi/ccl/ccl_type_traits.hpp
deleted file mode 100644
index 0964f66ec..000000000
--- a/include/oneapi/ccl/ccl_type_traits.hpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#ifndef TRAITS_H_
-#define TRAITS_H_
-
-#include <tuple>
-#include <type_traits>
-
-#ifdef CCL_ENABLE_SYCL
-#include <CL/sycl.hpp>
-#endif
-
-#include "oneapi/ccl/ccl_types.hpp"
-
-namespace ccl {
-/**
- * Base type-trait helpers for "unknown" types
- */
-template <ccl::datatype ccl_type_id>
-struct type_info {
-    static constexpr bool is_supported = false;
-    static constexpr bool is_class = false;
-};
-
-template <class type>
-struct native_type_info {
-    static constexpr bool is_supported = false;
-    static constexpr bool is_class = false;
-};
-
-#define CCL_TYPE_TRAITS(ccl_type_id, dtype, dtype_size) \
-    template <> \
-    struct type_info<ccl_type_id> \
-            : public ccl_type_info_export<dtype, dtype_size, ccl_type_id, false, true> { \
-        static constexpr const char* name() { \
-            return #dtype; \
-        } \
-    }; \
-    template <> \
-    struct native_type_info<dtype> : public type_info<ccl_type_id> {};
-
-#define CCL_CLASS_TYPE_TRAITS(ccl_type_id, dtype, sizeof_dtype) \
-    template <> \
-    struct native_type_info<dtype> \
-            : public ccl_type_info_export<dtype, sizeof_dtype, ccl_type_id, true, true> { \
-        static constexpr const char* name() { \
-            return #dtype; \
-        } \
-    };
-
-#define COMMA ,
-
-/*struct bf16_impl
-{
-    uint16_t data;
-} __attribute__((packed));*/
-
-using bf16 = uint16_t;
-
-/**
- * Enumeration of supported CCL API data types
- */
-CCL_TYPE_TRAITS(ccl::datatype::int8, char, sizeof(char))
-CCL_TYPE_TRAITS(ccl::datatype::int32, int, sizeof(int))
-CCL_TYPE_TRAITS(ccl::datatype::bfloat16, bf16, sizeof(bf16))
-CCL_TYPE_TRAITS(ccl::datatype::float32, float, sizeof(float))
-CCL_TYPE_TRAITS(ccl::datatype::float64, double, sizeof(double))
-CCL_TYPE_TRAITS(ccl::datatype::int64, int64_t, sizeof(int64_t))
-CCL_TYPE_TRAITS(ccl::datatype::uint64, uint64_t, sizeof(uint64_t))
-
-#ifdef CCL_ENABLE_SYCL
-CCL_CLASS_TYPE_TRAITS(ccl::datatype::int8, cl::sycl::buffer<char COMMA 1>, sizeof(char))
-CCL_CLASS_TYPE_TRAITS(ccl::datatype::int32, cl::sycl::buffer<int COMMA 1>, sizeof(int))
-CCL_CLASS_TYPE_TRAITS(ccl::datatype::bfloat16, cl::sycl::buffer<bf16 COMMA 1>, sizeof(bf16))
-CCL_CLASS_TYPE_TRAITS(ccl::datatype::int64, cl::sycl::buffer<int64_t COMMA 1>, sizeof(int64_t))
-CCL_CLASS_TYPE_TRAITS(ccl::datatype::uint64, cl::sycl::buffer<uint64_t COMMA 1>, sizeof(uint64_t))
-CCL_CLASS_TYPE_TRAITS(ccl::datatype::float32, cl::sycl::buffer<float COMMA 1>, sizeof(float))
-CCL_CLASS_TYPE_TRAITS(ccl::datatype::float64, cl::sycl::buffer<double COMMA 1>, sizeof(double))
-#endif //CCL_ENABLE_SYCL
-
-/**
- * Checks for supporting @c type in ccl API
- */
-template <class type>
-constexpr bool is_supported() {
-    using clear_type = typename std::remove_pointer<type>::type;
-    //    static_assert(native_type_info<clear_type>::is_supported, "type is not supported by ccl API");
-    return native_type_info<clear_type>::is_supported;
-}
-
-/**
- * Checks is @c type a class
- */
-template <class type>
-constexpr bool is_class() {
-    using clear_type = typename std::remove_pointer<type>::type;
-    return native_type_info<clear_type>::is_class;
-}
-
-/**
- * SFINAE checks for supporting native type @c type in ccl API
- */
-template <class type>
-constexpr bool is_native_type_supported() {
-    return (not is_class<type>() and is_supported<type>());
-}
-
-/**
-  * SFINAE checks for supporting class @c type in ccl API
-  */
-template <class type>
-constexpr bool is_class_supported() {
-    return (is_class<type>() and is_supported<type>());
-}
-
-} // namespace ccl
-#include "oneapi/ccl/ccl_device_type_traits.hpp"
-#endif //TRAITS_H_
diff --git a/include/oneapi/ccl/ccl_types.hpp b/include/oneapi/ccl/ccl_types.hpp
deleted file mode 100644
index 5e8032ab8..000000000
--- a/include/oneapi/ccl/ccl_types.hpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include <stdint.h>
-#include <stdlib.h>
-#include "oneapi/ccl/ccl_config.h"
-
-#include <bitset>
-#include <iostream>
-#include <limits>
-#include <map>
-#include <memory>
-#include <set>
-#include <stdexcept>
-#include <vector>
-
-#include "oneapi/ccl/ccl_aliases.hpp"
-#include "oneapi/ccl/ccl_exception.hpp"
-
-// TODO: tmp enums, refactor core code and remove them
-/************************************************/
-typedef int ccl_status_t;
-#define ccl_status_success               (0)
-#define ccl_status_out_of_resource       (1)
-#define ccl_status_invalid_arguments     (2)
-#define ccl_status_unimplemented         (3)
-#define ccl_status_runtime_error         (4)
-#define ccl_status_blocked_due_to_resize (5)
-#define ccl_status_last_value            (6)
-
-/** Resize action types. */
-typedef enum ccl_resize_action {
-    /* Wait additional changes for number of ranks */
-    ccl_ra_wait = 0,
-    /* Run with current number of ranks */
-    ccl_ra_run = 1,
-    /* Finalize work */
-    ccl_ra_finalize = 2,
-} ccl_resize_action_t;
-
-/* comm_size */
-typedef ccl_resize_action_t (*ccl_resize_fn_t)(size_t comm_size);
-
-/** Stream types. */
-typedef enum {
-    ccl_stream_host = 0,
-    ccl_stream_cpu = 1,
-    ccl_stream_gpu = 2,
-
-    ccl_stream_last_value
-} ccl_stream_type_t;
-/************************************************/
-
-namespace ccl {
-
-/** Library version description. */
-typedef struct {
-    unsigned int major;
-    unsigned int minor;
-    unsigned int update;
-    const char* product_status;
-    const char* build_date;
-    const char* full;
-} library_version;
-
-/**
- * Supported reduction operations
- */
-enum class reduction : int {
-    sum = 0,
-    prod,
-    min,
-    max,
-    custom,
-
-    last_value
-};
-
-/**
- * Supported datatypes
- */
-enum class datatype : int {
-    int8 = 0,
-    uint8,
-    int16,
-    uint16,
-    int32,
-    uint32,
-    int64,
-    uint64,
-
-    float16,
-    float32,
-    float64,
-
-    bfloat16,
-
-    last_predefined = bfloat16
-};
-
-string_class to_string(const ccl::datatype& dt);
-
-inline std::ostream& operator<<(std::ostream& os, const ccl::datatype& dt) {
-    os << ccl::to_string(dt);
-    return os;
-}
-
-typedef struct {
-    const char* match_id;
-    const size_t offset;
-} fn_context;
-
-/* Sparse coalesce modes */
-/* Use this variable to set sparse_allreduce coalescing mode:
-   ccl_sparse_coalesce_regular run regular coalesce funtion;
-   ccl_sparse_coalesce_disable disables coalesce function in sparse_allreduce,
-                               allgathered data is returned;
-   ccl_sparse_coalesce_keep_precision on every local reduce bf16 data is
-                               converted to fp32, reduced and then converted
-                               back to bf16.
-*/
-
-enum class sparse_coalesce_mode : int { regular = 0, disable = 1, keep_precision = 2 };
-
-/* comm_size */
-typedef ccl_resize_action_t (*ccl_resize_fn_t)(size_t comm_size);
-
-/* in_buf, in_count, in_dtype, out_buf, out_count, out_dtype, context */
-typedef void (*prologue_fn)(const void*,
-                            size_t,
-                            ccl::datatype,
-                            void**,
-                            size_t*,
-                            ccl::datatype*,
-                            const ccl::fn_context*);
-
-/* in_buf, in_count, in_dtype, out_buf, out_count, out_dtype, context */
-typedef void (*epilogue_fn)(const void*,
-                            size_t,
-                            ccl::datatype,
-                            void*,
-                            size_t*,
-                            ccl::datatype,
-                            const ccl::fn_context*);
-
-/* in_buf, in_count, inout_buf, out_count, dtype, context */
-typedef void (
-    *reduction_fn)(const void*, size_t, void*, size_t*, ccl::datatype, const ccl::fn_context*);
-
-/* idx_buf, idx_count, idx_dtype, val_buf, val_count, val_dtype, user_context */
-typedef void (*sparse_allreduce_completion_fn)(const void*,
-                                               size_t,
-                                               ccl::datatype,
-                                               const void*,
-                                               size_t,
-                                               ccl::datatype,
-                                               const void*);
-
-/* idx_count, idx_dtype, val_count, val_dtype, user_context, out_idx_buf, out_val_buf */
-typedef void (*sparse_allreduce_alloc_fn)(size_t,
-                                          ccl::datatype,
-                                          size_t,
-                                          ccl::datatype,
-                                          const void*,
-                                          void**,
-                                          void**);
-
-// using datatype_attr_t = ccl_datatype_attr_t;
-
-/**
- *  Supported CL backend types
- */
-enum class cl_backend_type : int {
-    empty_backend = 0x0,
-    dpcpp_sycl = 0x1,
-    l0 = 0x2,
-    dpcpp_sycl_l0 = 0x3,
-
-    last_value
-};
-/**
- * Supported stream types
- */
-enum class stream_type : int {
-    host = 0,
-    cpu,
-    gpu,
-
-    last_value
-};
-
-/**
- * Type traits, which describes how-to types would be interpretered by ccl API
- */
-template <class ntype_t,
-          size_t size_of_type,
-          ccl::datatype ccl_type_v,
-          bool iclass = false,
-          bool supported = false>
-struct ccl_type_info_export {
-    using native_type = ntype_t;
-    using ccl_type = std::integral_constant<ccl::datatype, ccl_type_v>;
-    static constexpr size_t size = size_of_type;
-    static constexpr ccl::datatype ccl_type_value = ccl_type::value;
-    static constexpr datatype ccl_datatype_value = static_cast<datatype>(ccl_type_value);
-    static constexpr bool is_class = iclass;
-    static constexpr bool is_supported = supported;
-};
-
-struct ccl_empty_attr {
-    static ccl::library_version version;
-
-    template <class attr>
-    static attr create_empty();
-};
-
-/**
- * API object attributes traits
- */
-namespace info {
-template <class param_type, param_type value>
-struct param_traits {};
-
-} //namespace info
-} // namespace ccl
-
-// TODO: tmp struct, refactor core code and remove it
-/*********************************************************/
-
-/** Extendable list of collective attributes. */
-typedef struct {
-    /**
-     * Callbacks into application code
-     * for pre-/post-processing data
-     * and custom reduction operation
-     */
-    ccl::prologue_fn prologue_fn;
-    ccl::epilogue_fn epilogue_fn;
-    ccl::reduction_fn reduction_fn;
-
-    /* Priority for collective operation */
-    size_t priority;
-
-    /* Blocking/non-blocking */
-    int synchronous;
-
-    /* Persistent/non-persistent */
-    int to_cache;
-
-    /* Treat buffer as vector/regular - applicable for allgatherv only */
-    int vector_buf;
-
-    /**
-     * Id of the operation. If specified, new communicator will be created and collective
-     * operations with the same @b match_id will be executed in the same order.
-     */
-    const char* match_id;
-
-    /* Sparse allreduce specific */
-    ccl::sparse_allreduce_completion_fn sparse_allreduce_completion_fn;
-    ccl::sparse_allreduce_alloc_fn sparse_allreduce_alloc_fn;
-    const void* sparse_allreduce_fn_ctx;
-    ccl::sparse_coalesce_mode sparse_coalesce_mode;
-
-} ccl_coll_attr_t;
-
-#include "oneapi/ccl/ccl_device_types.hpp"
diff --git a/include/oneapi/ccl/ccl_coll_attr.hpp b/include/oneapi/ccl/coll_attr.hpp
similarity index 67%
rename from include/oneapi/ccl/ccl_coll_attr.hpp
rename to include/oneapi/ccl/coll_attr.hpp
index 16c0ee155..30d843b45 100644
--- a/include/oneapi/ccl/ccl_coll_attr.hpp
+++ b/include/oneapi/ccl/coll_attr.hpp
@@ -20,6 +20,9 @@
 #endif
 
 namespace ccl {
+namespace detail {
+class environment;
+}
 
 class ccl_allgatherv_attr_impl_t;
 class ccl_allreduce_attr_impl_t;
@@ -31,6 +34,8 @@ class ccl_reduce_attr_impl_t;
 class ccl_reduce_scatter_attr_impl_t;
 class ccl_sparse_allreduce_attr_impl_t;
 
+namespace v1 {
+
 struct ccl_empty_attr;
 
 /**
@@ -65,30 +70,29 @@ class allgatherv_attr : public ccl_api_base_copyable<allgatherv_attr,
     template <allgatherv_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
     /**
      * Get specific attribute value by @attrId
      */
     template <allgatherv_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>::return_type& get()
         const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
     allgatherv_attr(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 /**
@@ -123,31 +127,30 @@ class allreduce_attr : public ccl_api_base_copyable<allreduce_attr,
     template <allreduce_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<allreduce_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<allreduce_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
 
     /**
      * Get specific attribute value by @attrId
      */
     template <allreduce_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<allreduce_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<allreduce_attr_id, attrId>::return_type& get()
         const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
     allreduce_attr(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 /**
@@ -181,31 +184,30 @@ class alltoall_attr : public ccl_api_base_copyable<alltoall_attr,
     template <alltoall_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<alltoall_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<alltoall_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
 
     /**
      * Get specific attribute value by @attrId
      */
     template <alltoall_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<alltoall_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<alltoall_attr_id, attrId>::return_type& get()
         const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
     alltoall_attr(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 /**
@@ -240,31 +242,30 @@ class alltoallv_attr : public ccl_api_base_copyable<alltoallv_attr,
     template <alltoallv_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
 
     /**
      * Get specific attribute value by @attrId
      */
     template <alltoallv_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>::return_type& get()
         const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
     alltoallv_attr(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 /**
@@ -298,30 +299,30 @@ class barrier_attr : public ccl_api_base_copyable<barrier_attr,
     template <barrier_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<barrier_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<barrier_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
 
     /**
      * Get specific attribute value by @attrId
      */
     template <barrier_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<barrier_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<barrier_attr_id, attrId>::return_type& get()
         const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
-    barrier_attr(const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                                  operation_attr_id::version>::type&
-                     version);
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
+    barrier_attr(
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
     ;
 };
 
@@ -357,31 +358,30 @@ class broadcast_attr : public ccl_api_base_copyable<broadcast_attr,
     template <broadcast_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<broadcast_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<broadcast_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
 
     /**
      * Get specific attribute value by @attrId
      */
     template <broadcast_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<broadcast_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<broadcast_attr_id, attrId>::return_type& get()
         const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
     broadcast_attr(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 /**
@@ -415,30 +415,30 @@ class reduce_attr : public ccl_api_base_copyable<reduce_attr,
     template <reduce_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<reduce_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<reduce_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
 
     /**
      * Get specific attribute value by @attrId
      */
     template <reduce_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<reduce_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<reduce_attr_id, attrId>::return_type& get()
         const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
-    reduce_attr(const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                                 operation_attr_id::version>::type&
-                    version);
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
+    reduce_attr(
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 /**
@@ -473,31 +473,30 @@ class reduce_scatter_attr : public ccl_api_base_copyable<reduce_scatter_attr,
     template <reduce_scatter_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
 
     /**
      * Get specific attribute value by @attrId
      */
     template <reduce_scatter_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>::return_type&
+    const typename detail::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>::return_type&
     get() const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
     reduce_scatter_attr(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
     ;
 };
 
@@ -533,31 +532,30 @@ class sparse_allreduce_attr : public ccl_api_base_copyable<sparse_allreduce_attr
     template <sparse_allreduce_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>::return_type set(const Value& v);
 
     template <operation_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type set(const Value& v);
 
     /**
      * Get specific attribute value by @attrId
      */
     template <sparse_allreduce_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>::return_type&
+    const typename detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>::return_type&
     get() const;
 
     template <operation_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type& get()
         const;
 
 private:
-    friend class environment;
-    friend struct ccl_empty_attr;
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
     sparse_allreduce_attr(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
     ;
 };
 
@@ -579,59 +577,82 @@ extern barrier_attr default_barrier_attr;
  */
 template <allgatherv_attr_id t, class value_type>
 constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<allgatherv_attr_id, t, value_type> {
-    return details::attr_value_tripple<allgatherv_attr_id, t, value_type>(v);
+    -> detail::attr_value_triple<allgatherv_attr_id, t, value_type> {
+    return detail::attr_value_triple<allgatherv_attr_id, t, value_type>(v);
 }
 
 template <allreduce_attr_id t, class value_type>
 constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<allreduce_attr_id, t, value_type> {
-    return details::attr_value_tripple<allreduce_attr_id, t, value_type>(v);
+    -> detail::attr_value_triple<allreduce_attr_id, t, value_type> {
+    return detail::attr_value_triple<allreduce_attr_id, t, value_type>(v);
 }
 
 template <alltoall_attr_id t, class value_type>
 constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<alltoall_attr_id, t, value_type> {
-    return details::attr_value_tripple<alltoall_attr_id, t, value_type>(v);
+    -> detail::attr_value_triple<alltoall_attr_id, t, value_type> {
+    return detail::attr_value_triple<alltoall_attr_id, t, value_type>(v);
 }
 
 template <alltoallv_attr_id t, class value_type>
 constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<alltoallv_attr_id, t, value_type> {
-    return details::attr_value_tripple<alltoallv_attr_id, t, value_type>(v);
+    -> detail::attr_value_triple<alltoallv_attr_id, t, value_type> {
+    return detail::attr_value_triple<alltoallv_attr_id, t, value_type>(v);
 }
 
 template <broadcast_attr_id t, class value_type>
 constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<broadcast_attr_id, t, value_type> {
-    return details::attr_value_tripple<broadcast_attr_id, t, value_type>(v);
+    -> detail::attr_value_triple<broadcast_attr_id, t, value_type> {
+    return detail::attr_value_triple<broadcast_attr_id, t, value_type>(v);
 }
 
 template <reduce_attr_id t, class value_type>
-constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<reduce_attr_id, t, value_type> {
-    return details::attr_value_tripple<reduce_attr_id, t, value_type>(v);
+constexpr auto attr_val(value_type v) -> detail::attr_value_triple<reduce_attr_id, t, value_type> {
+    return detail::attr_value_triple<reduce_attr_id, t, value_type>(v);
 }
 
 template <reduce_scatter_attr_id t, class value_type>
 constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<reduce_scatter_attr_id, t, value_type> {
-    return details::attr_value_tripple<reduce_scatter_attr_id, t, value_type>(v);
+    -> detail::attr_value_triple<reduce_scatter_attr_id, t, value_type> {
+    return detail::attr_value_triple<reduce_scatter_attr_id, t, value_type>(v);
 }
 
 template <sparse_allreduce_attr_id t, class value_type>
 constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<sparse_allreduce_attr_id, t, value_type> {
-    return details::attr_value_tripple<sparse_allreduce_attr_id, t, value_type>(v);
+    -> detail::attr_value_triple<sparse_allreduce_attr_id, t, value_type> {
+    return detail::attr_value_triple<sparse_allreduce_attr_id, t, value_type>(v);
 }
 
 template <operation_attr_id t, class value_type>
 constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<operation_attr_id, t, value_type> {
-    return details::attr_value_tripple<operation_attr_id, t, value_type>(v);
+    -> detail::attr_value_triple<operation_attr_id, t, value_type> {
+    return detail::attr_value_triple<operation_attr_id, t, value_type>(v);
 }
 
-/* TODO temporary function for UT compilation: would be part of ccl::environment in final*/
+/* TODO: temporary function for UT compilation; it should eventually become part of detail::environment */
 template <class coll_attribute_type, class... attr_value_pair_t>
 coll_attribute_type create_coll_attr(attr_value_pair_t&&... avps);
+
+} // namespace v1
+
+using v1::allgatherv_attr;
+using v1::allreduce_attr;
+using v1::alltoall_attr;
+using v1::alltoallv_attr;
+using v1::broadcast_attr;
+using v1::reduce_attr;
+using v1::reduce_scatter_attr;
+using v1::sparse_allreduce_attr;
+using v1::barrier_attr;
+using v1::attr_val;
+
+using v1::default_allgatherv_attr;
+using v1::default_allreduce_attr;
+using v1::default_alltoall_attr;
+using v1::default_alltoallv_attr;
+using v1::default_broadcast_attr;
+using v1::default_reduce_attr;
+using v1::default_reduce_scatter_attr;
+using v1::default_sparse_allreduce_attr;
+using v1::default_barrier_attr;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_coll_attr_ids.hpp b/include/oneapi/ccl/coll_attr_ids.hpp
similarity index 89%
rename from include/oneapi/ccl/ccl_coll_attr_ids.hpp
rename to include/oneapi/ccl/coll_attr_ids.hpp
index 149561f64..036ab68bb 100644
--- a/include/oneapi/ccl/ccl_coll_attr_ids.hpp
+++ b/include/oneapi/ccl/coll_attr_ids.hpp
@@ -21,6 +21,8 @@
 
 namespace ccl {
 
+namespace v1 {
+
 /**
  * Common operation attributes id
  */
@@ -32,9 +34,6 @@ enum class operation_attr_id : int {
     synchronous,
     match_id,
 
-    prologue_fn,
-    epilogue_fn,
-
     last_value
 };
 
@@ -110,4 +109,18 @@ enum class barrier_attr_id : int {
 
     last_value
 };
+
+} // namespace v1
+
+using v1::operation_attr_id;
+using v1::allgatherv_attr_id;
+using v1::allreduce_attr_id;
+using v1::alltoall_attr_id;
+using v1::alltoallv_attr_id;
+using v1::broadcast_attr_id;
+using v1::reduce_attr_id;
+using v1::reduce_scatter_attr_id;
+using v1::sparse_allreduce_attr_id;
+using v1::barrier_attr_id;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_coll_attr_ids_traits.hpp b/include/oneapi/ccl/coll_attr_ids_traits.hpp
similarity index 89%
rename from include/oneapi/ccl/ccl_coll_attr_ids_traits.hpp
rename to include/oneapi/ccl/coll_attr_ids_traits.hpp
index 883e66db7..d3d5df484 100644
--- a/include/oneapi/ccl/ccl_coll_attr_ids_traits.hpp
+++ b/include/oneapi/ccl/coll_attr_ids_traits.hpp
@@ -21,7 +21,7 @@
 
 namespace ccl {
 
-namespace details {
+namespace detail {
 template <class T>
 class function_holder {
 public:
@@ -42,18 +42,6 @@ struct ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::version> {
     using return_type = type;
 };
 
-template <>
-struct ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::prologue_fn> {
-    using type = ccl::prologue_fn;
-    using return_type = function_holder<type>;
-};
-
-template <>
-struct ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::epilogue_fn> {
-    using type = ccl::epilogue_fn;
-    using return_type = function_holder<type>;
-};
-
 template <>
 struct ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::priority> {
     using type = size_t;
@@ -144,5 +132,6 @@ struct ccl_api_type_attr_traits<sparse_allreduce_attr_id, sparse_allreduce_attr_
 /**
  * Traits specialization for barrier op attributes
  */
-} // namespace details
+} // namespace detail
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/comm_attr.hpp b/include/oneapi/ccl/comm_attr.hpp
new file mode 100644
index 000000000..20cd98dd2
--- /dev/null
+++ b/include/oneapi/ccl/comm_attr.hpp
@@ -0,0 +1,98 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+namespace detail {
+class environment;
+}
+
+class ccl_comm_attr_impl;
+
+namespace v1 {
+
+struct ccl_empty_attr;
+
+/**
+ * Communicator attributes
+ */
+class comm_attr
+        : public ccl_api_base_copyable<comm_attr, copy_on_write_access_policy, ccl_comm_attr_impl> {
+public:
+    using base_t =
+        ccl_api_base_copyable<comm_attr, copy_on_write_access_policy, ccl_comm_attr_impl>;
+
+    /**
+     * Declare PIMPL type
+     */
+    using impl_value_t = typename base_t::impl_value_t;
+
+    /**
+     * Declare implementation type
+     */
+    using impl_t = typename impl_value_t::element_type;
+
+    comm_attr& operator=(const comm_attr& src);
+    comm_attr& operator=(comm_attr&& src);
+    comm_attr(comm_attr&& src);
+    comm_attr(const comm_attr& src);
+    comm_attr(ccl_empty_attr);
+    ~comm_attr() noexcept;
+
+    /**
+     * Set a specific value for this object's attribute identified by @attrId.
+     * The previous attribute value is returned.
+     */
+    template <comm_attr_id attrId,
+        class Value/*,
+              class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
+    Value set(const Value& v);
+
+    /**
+     * Get specific attribute value by @attrId
+     */
+    template <comm_attr_id attrId>
+    const typename detail::ccl_api_type_attr_traits<comm_attr_id, attrId>::type& get() const;
+
+    template <comm_attr_id attrId>
+    bool is_valid() const noexcept;
+
+private:
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
+
+    comm_attr(const typename detail::ccl_api_type_attr_traits<comm_attr_id,
+                                                              comm_attr_id::version>::return_type&
+                  version);
+};
+extern comm_attr default_comm_attr;
+
+template <comm_attr_id t, class value_type>
+constexpr auto attr_val(value_type v) -> detail::attr_value_triple<comm_attr_id, t, value_type> {
+    return detail::attr_value_triple<comm_attr_id, t, value_type>(v);
+}
+
+} // namespace v1
+
+using v1::comm_attr;
+using v1::default_comm_attr;
+using v1::attr_val;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/comm_attr_ids.hpp b/include/oneapi/ccl/comm_attr_ids.hpp
new file mode 100644
index 000000000..3b926f3fa
--- /dev/null
+++ b/include/oneapi/ccl/comm_attr_ids.hpp
@@ -0,0 +1,36 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace v1 {
+
+enum class comm_attr_id : int {
+    version,
+
+    last_value
+};
+
+} // namespace v1
+
+using v1::comm_attr_id;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/comm_attr_ids_traits.hpp b/include/oneapi/ccl/comm_attr_ids_traits.hpp
new file mode 100644
index 000000000..9fec8ab62
--- /dev/null
+++ b/include/oneapi/ccl/comm_attr_ids_traits.hpp
@@ -0,0 +1,34 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace detail {
+
+template <>
+struct ccl_api_type_attr_traits<comm_attr_id, comm_attr_id::version> {
+    using type = ccl::library_version;
+    using return_type = type;
+};
+
+} // namespace detail
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_comm_split_attr.hpp b/include/oneapi/ccl/comm_split_attr.hpp
similarity index 78%
rename from include/oneapi/ccl/ccl_comm_split_attr.hpp
rename to include/oneapi/ccl/comm_split_attr.hpp
index b84236635..a7c50b66c 100644
--- a/include/oneapi/ccl/ccl_comm_split_attr.hpp
+++ b/include/oneapi/ccl/comm_split_attr.hpp
@@ -13,74 +13,96 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#ifndef CCL_PRODUCT_FULL
-#error "Do not include this file directly. Please include 'ccl.hpp'"
-#endif
-
-namespace ccl {
-
-class ccl_comm_split_attr_impl;
-struct ccl_empty_attr;
-
-/**
- * Device attributes
- */
-class comm_split_attr : public ccl_api_base_copyable<comm_split_attr,
-                                                     copy_on_write_access_policy,
-                                                     ccl_comm_split_attr_impl> {
-public:
-    using base_t = ccl_api_base_copyable<comm_split_attr,
-                                         copy_on_write_access_policy,
-                                         ccl_comm_split_attr_impl>;
-
-    /**
-     * Declare PIMPL type
-     */
-    using impl_value_t = typename base_t::impl_value_t;
-
-    /**
-     * Declare implementation type
-     */
-    using impl_t = typename impl_value_t::element_type;
-
-    comm_split_attr& operator=(const comm_split_attr& src);
-    comm_split_attr& operator=(comm_split_attr&& src);
-    comm_split_attr(comm_split_attr&& src);
-    comm_split_attr(const comm_split_attr& src);
-    comm_split_attr(ccl_empty_attr);
-    ~comm_split_attr() noexcept;
-
-    /**
-     * Set specific value for selft attribute by @attrId.
-     * Previous attibute value would be returned
-     */
-    template <comm_split_attr_id attrId,
-              class Value/*,
-              class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    Value set(const Value& v);
-
-    /**
-     * Get specific attribute value by @attrId
-     */
-    template <comm_split_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<comm_split_attr_id, attrId>::type& get() const;
-
-    template <comm_split_attr_id attrId>
-    bool is_valid() const noexcept;
-
-private:
-    friend class environment;
-    comm_split_attr(
-        const typename details::ccl_api_type_attr_traits<comm_split_attr_id,
-                                                        comm_split_attr_id::version>::type&
-            version);
-};
-
-template <comm_split_attr_id t, class value_type>
-constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<comm_split_attr_id, t, value_type> {
-    return details::attr_value_tripple<comm_split_attr_id, t, value_type>(v);
-}
-} // namespace ccl
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+namespace detail {
+class environment;
+}
+
+class ccl_comm_split_attr_impl;
+
+namespace v1 {
+
+struct ccl_empty_attr;
+
+/**
+ * Attributes accepted by communicator::split(); individual values are keyed by comm_split_attr_id
+ */
+class comm_split_attr : public ccl_api_base_copyable<comm_split_attr,
+                                                     copy_on_write_access_policy,
+                                                     ccl_comm_split_attr_impl> {
+public:
+    using base_t = ccl_api_base_copyable<comm_split_attr,
+                                         copy_on_write_access_policy,
+                                         ccl_comm_split_attr_impl>;
+
+    /**
+     * PIMPL holder type, taken from the ccl_api_base_copyable base
+     */
+    using impl_value_t = typename base_t::impl_value_t;
+
+    /**
+     * Implementation type held by impl_value_t (its element_type)
+     */
+    using impl_t = typename impl_value_t::element_type;
+
+    comm_split_attr& operator=(const comm_split_attr& src);
+    comm_split_attr& operator=(comm_split_attr&& src);
+    comm_split_attr(comm_split_attr&& src);
+    comm_split_attr(const comm_split_attr& src);
+    comm_split_attr(ccl_empty_attr);
+    ~comm_split_attr() noexcept;
+
+    /**
+     * Set the value of the attribute selected by @attrId.
+     * The previous attribute value is returned.
+     */
+    template <comm_split_attr_id attrId,
+              class Value/*,
+              class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
+    Value set(const Value& v);
+
+    /**
+     * Get the value of the attribute selected by @attrId (const reference to its traits type)
+     */
+    template <comm_split_attr_id attrId>
+    const typename detail::ccl_api_type_attr_traits<comm_split_attr_id, attrId>::type& get() const;
+
+    template <comm_split_attr_id attrId>
+    bool is_valid() const noexcept;
+
+private:
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr; // NOTE(review): forward-declared as v1::ccl_empty_attr; confirm
+    comm_split_attr(
+        const typename detail::ccl_api_type_attr_traits<comm_split_attr_id,
+                                                        comm_split_attr_id::version>::type&
+            version);
+};
+
+/**
+ * Extern declaration of the default (empty) comm_split_attr object; no definition in this header
+ */
+extern comm_split_attr default_comm_split_attr;
+
+/**
+ * Factory helper: wraps @c v in a detail::attr_value_triple tagged with comm_split_attr_id @c t
+ */
+template <comm_split_attr_id t, class value_type>
+constexpr auto attr_val(value_type v)
+    -> detail::attr_value_triple<comm_split_attr_id, t, value_type> {
+    return detail::attr_value_triple<comm_split_attr_id, t, value_type>(v);
+}
+
+} // namespace v1
+
+using v1::comm_split_attr;
+using v1::default_comm_split_attr;
+using v1::attr_val;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/comm_split_attr_ids.hpp b/include/oneapi/ccl/comm_split_attr_ids.hpp
new file mode 100644
index 000000000..934b83630
--- /dev/null
+++ b/include/oneapi/ccl/comm_split_attr_ids.hpp
@@ -0,0 +1,46 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace v1 {
+
+enum class comm_split_attr_id : int {
+    version, // value type: ccl::library_version (per ccl_api_type_attr_traits)
+
+    color, // value type: int (per ccl_api_type_attr_traits)
+    group, // value type: split_group (per ccl_api_type_attr_traits)
+
+    last_value // NOTE(review): presumably an end-of-enum marker, not a real attribute; confirm
+};
+
+enum class split_group : int { // value type of comm_split_attr_id::group
+    cluster,
+
+    last_value // NOTE(review): presumably an end-of-enum marker, not a selectable group; confirm
+};
+
+} // namespace v1
+
+using v1::comm_split_attr_id;
+using v1::split_group;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp b/include/oneapi/ccl/comm_split_attr_ids_traits.hpp
similarity index 91%
rename from include/oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp
rename to include/oneapi/ccl/comm_split_attr_ids_traits.hpp
index ac4c630e9..3603aeb27 100644
--- a/include/oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp
+++ b/include/oneapi/ccl/comm_split_attr_ids_traits.hpp
@@ -13,31 +13,31 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#ifndef CCL_PRODUCT_FULL
-#error "Do not include this file directly. Please include 'ccl.hpp'"
-#endif
-
-namespace ccl {
-
-namespace details {
-
-template <>
-struct ccl_api_type_attr_traits<comm_split_attr_id, comm_split_attr_id::version> {
-    using type = ccl::library_version;
-};
-
-template <>
-struct ccl_api_type_attr_traits<comm_split_attr_id, comm_split_attr_id::color> {
-    using type = int;
-};
-
-template <>
-struct ccl_api_type_attr_traits<comm_split_attr_id, comm_split_attr_id::group> {
-    using type = group_split_type;
-};
-
-} // namespace details
-
-} // namespace ccl
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace detail {
+
+template <> // comm_split_attr_id::version maps to ccl::library_version
+struct ccl_api_type_attr_traits<comm_split_attr_id, comm_split_attr_id::version> {
+    using type = ccl::library_version; // only `type` is declared; comm_split_attr::get() uses it
+};
+
+template <> // comm_split_attr_id::color maps to int
+struct ccl_api_type_attr_traits<comm_split_attr_id, comm_split_attr_id::color> {
+    using type = int;
+};
+
+template <> // comm_split_attr_id::group maps to the split_group enum
+struct ccl_api_type_attr_traits<comm_split_attr_id, comm_split_attr_id::group> {
+    using type = split_group;
+};
+
+} // namespace detail
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_communicator.hpp b/include/oneapi/ccl/communicator.hpp
similarity index 63%
rename from include/oneapi/ccl/ccl_communicator.hpp
rename to include/oneapi/ccl/communicator.hpp
index 1c1f1af47..d6ee6e477 100644
--- a/include/oneapi/ccl/ccl_communicator.hpp
+++ b/include/oneapi/ccl/communicator.hpp
@@ -20,20 +20,32 @@
 #endif
 
 namespace ccl {
-class event;
-class kvs_interface;
-using rank_t = size_t;
+namespace detail {
+class environment;
+}
 
 struct communicator_interface;
+
+template <cl_backend_type type>
+struct comm_impl_dispatch_selector;
+
+class comm_group;
+
+namespace v1 {
+class context;
+class device;
+class kvs_interface;
+struct impl_dispatch;
+
 /**
- * A device communicator that permits device communication operations
+ * A communicator that permits communication operations
  * Has no defined public constructor.
- * Use ccl::environment::create_device_communicator for communicator objects creation.
+ * Use ccl::create_communicator for communicator objects creation.
  */
 class communicator final : public ccl_api_base_movable<communicator,
-                                                              direct_access_policy,
-                                                              communicator_interface,
-                                                              std::shared_ptr> {
+                                                       direct_access_policy,
+                                                       communicator_interface,
+                                                       std::shared_ptr> {
 public:
     using base_t = ccl_api_base_movable<communicator,
                                         direct_access_policy,
@@ -54,14 +66,12 @@ class communicator final : public ccl_api_base_movable<communicator,
      * Type allows to get underlying device type,
      * which was used as communicator construction argument
      */
-    using ccl_device_t = typename unified_device_type::ccl_native_t;
+    using device_type = typename unified_device_type::ccl_native_t;
 
     /**
      * Declare communicator device context native type
      */
-    using ccl_context_t = typename unified_device_context_type::ccl_native_t;
-
-    using coll_request_t = ccl::event;
+    using context_type = typename unified_context_type::ccl_native_t;
 
     communicator(communicator&& src);
     communicator& operator=(communicator&& src);
@@ -71,23 +81,23 @@ class communicator final : public ccl_api_base_movable<communicator,
      * Retrieves the rank in a communicator
      * @return rank corresponding to communicator object
      */
-    size_t rank() const;
+    int rank() const;
 
     /**
      * Retrieves the number of rank in a communicator
      * @return number of the ranks
      */
-    size_t size() const;
+    int size() const;
 
     /**
      * Retrieves underlying device, which was used as communicator construction argument
      */
-    ccl_device_t get_device();
+    device get_device() const;
 
     /**
      * Retrieves underlying context, which was used as communicator construction argument
      */
-    ccl_context_t get_context();
+    context get_context() const;
 
     template <class... attr_value_pair_t>
     stream create_stream(attr_value_pair_t&&... avps) {
@@ -98,43 +108,49 @@ class communicator final : public ccl_api_base_movable<communicator,
     communicator split(const comm_split_attr& attr);
 
 private:
-    friend class environment;
-    friend class comm_group;
-    friend struct impl_dispatch;
+    friend class ccl::detail::environment;
+    friend class ccl::comm_group;
+    friend struct ccl::v1::impl_dispatch;
 
-    template<cl_backend_type type>
-    friend struct comm_impl_dispatch_selector;
+    template <cl_backend_type type>
+    friend struct ccl::comm_impl_dispatch_selector;
 
     communicator(impl_value_t&& impl);
 
     // factory methods
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators(
-        size_t comm_size,
+        int comm_size,
         const vector_class<DeviceType>& local_devices,
-        ContextType& context,
+        const ContextType& context,
         shared_ptr_class<kvs_interface> kvs);
 
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators(
-        size_t comm_size,
-        const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-        ContextType& context,
+        int comm_size,
+        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
+        const ContextType& context,
         shared_ptr_class<kvs_interface> kvs);
 
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators(
-        size_t comm_size,
-        const map_class<rank_t, DeviceType>& local_rank_device_map,
-        ContextType& context,
+        int comm_size,
+        const map_class<int, DeviceType>& local_rank_device_map,
+        const ContextType& context,
         shared_ptr_class<kvs_interface> kvs);
 
-    static communicator create_communicator();
-    static communicator create_communicator(size_t size,
-                                            shared_ptr_class<kvs_interface> kvs);
-    static communicator create_communicator(size_t size,
-                                            size_t rank,
-                                            shared_ptr_class<kvs_interface> kvs);
+    static communicator create_communicator(const comm_attr& attr);
+    static communicator create_communicator(int size,
+                                            shared_ptr_class<kvs_interface> kvs,
+                                            const comm_attr& attr);
+    static communicator create_communicator(int size,
+                                            int rank,
+                                            shared_ptr_class<kvs_interface> kvs,
+                                            const comm_attr& attr);
 };
 
+} // namespace v1
+
+using v1::communicator;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_config.h.in b/include/oneapi/ccl/config.h.in
similarity index 81%
rename from include/oneapi/ccl/ccl_config.h.in
rename to include/oneapi/ccl/config.h.in
index 8ca113864..82444621d 100644
--- a/include/oneapi/ccl/ccl_config.h.in
+++ b/include/oneapi/ccl/config.h.in
@@ -22,10 +22,6 @@
 #cmakedefine CCL_PRODUCT_BUILD_DATE         "@CCL_PRODUCT_BUILD_DATE@"
 #cmakedefine CCL_PRODUCT_FULL               "@CCL_PRODUCT_FULL@"
 
-
-/* Configuration settings for multi GPU extension support*/
-#cmakedefine MULTI_GPU_SUPPORT
-
 /* Auto-generated configuration settings for SYCL support */
 #cmakedefine CCL_ENABLE_SYCL
 
@@ -33,6 +29,5 @@
 @CCL_ENABLE_SYCL_CHECK_CONTRACT@
 #endif
 
-#define CCL_ENABLE_SYCL_V              @CCL_ENABLE_SYCL_V@
-#define CCL_ENABLE_SYCL_TRUE                1
-#define CCL_ENABLE_SYCL_FALSE               0
+/* Auto-generated configuration settings for multi GPU support*/
+#cmakedefine MULTI_GPU_SUPPORT
diff --git a/include/oneapi/ccl/ccl_context.hpp b/include/oneapi/ccl/context.hpp
similarity index 56%
rename from include/oneapi/ccl/ccl_context.hpp
rename to include/oneapi/ccl/context.hpp
index 0edd611bb..45ab52d19 100644
--- a/include/oneapi/ccl/ccl_context.hpp
+++ b/include/oneapi/ccl/context.hpp
@@ -21,10 +21,16 @@
 
 class ccl_context_impl;
 namespace ccl {
+namespace detail {
+class environment;
+}
+
+namespace v1 {
+class communicator;
 
 /**
  * A context object is an abstraction over CPU/GPU context
- * Has no defined public constructor. Use ccl::environment::create_context
+ * Has no defined public constructor. Use ccl::create_context
  * for context objects creation
  */
 /**
@@ -47,30 +53,35 @@ class context : public ccl_api_base_copyable<context, direct_access_policy, ccl_
     /**
      * Declare native context type
      */
-    using native_t = typename details::ccl_api_type_attr_traits<ccl::context_attr_id,
-                                                                ccl::context_attr_id::native_handle>::return_type;
+    using native_t =
+        typename detail::ccl_api_type_attr_traits<context_attr_id,
+                                                  context_attr_id::native_handle>::return_type;
     context(context&& src);
     context(const context& src);
     context& operator=(const context& src);
     context& operator=(context&& src);
     ~context();
 
+    bool operator==(const context& rhs) const noexcept;
+    bool operator!=(const context& rhs) const noexcept;
+    bool operator<(const context& rhs) const noexcept;
+
     /**
      * Get specific attribute value by @attrId
      */
     template <context_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<context_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<context_attr_id, attrId>::return_type& get()
         const;
 
     /**
      * Get native context object
      */
-     native_t& get_native();
-     const native_t& get_native() const;
+    native_t& get_native();
+    const native_t& get_native() const;
+
 private:
-    friend class environment;
-    friend class communicator;
-    friend class device_context_communicator;
+    friend class ccl::detail::environment;
+    friend class ccl::v1::communicator;
     context(impl_value_t&& impl);
 
     /**
@@ -79,27 +90,33 @@ class context : public ccl_api_base_copyable<context, direct_access_policy, ccl_
     template <context_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename ccl::details::ccl_api_type_attr_traits<ccl::context_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<context_attr_id, attrId>::return_type set(const Value& v);
 
     void build_from_params();
-    context(const typename details::ccl_api_type_attr_traits<context_attr_id,
-                                                           context_attr_id::version>::type& version);
+    context(
+        const typename detail::ccl_api_type_attr_traits<context_attr_id,
+                                                        context_attr_id::version>::type& version);
 
     /**
      * Factory methods
      */
-    template <class device_context_type,
-              class = typename std::enable_if<is_context_supported<device_context_type>()>::type>
-    static context create_context(device_context_type&& native_device_context);
+    template <class context_type,
+              class = typename std::enable_if<is_context_supported<context_type>()>::type>
+    static context create_context(context_type&& native_context);
 
-    template <class device_context_handle_type, class... attr_value_pair_t>
-    static context create_context_from_attr(device_context_handle_type& native_device_context_handle,
-                                        attr_value_pair_t&&... avps);
+    template <class context_handle_type, class... attr_value_pair_t>
+    static context create_context_from_attr(context_handle_type& native_context_handle,
+                                            attr_value_pair_t&&... avps);
 };
 
 template <context_attr_id t, class value_type>
-constexpr auto attr_val(value_type v) -> details::attr_value_tripple<context_attr_id, t, value_type> {
-    return details::attr_value_tripple<context_attr_id, t, value_type>(v);
+constexpr auto attr_val(value_type v) -> detail::attr_value_triple<context_attr_id, t, value_type> {
+    return detail::attr_value_triple<context_attr_id, t, value_type>(v);
 }
 
+} // namespace v1
+
+using v1::context;
+using v1::attr_val;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_context_attr_ids.hpp b/include/oneapi/ccl/context_attr_ids.hpp
similarity index 93%
rename from include/oneapi/ccl/ccl_context_attr_ids.hpp
rename to include/oneapi/ccl/context_attr_ids.hpp
index db24326e1..7facff3e7 100644
--- a/include/oneapi/ccl/ccl_context_attr_ids.hpp
+++ b/include/oneapi/ccl/context_attr_ids.hpp
@@ -20,6 +20,9 @@
 #endif
 
 namespace ccl {
+
+namespace v1 {
+
 /**
  * Context attribute ids
  */
@@ -31,4 +34,8 @@ enum class context_attr_id : int {
     last_value
 };
 
+} // namespace v1
+
+using v1::context_attr_id;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_context_attr_ids_traits.hpp b/include/oneapi/ccl/context_attr_ids_traits.hpp
similarity index 90%
rename from include/oneapi/ccl/ccl_context_attr_ids_traits.hpp
rename to include/oneapi/ccl/context_attr_ids_traits.hpp
index ccae970af..b85c2ef8a 100644
--- a/include/oneapi/ccl/ccl_context_attr_ids_traits.hpp
+++ b/include/oneapi/ccl/context_attr_ids_traits.hpp
@@ -20,7 +20,8 @@
 #endif
 
 namespace ccl {
-namespace details {
+
+namespace detail {
 
 /**
  * Traits for context attributes specializations
@@ -39,8 +40,10 @@ struct ccl_api_type_attr_traits<context_attr_id, context_attr_id::cl_backend> {
 
 template <>
 struct ccl_api_type_attr_traits<context_attr_id, context_attr_id::native_handle> {
-    using type = typename unified_device_context_type::ccl_native_t;
+    using type = typename unified_context_type::ccl_native_t;
     using return_type = type;
 };
-}
-}
+
+} // namespace detail
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_datatype_attr.hpp b/include/oneapi/ccl/datatype_attr.hpp
similarity index 78%
rename from include/oneapi/ccl/ccl_datatype_attr.hpp
rename to include/oneapi/ccl/datatype_attr.hpp
index da42d8cc1..4ed1863b2 100644
--- a/include/oneapi/ccl/ccl_datatype_attr.hpp
+++ b/include/oneapi/ccl/datatype_attr.hpp
@@ -13,67 +13,77 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#ifndef CCL_PRODUCT_FULL
-#error "Do not include this file directly. Please include 'ccl.hpp'"
-#endif
-
-namespace ccl {
-
-class ccl_datatype_attr_impl;
-
-class datatype_attr : public ccl_api_base_copyable<datatype_attr,
-                                                   copy_on_write_access_policy,
-                                                   ccl_datatype_attr_impl> {
-public:
-    using base_t =
-        ccl_api_base_copyable<datatype_attr, copy_on_write_access_policy, ccl_datatype_attr_impl>;
-
-    /**
-     * Declare PIMPL type
-     */
-    using impl_value_t = typename base_t::impl_value_t;
-
-    /**
-     * Declare implementation type
-     */
-    using impl_t = typename impl_value_t::element_type;
-
-    datatype_attr& operator=(const datatype_attr& src);
-    datatype_attr& operator=(datatype_attr&& src);
-    datatype_attr(datatype_attr&& src);
-    datatype_attr(const datatype_attr& src);
-    ~datatype_attr() noexcept;
-
-    /**
-     * Set specific value for selft attribute by @attrId.
-     * Previous attibute value would be returned
-     */
-    template <datatype_attr_id attrId,
-              class Value/*,
-              class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::return_type*/>
-    Value set(const Value& v);
-
-    /**
-     * Get specific attribute value by @attrId
-     */
-    template <datatype_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<datatype_attr_id, attrId>::return_type& get()
-        const;
-
-private:
-    friend class environment;
-    datatype_attr(
-        const typename details::ccl_api_type_attr_traits<datatype_attr_id,
-                                                         datatype_attr_id::version>::return_type&
-            version);
-};
-
-template <datatype_attr_id t, class value_type>
-constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<datatype_attr_id, t, value_type> {
-    return details::attr_value_tripple<datatype_attr_id, t, value_type>(v);
-}
-
-} // namespace ccl
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+namespace detail {
+class environment;
+}
+
+class ccl_datatype_attr_impl;
+
+namespace v1 {
+
+class datatype_attr : public ccl_api_base_copyable<datatype_attr,
+                                                   copy_on_write_access_policy,
+                                                   ccl_datatype_attr_impl> {
+public:
+    using base_t =
+        ccl_api_base_copyable<datatype_attr, copy_on_write_access_policy, ccl_datatype_attr_impl>;
+
+    /**
+     * PIMPL holder type, taken from the ccl_api_base_copyable base
+     */
+    using impl_value_t = typename base_t::impl_value_t;
+
+    /**
+     * Implementation type held by impl_value_t (its element_type)
+     */
+    using impl_t = typename impl_value_t::element_type;
+
+    datatype_attr& operator=(const datatype_attr& src);
+    datatype_attr& operator=(datatype_attr&& src);
+    datatype_attr(datatype_attr&& src);
+    datatype_attr(const datatype_attr& src);
+    ~datatype_attr() noexcept;
+
+    /**
+     * Set the value of the attribute selected by @attrId.
+     * The previous attribute value is returned.
+     */
+    template <datatype_attr_id attrId,
+              class Value/*,
+              class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::return_type*/>
+    Value set(const Value& v);
+
+    /**
+     * Get the value of the attribute selected by @attrId (const reference to its return_type)
+     */
+    template <datatype_attr_id attrId>
+    const typename detail::ccl_api_type_attr_traits<datatype_attr_id, attrId>::return_type& get()
+        const;
+
+private: // the version constructor below is reachable only through the friend declared here
+    friend class ccl::detail::environment;
+    datatype_attr(
+        const typename detail::ccl_api_type_attr_traits<datatype_attr_id,
+                                                        datatype_attr_id::version>::return_type&
+            version);
+};
+
+template <datatype_attr_id t, class value_type> // wraps v in an attr_value_triple tagged with t
+constexpr auto attr_val(value_type v)
+    -> detail::attr_value_triple<datatype_attr_id, t, value_type> {
+    return detail::attr_value_triple<datatype_attr_id, t, value_type>(v);
+}
+
+} // namespace v1
+
+using v1::datatype_attr;
+using v1::attr_val;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_datatype_attr_ids.hpp b/include/oneapi/ccl/datatype_attr_ids.hpp
similarity index 90%
rename from include/oneapi/ccl/ccl_datatype_attr_ids.hpp
rename to include/oneapi/ccl/datatype_attr_ids.hpp
index 48a33aa85..3f07a8a81 100644
--- a/include/oneapi/ccl/ccl_datatype_attr_ids.hpp
+++ b/include/oneapi/ccl/datatype_attr_ids.hpp
@@ -13,20 +13,26 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#ifndef CCL_PRODUCT_FULL
-#error "Do not include this file directly. Please include 'ccl.hpp'"
-#endif
-
-namespace ccl {
-
-enum class datatype_attr_id : int {
-    version,
-
-    size,
-
-    last_value
-};
-
-}
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace v1 {
+
+enum class datatype_attr_id : int {
+    version, // value type: ccl::library_version (per ccl_api_type_attr_traits)
+
+    size, // value type: size_t (per ccl_api_type_attr_traits)
+
+    last_value // NOTE(review): presumably an end-of-enum marker, not a real attribute; confirm
+};
+
+} // namespace v1
+
+using v1::datatype_attr_id;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_datatype_attr_ids_traits.hpp b/include/oneapi/ccl/datatype_attr_ids_traits.hpp
similarity index 93%
rename from include/oneapi/ccl/ccl_datatype_attr_ids_traits.hpp
rename to include/oneapi/ccl/datatype_attr_ids_traits.hpp
index 9f2c2ed18..ba67cebb3 100644
--- a/include/oneapi/ccl/ccl_datatype_attr_ids_traits.hpp
+++ b/include/oneapi/ccl/datatype_attr_ids_traits.hpp
@@ -13,27 +13,28 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#ifndef CCL_PRODUCT_FULL
-#error "Do not include this file directly. Please include 'ccl.hpp'"
-#endif
-
-namespace ccl {
-namespace details {
-
-template <>
-struct ccl_api_type_attr_traits<datatype_attr_id, datatype_attr_id::version> {
-    using type = ccl::library_version;
-    using return_type = type;
-};
-
-template <>
-struct ccl_api_type_attr_traits<datatype_attr_id, datatype_attr_id::size> {
-    using type = size_t;
-    using return_type = type;
-};
-
-} // namespace details
-
-} // namespace ccl
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace detail {
+
+template <> // datatype_attr_id::version: type and return_type are ccl::library_version
+struct ccl_api_type_attr_traits<datatype_attr_id, datatype_attr_id::version> {
+    using type = ccl::library_version;
+    using return_type = type;
+};
+
+template <> // datatype_attr_id::size: type and return_type are size_t
+struct ccl_api_type_attr_traits<datatype_attr_id, datatype_attr_id::size> {
+    using type = size_t;
+    using return_type = type;
+};
+
+} // namespace detail
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_device.hpp b/include/oneapi/ccl/device.hpp
similarity index 60%
rename from include/oneapi/ccl/ccl_device.hpp
rename to include/oneapi/ccl/device.hpp
index c38fa5470..2f18617ec 100644
--- a/include/oneapi/ccl/ccl_device.hpp
+++ b/include/oneapi/ccl/device.hpp
@@ -21,10 +21,16 @@
 
 class ccl_device_impl;
 namespace ccl {
+namespace detail {
+class environment;
+}
+
+namespace v1 {
+class communicator;
 
 /**
  * A device object is an abstraction over CPU/GPU device
- * Has no defined public constructor. Use ccl::environment::create_device
+ * Has no defined public constructor. Use ccl::create_device
  * for device objects creation
  */
 /**
@@ -47,8 +53,9 @@ class device : public ccl_api_base_copyable<device, direct_access_policy, ccl_de
     /**
      * Declare native device type
      */
-    using native_t = typename details::ccl_api_type_attr_traits<ccl::device_attr_id,
-                                                                ccl::device_attr_id::native_handle>::return_type;
+    using native_t =
+        typename detail::ccl_api_type_attr_traits<device_attr_id,
+                                                  device_attr_id::native_handle>::return_type;
 
     device(device&& src);
     device(const device& src);
@@ -56,21 +63,26 @@ class device : public ccl_api_base_copyable<device, direct_access_policy, ccl_de
     device& operator=(const device& src);
     ~device();
 
+    bool operator==(const device& rhs) const noexcept;
+    bool operator!=(const device& rhs) const noexcept;
+    bool operator<(const device& rhs) const noexcept;
+
     /**
      * Get specific attribute value by @attrId
      */
     template <device_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<device_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<device_attr_id, attrId>::return_type& get()
         const;
 
     /**
      * Get native device object
      */
-     native_t& get_native();
-     const native_t& get_native() const;
+    native_t& get_native();
+    const native_t& get_native() const;
+
 private:
-    friend class environment;
-    friend class communicator;
+    friend class ccl::detail::environment;
+    friend class ccl::v1::communicator;
     device(impl_value_t&& impl);
 
     /**
@@ -79,10 +91,10 @@ class device : public ccl_api_base_copyable<device, direct_access_policy, ccl_de
     template <device_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename ccl::details::ccl_api_type_attr_traits<ccl::device_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<device_attr_id, attrId>::return_type set(const Value& v);
 
     void build_from_params();
-    device(const typename details::ccl_api_type_attr_traits<device_attr_id,
+    device(const typename detail::ccl_api_type_attr_traits<device_attr_id,
                                                            device_attr_id::version>::type& version);
 
     /**
@@ -94,21 +106,28 @@ class device : public ccl_api_base_copyable<device, direct_access_policy, ccl_de
 
     template <class device_handle_type, class... attr_value_pair_t>
     static device create_device_from_attr(device_handle_type& native_device_handle,
-                                        attr_value_pair_t&&... avps);
+                                          attr_value_pair_t&&... avps);
 };
 
 template <device_attr_id t, class value_type>
-constexpr auto attr_val(value_type v) -> details::attr_value_tripple<device_attr_id, t, value_type> {
-    return details::attr_value_tripple<device_attr_id, t, value_type>(v);
+constexpr auto attr_val(value_type v) -> detail::attr_value_triple<device_attr_id, t, value_type> {
+    return detail::attr_value_triple<device_attr_id, t, value_type>(v);
 }
 
-template<class DeviceType>
-using rank_device_pair_t = ccl::pair_class<size_t, typename std::remove_reference<typename std::remove_cv<DeviceType>::type>::type>;
+template <class DeviceType> // (rank, device) pair; DeviceType is stripped of reference, then cv
+using rank_device_pair_t = ccl::pair_class<
+    size_t, // strip the reference first: std::remove_cv is a no-op on reference types
+    typename std::remove_cv<typename std::remove_reference<DeviceType>::type>::type>;
 
 template <class device_value_type>
-constexpr auto attr_val(size_t rank, device_value_type&& v)
-    -> rank_device_pair_t<device_value_type>{
-    return rank_device_pair_t<device_value_type>{rank, std::forward<device_value_type>(v)};
+constexpr auto attr_val(int rank, device_value_type&& v) -> rank_device_pair_t<device_value_type> {
+    return rank_device_pair_t<device_value_type>{ rank, std::forward<device_value_type>(v) };
 }
 
+} // namespace v1
+
+using v1::device;
+using v1::attr_val;
+using v1::rank_device_pair_t;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_device_attr_ids.hpp b/include/oneapi/ccl/device_attr_ids.hpp
similarity index 93%
rename from include/oneapi/ccl/ccl_device_attr_ids.hpp
rename to include/oneapi/ccl/device_attr_ids.hpp
index cb5644d9e..a3a4866dd 100644
--- a/include/oneapi/ccl/ccl_device_attr_ids.hpp
+++ b/include/oneapi/ccl/device_attr_ids.hpp
@@ -20,6 +20,9 @@
 #endif
 
 namespace ccl {
+
+namespace v1 {
+
 /**
  * Device attribute ids
  */
@@ -31,4 +34,8 @@ enum class device_attr_id : int {
     last_value
 };
 
+} // namespace v1
+
+using v1::device_attr_id;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_device_attr_ids_traits.hpp b/include/oneapi/ccl/device_attr_ids_traits.hpp
similarity index 95%
rename from include/oneapi/ccl/ccl_device_attr_ids_traits.hpp
rename to include/oneapi/ccl/device_attr_ids_traits.hpp
index c44af0c41..3e0ba6ed1 100644
--- a/include/oneapi/ccl/ccl_device_attr_ids_traits.hpp
+++ b/include/oneapi/ccl/device_attr_ids_traits.hpp
@@ -20,7 +20,8 @@
 #endif
 
 namespace ccl {
-namespace details {
+
+namespace detail {
 
 /**
  * Traits for device attributes specializations
@@ -42,5 +43,7 @@ struct ccl_api_type_attr_traits<device_attr_id, device_attr_id::native_handle> {
     using type = typename unified_device_type::ccl_native_t;
     using return_type = type;
 };
-}
-}
+
+} // namespace detail
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_device_type_traits.hpp b/include/oneapi/ccl/device_type_traits.hpp
similarity index 80%
rename from include/oneapi/ccl/ccl_device_type_traits.hpp
rename to include/oneapi/ccl/device_type_traits.hpp
index 08bbad9f7..90ce9d296 100644
--- a/include/oneapi/ccl/ccl_device_type_traits.hpp
+++ b/include/oneapi/ccl/device_type_traits.hpp
@@ -19,11 +19,13 @@
 #error "Do not include this file directly. Please include 'ccl_type_traits.hpp'"
 #endif
 
-#include  "oneapi/ccl/native_device_api/export_api.hpp"
+#include "oneapi/ccl/native_device_api/export_api.hpp"
 
 namespace ccl {
 
-#define SUPPORTED_KERNEL_NATIVE_DATA_TYPES char, int, float, ccl::bf16, double, int64_t, uint64_t
+#define SUPPORTED_KERNEL_NATIVE_DATA_TYPES \
+    int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, float, double, \
+        ccl::bfloat16
 
 template <class native_stream>
 constexpr bool is_stream_supported() {
@@ -40,13 +42,13 @@ constexpr bool is_event_supported() {
 template <class native_device>
 constexpr bool is_device_supported() {
     return api_type_info<typename std::remove_pointer<typename std::remove_cv<
-                         typename std::remove_reference<native_device>::type>::type>::type>::is_supported();
+        typename std::remove_reference<native_device>::type>::type>::type>::is_supported();
 }
 
 template <class native_context>
 constexpr bool is_context_supported() {
     return api_type_info<typename std::remove_pointer<typename std::remove_cv<
-                         typename std::remove_reference<native_context>::type>::type>::type>::is_supported();
+        typename std::remove_reference<native_context>::type>::type>::type>::is_supported();
 }
 
 /**
@@ -54,7 +56,7 @@ constexpr bool is_context_supported() {
  */
 API_CLASS_TYPE_INFO(empty_t);
 API_CLASS_TYPE_INFO(typename unified_device_type::ccl_native_t)
-API_CLASS_TYPE_INFO(typename unified_device_context_type::ccl_native_t);
+API_CLASS_TYPE_INFO(typename unified_context_type::ccl_native_t);
 API_CLASS_TYPE_INFO(typename unified_stream_type::ccl_native_t);
 API_CLASS_TYPE_INFO(typename unified_event_type::ccl_native_t);
 
diff --git a/include/oneapi/ccl/ccl_device_types.hpp b/include/oneapi/ccl/device_types.hpp
similarity index 78%
rename from include/oneapi/ccl/ccl_device_types.hpp
rename to include/oneapi/ccl/device_types.hpp
index 4a6bcb41e..09ad9b0c9 100644
--- a/include/oneapi/ccl/ccl_device_types.hpp
+++ b/include/oneapi/ccl/device_types.hpp
@@ -24,22 +24,6 @@ namespace ccl {
  * Push the following code into something similar with 'ccl_device_types.hpp'
  */
 
-/** Device topology group. */
-typedef enum {
-    device_group = 0,
-    thread_group = 1,
-    process_group = 2,
-
-    ccl_topology_group_last_value
-} ccl_topology_group_t;
-
-enum device_topology_type { undetermined = -1, ring, a2a, last_class_value };
-
-// TODO: tmp mapping
-#define ring_algo_class               device_topology_type::ring
-#define a2a_algo_class                device_topology_type::a2a
-#define ccl_topology_class_last_value device_topology_type::last_class_value
-
 using process_id = size_t;
 using host_id = std::string;
 
@@ -52,18 +36,18 @@ using cluster_aggregated_device_mask_t = std::map<host_id, process_aggregated_de
 
 using index_type = uint32_t;
 static constexpr index_type unused_index_value = std::numeric_limits<index_type>::max(); //TODO
+
 //TODO implement class instead
 using device_index_type = std::tuple<index_type, index_type, index_type>;
 enum device_index_enum { driver_index_id, device_index_id, subdevice_index_id };
 std::string to_string(const device_index_type& device_id);
 device_index_type from_string(const std::string& device_id_str);
 
-using device_indices_t = std::multiset<device_index_type>;
-using process_device_indices_t = std::map<process_id, device_indices_t>;
-using cluster_device_indices_t = std::map<host_id, process_device_indices_t>;
+using device_indices_type = std::multiset<device_index_type>;
+using process_device_indices_type = std::map<process_id, device_indices_type>;
+using cluster_device_indices_type = std::map<host_id, process_device_indices_type>;
 
-struct empty_t{
-};
+struct empty_t {};
 template <cl_backend_type config_backend>
 struct backend_info {};
 
@@ -71,7 +55,7 @@ template <cl_backend_type config_backend>
 struct generic_device_type {};
 
 template <cl_backend_type config_backend>
-struct generic_device_context_type {};
+struct generic_context_type {};
 
 template <cl_backend_type config_backend>
 struct generic_platform_type {};
diff --git a/include/oneapi/ccl/environment.hpp b/include/oneapi/ccl/environment.hpp
new file mode 100644
index 000000000..bf5c4193e
--- /dev/null
+++ b/include/oneapi/ccl/environment.hpp
@@ -0,0 +1,301 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+#include <memory>
+#include <ostream>
+#include <utility>
+#include <vector>
+
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
+
+#include "oneapi/ccl/comm_attr_ids.hpp"
+#include "oneapi/ccl/comm_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_attr.hpp"
+
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
+
+#include "oneapi/ccl/context_attr_ids.hpp"
+#include "oneapi/ccl/context_attr_ids_traits.hpp"
+#include "oneapi/ccl/context.hpp"
+
+#include "oneapi/ccl/datatype_attr_ids.hpp"
+#include "oneapi/ccl/datatype_attr_ids_traits.hpp"
+#include "oneapi/ccl/datatype_attr.hpp"
+
+#include "oneapi/ccl/device_attr_ids.hpp"
+#include "oneapi/ccl/device_attr_ids_traits.hpp"
+#include "oneapi/ccl/device.hpp"
+
+#include "oneapi/ccl/init_attr_ids.hpp"
+#include "oneapi/ccl/init_attr_ids_traits.hpp"
+#include "oneapi/ccl/init_attr.hpp"
+
+#include "oneapi/ccl/kvs_attr_ids.hpp"
+#include "oneapi/ccl/kvs_attr_ids_traits.hpp"
+#include "oneapi/ccl/kvs_attr.hpp"
+
+#include "oneapi/ccl/kvs.hpp"
+
+#include "oneapi/ccl/event.hpp"
+
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/stream.hpp"
+
+#include "oneapi/ccl/communicator.hpp"
+
+#include "oneapi/ccl/exception.hpp"
+
+namespace ccl {
+
+namespace detail {
+
+/**
+ * CCL environment singleton
+ */
+class environment {
+public:
+    ~environment();
+
+    /**
+     * Retrieves the unique environment object
+     * and makes the first-time initialization of CCL library
+     */
+    static environment& instance();
+
+    static ccl::library_version get_library_version();
+
+    template <class... attr_value_pair_t>
+    static init_attr create_init_attr(attr_value_pair_t&&... avps) {
+        auto attr = create_postponed_api_type<init_attr>();
+        // Apply every (id, value) pair; leading 0 keeps the array well-formed for an empty pack.
+        int expander[]{ 0, (attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander;
+        return attr;
+    }
+
+    template <class coll_attribute_type, class... attr_value_pair_t>
+    static coll_attribute_type create_operation_attr(attr_value_pair_t&&... avps) {
+        auto op_attr = create_postponed_api_type<coll_attribute_type>();
+        int expander[]{ 0, (op_attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander; // leading 0 above keeps the array well-formed for an empty pack
+        return op_attr;
+    }
+
+    /******************** DATATYPE ********************/
+
+    template <class... attr_value_pair_t>
+    static datatype_attr create_datatype_attr(attr_value_pair_t&&... avps) {
+        static_assert(sizeof...(avps) > 0, "At least one argument must be specified");
+        auto attr = create_postponed_api_type<datatype_attr>();
+        int expander[]{ (attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander;
+        return attr;
+    }
+
+    ccl::datatype register_datatype(const datatype_attr& attr);
+    void deregister_datatype(ccl::datatype dtype);
+    size_t get_datatype_size(ccl::datatype dtype) const;
+
+    /******************** KVS ********************/
+
+    template <class... attr_value_pair_t>
+    static kvs_attr create_kvs_attr(attr_value_pair_t&&... avps) {
+        auto attr = create_postponed_api_type<kvs_attr>();
+        // Apply every (id, value) pair; leading 0 keeps the array well-formed for an empty pack.
+        int expander[]{ 0, (attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander;
+        return attr;
+    }
+
+    shared_ptr_class<kvs> create_main_kvs(const kvs_attr& attr) const;
+    shared_ptr_class<kvs> create_kvs(const kvs::address_type& addr, const kvs_attr& attr) const;
+
+    /******************** DEVICE ********************/
+
+    device create_device(empty_t empty) const;
+
+    template <class native_device_type,
+              class = typename std::enable_if<is_device_supported<native_device_type>()>::type>
+    device create_device(native_device_type&& native_device) const;
+
+    template <class... attr_value_pair_t>
+    device create_device_from_attr(typename unified_device_type::ccl_native_t dev,
+                                   attr_value_pair_t&&... avps) const {
+        device result = create_postponed_api_type<device>(dev);
+        int expander[]{ 0, (result.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander; // leading 0 above keeps the array well-formed for an empty pack
+        result.build_from_params(); // called once every attribute has been set
+        return result;
+    }
+
+    /******************** CONTEXT ********************/
+
+    context create_context(empty_t empty) const;
+
+    // Enabled only for supported native context types (context trait, not the device one).
+    template <class native_context_type,
+              class = typename std::enable_if<is_context_supported<native_context_type>()>::type>
+    context create_context(native_context_type&& native_context) const;
+
+    template <class... attr_value_pair_t>
+    context create_context_from_attr(typename unified_context_type::ccl_native_t ctx,
+                                     attr_value_pair_t&&... avps) const {
+        context result = create_postponed_api_type<context>(ctx);
+        int expander[]{ 0, (result.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander; // leading 0 above keeps the array well-formed for an empty pack
+        result.build_from_params(); // called once every attribute has been set
+        return result;
+    }
+
+    /******************** EVENT ********************/
+
+    template <class event_type,
+              class = typename std::enable_if<is_event_supported<event_type>()>::type>
+    event create_event(event_type& native_event) {
+        return event::create_from_native(native_event);
+    }
+
+    template <class event_handle_type,
+              class = typename std::enable_if<is_event_supported<event_handle_type>()>::type>
+    event create_event(event_handle_type& native_event_handle, event::context_t& context) {
+        return event::create_from_native(native_event_handle, context);
+    }
+
+    /******************** STREAM ********************/
+
+    template <class native_stream_type,
+              class = typename std::enable_if<is_stream_supported<native_stream_type>()>::type>
+    stream create_stream(native_stream_type& native_stream);
+
+    template <class native_stream_type,
+              class native_context_type,
+              class = typename std::enable_if<is_stream_supported<native_stream_type>()>::type>
+    stream create_stream(native_stream_type& native_stream, native_context_type& native_ctx);
+
+    template <class... attr_value_pair_t>
+    stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
+                                   attr_value_pair_t&&... avps) {
+        stream str = create_stream(device);
+        int expander[]{ 0, (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander; // leading 0 above keeps the array well-formed for an empty pack
+        str.build_from_params(); // called once every attribute has been set
+        return str;
+    }
+
+    template <class... attr_value_pair_t>
+    stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
+                                   typename unified_context_type::ccl_native_t context,
+                                   attr_value_pair_t&&... avps) {
+        stream str = create_stream(device, context);
+        int expander[]{ 0, (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander; // leading 0 above keeps the array well-formed for an empty pack
+        str.build_from_params(); // called once every attribute has been set
+        return str;
+    }
+
+    /******************** COMMUNICATOR ********************/
+
+#ifdef CCL_ENABLE_SYCL
+    communicator create_single_device_communicator(int comm_size,
+                                                   int rank,
+                                                   const cl::sycl::device& device,
+                                                   const cl::sycl::context& context,
+                                                   shared_ptr_class<kvs_interface> kvs) const;
+#endif
+
+    template <class... attr_value_pair_t>
+    static comm_split_attr create_comm_split_attr(attr_value_pair_t&&... avps) {
+        auto split_attr = create_postponed_api_type<comm_split_attr>();
+        int expander[]{ 0, (split_attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander; // leading 0 above keeps the array well-formed for an empty pack
+        return split_attr;
+    }
+
+    template <class... attr_value_pair_t>
+    static comm_attr create_comm_attr(attr_value_pair_t&&... avps) {
+        auto attr = create_postponed_api_type<comm_attr>();
+        // Apply every (id, value) pair; leading 0 keeps the array well-formed for an empty pack.
+        int expander[]{ 0, (attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
+        (void)expander;
+        return attr;
+    }
+
+    communicator create_communicator(const comm_attr& attr) const;
+    communicator create_communicator(size_t size,
+                                     shared_ptr_class<kvs_interface> kvs,
+                                     const comm_attr& attr) const;
+    communicator create_communicator(size_t size,
+                                     int rank,
+                                     shared_ptr_class<kvs_interface> kvs,
+                                     const comm_attr& attr) const;
+
+    template <class DeviceType, class ContextType>
+    vector_class<communicator> create_communicators(int comm_size,
+                                                    const vector_class<DeviceType>& local_devices,
+                                                    const ContextType& context,
+                                                    shared_ptr_class<kvs_interface> kvs,
+                                                    const comm_attr& attr) const;
+
+    template <class DeviceType, class ContextType>
+    vector_class<communicator> create_communicators(
+        int comm_size,
+        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs,
+        const comm_attr& attr) const;
+
+    template <class DeviceType, class ContextType>
+    vector_class<communicator> create_communicators(
+        int comm_size,
+        const map_class<int, DeviceType>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs,
+        const comm_attr& attr) const;
+
+    vector_class<communicator> split_communicators(
+        const vector_class<pair_class<communicator, comm_split_attr>>& attrs) const;
+
+private:
+    environment();
+
+    template <class ccl_api_type, class... args_type>
+    static ccl_api_type create_postponed_api_type(args_type... args) { // args are taken by value
+        auto version = get_library_version(); // passed as the last constructor argument
+        return ccl_api_type(std::forward<args_type>(args)..., version);
+    }
+
+    stream create_stream(typename unified_device_type::ccl_native_t device);
+
+    stream create_stream(typename unified_device_type::ccl_native_t device,
+                         typename unified_context_type::ccl_native_t context);
+};
+
+} // namespace detail
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_event.hpp b/include/oneapi/ccl/event.hpp
similarity index 88%
rename from include/oneapi/ccl/ccl_event.hpp
rename to include/oneapi/ccl/event.hpp
index ea7339572..61e49311a 100644
--- a/include/oneapi/ccl/ccl_event.hpp
+++ b/include/oneapi/ccl/event.hpp
@@ -20,9 +20,14 @@
 #endif
 
 namespace ccl {
+namespace detail {
+class environment;
+}
 
 class event_impl;
 
+namespace v1 {
+
 /**
  * event's interface that allows users to track communication operation progress
  */
@@ -41,6 +46,8 @@ class event : public ccl_api_base_movable<event, direct_access_policy, event_imp
     using impl_t = typename impl_value_t::element_type;
 
     using native_t = typename unified_event_type::ccl_native_t;
+    using native_handle_t = typename unified_event_type::handle_t;
+    using context_t = typename unified_context_type::ccl_native_t;
 
     event() noexcept;
     event(event&& src) noexcept;
@@ -87,9 +94,14 @@ class event : public ccl_api_base_movable<event, direct_access_policy, event_imp
     const native_t& get_native() const;
 
 private:
-    friend class environment;
+    friend class ccl::detail::environment;
 
     static event create_from_native(native_t& native_event);
+    static event create_from_native(native_handle_t native_event_handle, context_t context);
 };
 
+} // namespace v1
+
+using v1::event;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_exception.hpp b/include/oneapi/ccl/exception.hpp
similarity index 53%
rename from include/oneapi/ccl/ccl_exception.hpp
rename to include/oneapi/ccl/exception.hpp
index 6a5ed8f7f..a5d03b400 100644
--- a/include/oneapi/ccl/ccl_exception.hpp
+++ b/include/oneapi/ccl/exception.hpp
@@ -21,6 +21,8 @@
 
 namespace ccl {
 
+namespace v1 {
+
 class exception : public std::exception {
     std::string msg;
 
@@ -28,19 +30,17 @@ class exception : public std::exception {
     exception(const std::string &domain, const std::string &function, const std::string &info = "")
             : std::exception() {
         msg = std::string("oneCCL: ") + domain +
-               ((domain.length() != 0 && function.length() != 0) ? "/" : "") + function +
-               ((info.length() != 0)
-                    ? (((domain.length() + function.length() != 0) ? ": " : "") + info)
-                    : "");
+              ((domain.length() != 0 && function.length() != 0) ? "/" : "") + function +
+              ((info.length() != 0)
+                   ? (((domain.length() + function.length() != 0) ? ": " : "") + info)
+                   : "");
     }
 
-    exception(const std::string &info = "")
-            : std::exception() {
+    exception(const std::string &info = "") : std::exception() {
         msg = std::string("oneCCL: ") + info;
     }
 
-    exception(const char* info)
-            : std::exception() {
+    exception(const char *info) : std::exception() {
         msg = std::string("oneCCL: ") + std::string(info);
     }
 
@@ -49,40 +49,52 @@ class exception : public std::exception {
     }
 };
 
-class invalid_argument : public ccl::exception {
+class invalid_argument : public exception {
 public:
-    invalid_argument(const std::string &domain, const std::string &function,
+    invalid_argument(const std::string &domain,
+                     const std::string &function,
                      const std::string &info = "")
-            : ccl::exception(domain, function, "invalid argument " + info) {}
+            : exception(domain, function, "invalid argument " + info) {}
 };
 
-class host_bad_alloc : public ccl::exception {
+class host_bad_alloc : public exception {
 public:
     host_bad_alloc(const std::string &domain, const std::string &function)
-            : ccl::exception(domain, function, "cannot allocate memory on host") {}
+            : exception(domain, function, "cannot allocate memory on host") {}
 };
 
-// class device_bad_alloc : public ccl::exception {
+// class device_bad_alloc : public exception {
 // public:
 //     device_bad_alloc(const std::string &domain, const std::string &function,
 //                      const cl::sycl::device &device)
-//             : ccl::exception(
+//             : exception(
 //                   domain, function,
 //                   "cannot allocate memory on " + device.get_info<cl::sycl::info::device::name>()) {}
 // };
 
-class unimplemented : public ccl::exception {
+class unimplemented : public exception {
 public:
-    unimplemented(const std::string &domain, const std::string &function,
+    unimplemented(const std::string &domain,
+                  const std::string &function,
                   const std::string &info = "")
-            : ccl::exception(domain, function, "function is not implemented " + info) {}
+            : exception(domain, function, "function is not implemented " + info) {}
 };
 
-class unsupported : public ccl::exception {
+class unsupported : public exception {
 public:
-    unsupported(const std::string &domain, const std::string &function,
-                  const std::string &info = "")
-            : ccl::exception(domain, function, "function is not supported " + info) {}
+    unsupported(const std::string &domain,
+                const std::string &function,
+                const std::string &info = "")
+            : exception(domain, function, "function is not supported " + info) {}
 };
 
-}
+} // namespace v1
+
+using v1::exception;
+using v1::invalid_argument;
+using v1::host_bad_alloc;
+// using v1::device_bad_alloc;
+using v1::unimplemented;
+using v1::unsupported;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/init_attr.hpp b/include/oneapi/ccl/init_attr.hpp
new file mode 100644
index 000000000..39e801fb4
--- /dev/null
+++ b/include/oneapi/ccl/init_attr.hpp
@@ -0,0 +1,96 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+namespace detail {
+class environment;
+}
+
+class init_attr_impl;
+
+namespace v1 {
+
+struct ccl_empty_attr;
+
+class init_attr
+        : public ccl_api_base_copyable<init_attr, copy_on_write_access_policy, init_attr_impl> {
+public:
+    using base_t = ccl_api_base_copyable<init_attr, copy_on_write_access_policy, init_attr_impl>;
+
+    /**
+     * Declare PIMPL type
+     */
+    using impl_value_t = typename base_t::impl_value_t;
+
+    /**
+     * Declare implementation type
+     */
+    using impl_t = typename impl_value_t::element_type;
+
+    init_attr& operator=(const init_attr& src);
+    init_attr& operator=(init_attr&& src);
+    init_attr(init_attr&& src);
+    init_attr(const init_attr& src);
+    ~init_attr() noexcept;
+
+    /**
+     * Set a specific value for the attribute selected by @attrId.
+     * The previous attribute value is returned.
+     */
+    template <init_attr_id attrId,
+              class Value/*,
+              class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::return_type*/>
+    Value set(const Value& v);
+
+    /**
+     * Get specific attribute value by @attrId
+     */
+    template <init_attr_id attrId>
+    const typename detail::ccl_api_type_attr_traits<init_attr_id, attrId>::return_type& get() const;
+
+private:
+    friend class ccl::detail::environment;
+    friend struct ccl::ccl_empty_attr;
+    init_attr(const typename detail::ccl_api_type_attr_traits<init_attr_id,
+                                                              init_attr_id::version>::return_type&
+                  version);
+};
+
+/**
+ * Declare extern empty attributes
+ */
+extern init_attr default_init_attr;
+
+/**
+ * Factory helper: wraps @v in an attr_value_triple tagged with the init_attr_id @t
+ */
+template <init_attr_id t, class value_type>
+constexpr auto attr_val(value_type v) -> detail::attr_value_triple<init_attr_id, t, value_type> {
+    return detail::attr_value_triple<init_attr_id, t, value_type>(v);
+}
+
+} // namespace v1
+
+using v1::init_attr;
+using v1::default_init_attr;
+using v1::attr_val;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/init_attr_ids.hpp b/include/oneapi/ccl/init_attr_ids.hpp
new file mode 100644
index 000000000..6196fde4a
--- /dev/null
+++ b/include/oneapi/ccl/init_attr_ids.hpp
@@ -0,0 +1,36 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace v1 {
+
+enum class init_attr_id : int {
+    version, // value type is ccl::library_version (see init_attr_ids_traits.hpp)
+
+    last_value // presumably an end-of-range sentinel rather than a real attribute -- confirm
+};
+
+} // namespace v1
+
+using v1::init_attr_id;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/init_attr_ids_traits.hpp b/include/oneapi/ccl/init_attr_ids_traits.hpp
new file mode 100644
index 000000000..5b36bb2e2
--- /dev/null
+++ b/include/oneapi/ccl/init_attr_ids_traits.hpp
@@ -0,0 +1,34 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace detail {
+
+template <>
+struct ccl_api_type_attr_traits<init_attr_id, init_attr_id::version> {
+    using type = ccl::library_version; // value type associated with init_attr_id::version
+    using return_type = type; // init_attr::get<version>() returns a const reference to this
+};
+
+} // namespace detail
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_kvs.hpp b/include/oneapi/ccl/kvs.hpp
similarity index 76%
rename from include/oneapi/ccl/ccl_kvs.hpp
rename to include/oneapi/ccl/kvs.hpp
index fb97c44df..45de27595 100644
--- a/include/oneapi/ccl/ccl_kvs.hpp
+++ b/include/oneapi/ccl/kvs.hpp
@@ -20,18 +20,23 @@
 #endif
 
 namespace ccl {
+namespace detail {
+class environment;
+}
+
+class kvs_impl;
+
+namespace v1 {
 
 class CCL_API kvs_interface {
 public:
-
     virtual ~kvs_interface() = default;
 
-    virtual vector_class<char> get(const string_class& key) const = 0;
+    virtual vector_class<char> get(const string_class& key) = 0;
 
-    virtual void set(const string_class& key, const vector_class<char>& data) const = 0;
+    virtual void set(const string_class& key, const vector_class<char>& data) = 0;
 };
 
-class kvs_impl;
 class CCL_API kvs final : public kvs_interface {
 public:
     static constexpr size_t address_max_size = 256;
@@ -41,18 +46,24 @@ class CCL_API kvs final : public kvs_interface {
 
     address_type get_address() const;
 
-    vector_class<char> get(const string_class& key) const override;
+    vector_class<char> get(const string_class& key) override;
 
-    void set(const string_class& key, const vector_class<char>& data) const override;
+    void set(const string_class& key, const vector_class<char>& data) override;
 
 private:
-    friend class environment;
+    friend class ccl::detail::environment;
 
-    kvs();
-    kvs(const address_type& addr);
+    kvs(const kvs_attr& attr);
+    kvs(const address_type& addr, const kvs_attr& attr);
     const kvs_impl& get_impl();
 
     address_type addr;
     unique_ptr_class<kvs_impl> pimpl;
 };
+
+} // namespace v1
+
+using v1::kvs_interface;
+using v1::kvs;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/kvs_attr.hpp b/include/oneapi/ccl/kvs_attr.hpp
new file mode 100644
index 000000000..253b13dd0
--- /dev/null
+++ b/include/oneapi/ccl/kvs_attr.hpp
@@ -0,0 +1,98 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+namespace detail {
+class environment;
+}
+
+class ccl_kvs_attr_impl;
+
+namespace v1 {
+
+struct ccl_empty_attr;
+
+/**
+ * Attribute set for ccl::kvs; individual attributes are addressed by kvs_attr_id values
+ */
+class kvs_attr
+        : public ccl_api_base_copyable<kvs_attr, copy_on_write_access_policy, ccl_kvs_attr_impl> {
+public:
+    using base_t = ccl_api_base_copyable<kvs_attr, copy_on_write_access_policy, ccl_kvs_attr_impl>;
+
+    /**
+     * Declare PIMPL type
+     */
+    using impl_value_t = typename base_t::impl_value_t;
+
+    /**
+     * Declare implementation type
+     */
+    using impl_t = typename impl_value_t::element_type;
+
+    kvs_attr& operator=(const kvs_attr& src);
+    kvs_attr& operator=(kvs_attr&& src);
+    kvs_attr(kvs_attr&& src);
+    kvs_attr(const kvs_attr& src);
+    kvs_attr(ccl_empty_attr);
+    ~kvs_attr() noexcept;
+
+    /**
+     * Set the value of the attribute selected by @attrId.
+     * The previous attribute value is returned.
+     */
+    template <kvs_attr_id attrId,
+        class Value/*,
+              class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
+    Value set(const Value& v);
+
+    /**
+     * Get the value of the attribute selected by @attrId
+     */
+    template <kvs_attr_id attrId>
+    const typename detail::ccl_api_type_attr_traits<kvs_attr_id, attrId>::type& get() const;
+
+    template <kvs_attr_id attrId>
+    bool is_valid() const noexcept;
+
+private:
+    friend class ccl::detail::environment;
+    friend struct ccl::v1::ccl_empty_attr;
+
+    kvs_attr(const typename detail::ccl_api_type_attr_traits<kvs_attr_id,
+                                                             kvs_attr_id::version>::return_type&
+                 version);
+};
+
+extern kvs_attr default_kvs_attr;
+
+template <kvs_attr_id t, class value_type>
+constexpr auto attr_val(value_type v) -> detail::attr_value_triple<kvs_attr_id, t, value_type> {
+    return detail::attr_value_triple<kvs_attr_id, t, value_type>(v); // wraps v, typed by id t
+}
+
+} // namespace v1
+
+using v1::kvs_attr;
+using v1::default_kvs_attr;
+using v1::attr_val;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/kvs_attr_ids.hpp b/include/oneapi/ccl/kvs_attr_ids.hpp
new file mode 100644
index 000000000..f753a2f76
--- /dev/null
+++ b/include/oneapi/ccl/kvs_attr_ids.hpp
@@ -0,0 +1,36 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace v1 {
+
+enum class kvs_attr_id : int {
+    version, // attribute whose value type is ccl::library_version
+
+    last_value // NOTE(review): looks like an end marker, not a real attribute; confirm
+};
+
+} // namespace v1
+
+using v1::kvs_attr_id;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/kvs_attr_ids_traits.hpp b/include/oneapi/ccl/kvs_attr_ids_traits.hpp
new file mode 100644
index 000000000..281955994
--- /dev/null
+++ b/include/oneapi/ccl/kvs_attr_ids_traits.hpp
@@ -0,0 +1,34 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#ifndef CCL_PRODUCT_FULL
+#error "Do not include this file directly. Please include 'ccl.hpp'"
+#endif
+
+namespace ccl {
+
+namespace detail {
+
+template <>
+struct ccl_api_type_attr_traits<kvs_attr_id, kvs_attr_id::version> {
+    using type = ccl::library_version; // type associated with kvs_attr_id::version
+    using return_type = type; // same type as `type` for this attribute
+};
+
+} // namespace detail
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/lp_types.hpp b/include/oneapi/ccl/lp_types.hpp
new file mode 100644
index 000000000..59109e3e6
--- /dev/null
+++ b/include/oneapi/ccl/lp_types.hpp
@@ -0,0 +1,74 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <cstdint>
+#include <ostream>
+#include <string>
+
+namespace ccl {
+
+namespace preview {
+
+// struct float16 {
+//     constexpr float16() : data(0) {}
+//     constexpr float16(uint16_t v) : data(v) {}
+//     uint16_t data;
+
+//     friend std::ostream& operator<<(std::ostream& out, const float16& v) {
+//         out << v.data;
+//         return out;
+//     }
+
+//     friend bool operator==(const float16& v1, const float16& v2) {
+//         return (v1.data == v2.data) ? true : false;
+//     }
+
+//     friend bool operator!=(const float16& v1, const float16& v2) {
+//         return !(v1 == v2);
+//     }
+
+// } __attribute__((packed));
+
+} // namespace preview
+
+namespace v1 {
+
+struct bfloat16 { // 16-bit value holder; only streaming and comparison are defined here
+    constexpr bfloat16() : data(0) {}
+    constexpr bfloat16(uint16_t v) : data(v) {}
+    uint16_t data; // raw 16-bit payload
+
+    // Writes the raw payload to the stream as an unsigned integer.
+    friend std::ostream& operator<<(std::ostream& out, const bfloat16& v) {
+        return out << v.data;
+    }
+
+    // Equality is decided purely by the raw payloads.
+    friend bool operator==(const bfloat16& v1, const bfloat16& v2) {
+        return v1.data == v2.data;
+    }
+
+    friend bool operator!=(const bfloat16& v1, const bfloat16& v2) {
+        return v1.data != v2.data;
+    }
+} __attribute__((packed));
+
+} // namespace v1
+
+using v1::bfloat16;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/native_device_api/empty/context.hpp b/include/oneapi/ccl/native_device_api/empty/context.hpp
index 92291cea7..d326b2392 100644
--- a/include/oneapi/ccl/native_device_api/empty/context.hpp
+++ b/include/oneapi/ccl/native_device_api/empty/context.hpp
@@ -16,6 +16,5 @@
 #pragma once
 
 namespace native {
-struct ccl_context {
-};
-}
+struct ccl_context {};
+} // namespace native
diff --git a/include/oneapi/ccl/native_device_api/empty/device.hpp b/include/oneapi/ccl/native_device_api/empty/device.hpp
index 3970fabed..7105be02a 100644
--- a/include/oneapi/ccl/native_device_api/empty/device.hpp
+++ b/include/oneapi/ccl/native_device_api/empty/device.hpp
@@ -21,4 +21,4 @@ struct ccl_device {
     using device_event = ccl_device_event;
     using device_queue = ccl_device_queue;
 };
-}
+} // namespace native
diff --git a/include/oneapi/ccl/native_device_api/empty/event.hpp b/include/oneapi/ccl/native_device_api/empty/event.hpp
index c0d098acb..1e8edd786 100644
--- a/include/oneapi/ccl/native_device_api/empty/event.hpp
+++ b/include/oneapi/ccl/native_device_api/empty/event.hpp
@@ -16,6 +16,5 @@
 #pragma once
 namespace native {
 
-struct ccl_device_queue {
-};
-}
+struct ccl_device_queue {};
+} // namespace native
diff --git a/include/oneapi/ccl/native_device_api/empty/export.hpp b/include/oneapi/ccl/native_device_api/empty/export.hpp
index bfcad4b3c..9e66d3138 100644
--- a/include/oneapi/ccl/native_device_api/empty/export.hpp
+++ b/include/oneapi/ccl/native_device_api/empty/export.hpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 
 #define CL_BACKEND_TYPE ccl::cl_backend_type::empty_backend
 
@@ -23,16 +23,16 @@
 #include "oneapi/ccl/native_device_api/empty/platform.hpp"
 #include "oneapi/ccl/native_device_api/empty/primitives.hpp"
 
-
-namespace ccl
-{
+namespace ccl {
 
 template <>
 struct backend_info<CL_BACKEND_TYPE> {
     CCL_API static constexpr ccl::cl_backend_type type() {
-        return CL_BACKEND_TYPE; }
+        return CL_BACKEND_TYPE;
+    }
     CCL_API static constexpr const char* name() {
-        return "CL_BACKEND_UNAVAILABLE"; }
+        return "CL_BACKEND_UNAVAILABLE";
+    }
 };
 
 template <>
@@ -41,20 +41,25 @@ struct generic_device_type<CL_BACKEND_TYPE> {
     using impl_t = native::ccl_device;
     using ccl_native_t = std::shared_ptr<impl_t>;
 
-    template<class T>
-    generic_device_type(T&& not_used) {(void)not_used;};
+    template <class T>
+    generic_device_type(T&& not_used) {
+        (void)not_used; // argument is deliberately discarded
+    }
     void get_id() const noexcept;
-    ccl_native_t get() noexcept;
+    ccl_native_t& get() noexcept;
+    const ccl_native_t& get() const noexcept;
 };
 
 template <>
-struct generic_device_context_type<CL_BACKEND_TYPE> {
+struct generic_context_type<CL_BACKEND_TYPE> {
     using handle_t = empty_t;
     using impl_t = native::ccl_context;
     using ccl_native_t = std::shared_ptr<impl_t>;
 
-    template<class T>
-    generic_device_context_type(T&& not_used) {(void)not_used;};
+    template <class T>
+    generic_context_type(T&& not_used) {
+        (void)not_used; // argument is deliberately discarded
+    }
     ccl_native_t get() noexcept;
     const ccl_native_t& get() const noexcept;
 
@@ -92,4 +97,4 @@ struct generic_event_type<CL_BACKEND_TYPE> {
     ccl_native_t get() noexcept;
     const ccl_native_t& get() const noexcept;
 };
-}
+} // namespace ccl
diff --git a/include/oneapi/ccl/native_device_api/empty/platform.hpp b/include/oneapi/ccl/native_device_api/empty/platform.hpp
index 70a27b014..f9fcecbd7 100644
--- a/include/oneapi/ccl/native_device_api/empty/platform.hpp
+++ b/include/oneapi/ccl/native_device_api/empty/platform.hpp
@@ -16,6 +16,5 @@
 #pragma once
 
 namespace native {
-struct ccl_device_platform {
-};
-}
+struct ccl_device_platform {};
+} // namespace native
diff --git a/include/oneapi/ccl/native_device_api/empty/primitives.hpp b/include/oneapi/ccl/native_device_api/empty/primitives.hpp
index 769e06a23..8bd66a66d 100644
--- a/include/oneapi/ccl/native_device_api/empty/primitives.hpp
+++ b/include/oneapi/ccl/native_device_api/empty/primitives.hpp
@@ -16,8 +16,6 @@
 #pragma once
 namespace native {
 
-struct ccl_device_event {
-};
-struct ccl_device_queue {
-};
-}
+struct ccl_device_event {};
+struct ccl_device_queue {};
+} // namespace native
diff --git a/include/oneapi/ccl/native_device_api/empty/queue.hpp b/include/oneapi/ccl/native_device_api/empty/queue.hpp
index c0d098acb..1e8edd786 100644
--- a/include/oneapi/ccl/native_device_api/empty/queue.hpp
+++ b/include/oneapi/ccl/native_device_api/empty/queue.hpp
@@ -16,6 +16,5 @@
 #pragma once
 namespace native {
 
-struct ccl_device_queue {
-};
-}
+struct ccl_device_queue {};
+} // namespace native
diff --git a/include/oneapi/ccl/native_device_api/export_api.hpp b/include/oneapi/ccl/native_device_api/export_api.hpp
index 3a1a79fd8..17d015351 100644
--- a/include/oneapi/ccl/native_device_api/export_api.hpp
+++ b/include/oneapi/ccl/native_device_api/export_api.hpp
@@ -14,12 +14,12 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_config.h"
+#include "oneapi/ccl/config.h"
 
 #ifdef CCL_ENABLE_SYCL
-    #ifdef MULTI_GPU_SUPPORT
-        #include "sycl_l0/export.hpp"
-       /*
+#ifdef MULTI_GPU_SUPPORT
+#include "sycl_l0/export.hpp"
+/*
         #include "oneapi/ccl/native_device_api/l0/base.hpp"
         #include "oneapi/ccl/native_device_api/l0/base_impl.hpp"
 
@@ -32,15 +32,15 @@
         #include "oneapi/ccl/native_device_api/l0/driver.hpp"
         #include "oneapi/ccl/native_device_api/l0/platform.hpp"
         */
-    #else
-        #include "sycl/export.hpp"
-    #endif
 #else
-    #ifdef MULTI_GPU_SUPPORT
-        #include "l0/export.hpp"
-    #else
-        #include "empty/export.hpp"
-    #endif
+#include "sycl/export.hpp"
+#endif
+#else
+#ifdef MULTI_GPU_SUPPORT
+#include "l0/export.hpp"
+#else
+#include "empty/export.hpp"
+#endif
 #endif
 
 #ifndef CL_BACKEND_TYPE
@@ -49,10 +49,10 @@
 namespace ccl {
 using backend_traits = backend_info<CL_BACKEND_TYPE>;
 using unified_device_type = generic_device_type<CL_BACKEND_TYPE>;
-using unified_device_context_type = generic_device_context_type<CL_BACKEND_TYPE>;
+using unified_context_type = generic_context_type<CL_BACKEND_TYPE>;
 using unified_platform_type = generic_platform_type<CL_BACKEND_TYPE>;
 using unified_stream_type = generic_stream_type<CL_BACKEND_TYPE>;
 using unified_event_type = generic_event_type<CL_BACKEND_TYPE>;
-}
+} // namespace ccl
 
 #include "interop_utils.hpp"
diff --git a/include/oneapi/ccl/native_device_api/interop_utils.hpp b/include/oneapi/ccl/native_device_api/interop_utils.hpp
index ccaae1b13..15587b6b8 100644
--- a/include/oneapi/ccl/native_device_api/interop_utils.hpp
+++ b/include/oneapi/ccl/native_device_api/interop_utils.hpp
@@ -14,14 +14,14 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl.hpp>
 #endif
 
 namespace native {
-namespace details {
+namespace detail {
 
 #ifdef CCL_ENABLE_SYCL
 size_t get_sycl_device_id(const cl::sycl::device& dev);
@@ -31,24 +31,28 @@ std::string usm_to_string(cl::sycl::usm::alloc val);
 enum usm_support_mode { prohibited = 0, direct, shared, need_conversion, last_value };
 std::string to_string(usm_support_mode val);
 
-using assoc_retult = std::tuple<usm_support_mode, const void*, std::string>;
+using assoc_result = std::tuple<usm_support_mode, const void*, std::string>;
 enum assoc_result_index { SUPPORT_MODE = 0, POINTER_VALUE, ERROR_CAUSE };
 
 #if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-assoc_retult check_assoc_device_memory(const void* mem,
+assoc_result check_assoc_device_memory(const void* mem,
                                        const ccl::unified_device_type::ccl_native_t& device,
-                                       const ccl::unified_device_context_type::ccl_native_t& ctx);
+                                       const ccl::unified_context_type::ccl_native_t& ctx);
+
+usm_support_mode check_assoc_device_memory(const std::vector<void*>& mems,
+                                           const ccl::unified_device_type::ccl_native_t& device,
+                                           const ccl::unified_context_type::ccl_native_t& ctx);
 
 #endif //defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-std::string to_string(const assoc_retult& res);
+std::string to_string(const assoc_result& res);
 
 #if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
 template <size_t N>
-using multiple_assoc_result = std::array<assoc_retult, N>;
+using multiple_assoc_result = std::array<assoc_result, N>;
 
 template <class... mem_type>
 auto check_multiple_assoc_device_memory(const ccl::unified_device_type::ccl_native_t& device,
-                                        const ccl::unified_device_context_type::ccl_native_t& ctx,
+                                        const ccl::unified_context_type::ccl_native_t& ctx,
                                         const mem_type*... mem)
     -> multiple_assoc_result<sizeof...(mem)> {
     multiple_assoc_result<sizeof...(mem)> ret{ check_assoc_device_memory(mem, device, ctx)... };
@@ -64,5 +68,5 @@ std::string to_string(const multiple_assoc_result<N>& res) {
     return ss.str();
 }
 #endif //defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/include/oneapi/ccl/native_device_api/l0/base.hpp b/include/oneapi/ccl/native_device_api/l0/base.hpp
index c7429a056..18e8721fa 100644
--- a/include/oneapi/ccl/native_device_api/l0/base.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/base.hpp
@@ -23,9 +23,9 @@
 #include <ze_api.h>
 
 #ifndef UT
-//#include "oneapi/ccl/ccl_types.hpp"
-//#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
+//#include "oneapi/ccl/types.hpp"
+//#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
 #endif
 
 namespace native {
diff --git a/include/oneapi/ccl/native_device_api/l0/base_impl.hpp b/include/oneapi/ccl/native_device_api/l0/base_impl.hpp
index 19447d44b..61a951030 100644
--- a/include/oneapi/ccl/native_device_api/l0/base_impl.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/base_impl.hpp
@@ -59,7 +59,7 @@ template <TEMPLATE_DECL_ARG>
 cl_base<TEMPLATE_DEF_ARG>::~cl_base() noexcept {
     auto lock = owner.lock();
     // auto ctx = context.lock(); ctx->get();
-    ze_context_handle_t ctxtmp;
+    ze_context_handle_t ctxtmp = nullptr;
     if (lock) {
         lock->on_delete(handle, ctxtmp);
     }
@@ -226,7 +226,7 @@ indexed_storage<value_type> merge_indexed_values(const IndexedContainer& indexes
 }
 
 template <ccl::device_index_enum index_id, class value_type, class value_type_index_extractor>
-indexed_storage<value_type> collect_indexed_data(const ccl::device_indices_t& indexes,
+indexed_storage<value_type> collect_indexed_data(const ccl::device_indices_type& indexes,
                                                  std::vector<value_type>& collected_values,
                                                  value_type_index_extractor functor) {
     indexed_storage<value_type> ret;
diff --git a/include/oneapi/ccl/native_device_api/l0/context.hpp b/include/oneapi/ccl/native_device_api/l0/context.hpp
index ea108184f..c5fb08f28 100644
--- a/include/oneapi/ccl/native_device_api/l0/context.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/context.hpp
@@ -41,16 +41,29 @@ struct ccl_context : public cl_base<ze_context_handle_t, ccl_device_platform, cc
     std::shared_ptr<ccl_context> get_ptr() {
         return this->shared_from_this();
     }
-
 };
 
-struct ccl_context_holder
-{
-    std::map<ccl_device_driver*, std::vector<std::shared_ptr<ccl_context>>> map_context;
+class context_array_t {
+public:
+    using value_type = std::vector<std::shared_ptr<ccl_context>>;
+    using context_array_accessor = detail::unique_accessor<std::mutex, value_type>;
 
-    ze_context_handle_t get() {
-        return nullptr;
-    }
+    context_array_accessor access();
+
+private:
+    std::mutex m;
+    value_type contexts;
 };
 
+struct ccl_context_holder { // keeps ccl_context objects grouped by the driver they belong to
+    ze_context_handle_t get();
+    std::shared_ptr<ccl_context> emplace(ccl_device_driver* driver,
+                                         std::shared_ptr<ccl_context>&& ctx);
+    context_array_t& get_context_storage(ccl_device_driver* driver);
+
+private:
+    std::mutex m; // NOTE(review): presumably guards drivers_context; confirm in the .cpp
+    std::map<ccl_device_driver*, context_array_t> drivers_context; // per-driver context arrays
+};
+using ccl_driver_context_ptr = std::shared_ptr<ccl_context>;
 } // namespace native
diff --git a/include/oneapi/ccl/native_device_api/l0/device.hpp b/include/oneapi/ccl/native_device_api/l0/device.hpp
index 635989847..a6ccafc5b 100644
--- a/include/oneapi/ccl/native_device_api/l0/device.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/device.hpp
@@ -27,9 +27,9 @@ struct ccl_subdevice;
 struct ccl_device;
 struct ccl_context;
 
-details::cross_device_rating property_p2p_rating_calculator(const ccl_device& lhs,
-                                                            const ccl_device& rhs,
-                                                            size_t weight);
+detail::cross_device_rating property_p2p_rating_calculator(const ccl_device& lhs,
+                                                           const ccl_device& rhs,
+                                                           size_t weight);
 
 // TODO not thread-safe!!!
 struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_context_holder>,
@@ -75,11 +75,11 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
 
     static indexed_handles get_handles(
         const ccl_device_driver& driver,
-        const ccl::device_indices_t& indexes = ccl::device_indices_t());
+        const ccl::device_indices_type& indexes = ccl::device_indices_type());
     static std::shared_ptr<ccl_device> create(
         handle_t h,
         owner_ptr_t&& driver,
-        const ccl::device_indices_t& indexes = ccl::device_indices_t());
+        const ccl::device_indices_type& indexes = ccl::device_indices_type());
 
     std::shared_ptr<ccl_device> get_ptr() {
         return this->shared_from_this();
@@ -89,7 +89,7 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
         return this->shared_from_this();
     }
 
-    context_storage_type get_device_contexts();
+    context_storage_type get_contexts();
     sub_devices_container_type& get_subdevices();
     const sub_devices_container_type& get_subdevices() const;
 
@@ -108,43 +108,59 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
     template <class elem_t>
     device_memory<elem_t> alloc_memory(
         size_t count,
-        size_t alignment, std::shared_ptr<ccl_context> ctx,
+        size_t alignment,
+        std::shared_ptr<ccl_context> ctx,
         const ze_device_mem_alloc_desc_t& mem_descr = get_default_mem_alloc_desc(),
         const ze_host_mem_alloc_desc_t& host_descr = get_default_host_alloc_desc()) {
-        return device_memory<elem_t>(reinterpret_cast<elem_t*>(device_alloc_memory(
-                                         count * sizeof(elem_t), alignment, mem_descr, host_descr, ctx)),
-                                     count,
-                                     get_ptr(), ctx);
+        return device_memory<elem_t>(
+            reinterpret_cast<elem_t*>(
+                device_alloc_memory(count * sizeof(elem_t), alignment, mem_descr, host_descr, ctx)),
+            count,
+            get_ptr(),
+            ctx);
     }
 
     template <class elem_t>
     device_memory_ptr<elem_t> alloc_shared_memory(
         size_t count,
         size_t alignment,
-        const ze_host_mem_alloc_desc_t& host_desc, std::shared_ptr<ccl_context> ctx,
+        const ze_host_mem_alloc_desc_t& host_desc,
+        std::shared_ptr<ccl_context> ctx,
         const ze_device_mem_alloc_desc_t& mem_descr = get_default_mem_alloc_desc()) {
         return std::make_shared<device_memory<elem_t>>(
             reinterpret_cast<elem_t*>(device_alloc_shared_memory(
                 count * sizeof(elem_t), alignment, host_desc, mem_descr, ctx)),
             count,
-            get_ptr(), ctx);
+            get_ptr(),
+            ctx);
     }
-    device_ipc_memory_handle create_ipc_memory_handle(void* device_mem_ptr, std::shared_ptr<ccl_context> ctx);
-    std::shared_ptr<device_ipc_memory_handle> create_shared_ipc_memory_handle(void* device_mem_ptr, std::shared_ptr<ccl_context> ctx);
+    device_ipc_memory_handle create_ipc_memory_handle(void* device_mem_ptr,
+                                                      std::shared_ptr<ccl_context> ctx);
+    std::shared_ptr<device_ipc_memory_handle> create_shared_ipc_memory_handle(
+        void* device_mem_ptr,
+        std::shared_ptr<ccl_context> ctx);
 
-    device_ipc_memory get_ipc_memory(std::shared_ptr<device_ipc_memory_handle>&& handle, std::shared_ptr<ccl_context> ctx);
+    device_ipc_memory get_ipc_memory(std::shared_ptr<device_ipc_memory_handle>&& handle,
+                                     std::shared_ptr<ccl_context> ctx);
     std::shared_ptr<device_ipc_memory> restore_shared_ipc_memory(
-        std::shared_ptr<device_ipc_memory_handle>&& handle, std::shared_ptr<ccl_context> ctx);
+        std::shared_ptr<device_ipc_memory_handle>&& handle,
+        std::shared_ptr<ccl_context> ctx);
 
-    device_queue create_cmd_queue(std::shared_ptr<ccl_context> ctx,
+    device_queue create_cmd_queue(
+        std::shared_ptr<ccl_context> ctx,
         const ze_command_queue_desc_t& properties = get_default_queue_desc());
-    ze_fence_handle_t create_or_get_fence(const device_queue& queue, std::shared_ptr<ccl_context> ctx);
-    device_queue& get_cmd_queue(const ze_command_queue_desc_t& properties, std::shared_ptr<ccl_context> ctx);
-    device_cmd_list create_cmd_list(std::shared_ptr<ccl_context> ctx,
+    device_queue_fence& get_fence(const device_queue& queue, std::shared_ptr<ccl_context> ctx);
+    device_queue& get_cmd_queue(const ze_command_queue_desc_t& properties,
+                                std::shared_ptr<ccl_context> ctx);
+    device_cmd_list create_cmd_list(
+        std::shared_ptr<ccl_context> ctx,
         const ze_command_list_desc_t& properties = get_default_list_desc());
-    device_cmd_list& get_cmd_list(std::shared_ptr<ccl_context> ctx,
+    device_cmd_list& get_cmd_list(
+        std::shared_ptr<ccl_context> ctx,
         const ze_command_list_desc_t& properties = get_default_list_desc());
-    device_module_ptr create_module(const ze_module_desc_t& descr, size_t hash, std::shared_ptr<ccl_context> ctx);
+    device_module_ptr create_module(const ze_module_desc_t& descr,
+                                    size_t hash,
+                                    std::shared_ptr<ccl_context> ctx);
 
     template <class elem_t>
     bool is_own_memory(const device_memory<elem_t>& mem_handle) {
@@ -154,7 +170,8 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
             //return true;
         }*/
 
-        handle_t assoc_handle = get_assoc_device_handle(mem_handle.handle, get_owner(), std::shared_ptr<ccl_context> { });
+        handle_t assoc_handle =
+            get_assoc_device_handle(mem_handle.handle, get_owner(), std::shared_ptr<ccl_context>{});
         return assoc_handle == handle;
     }
 
@@ -170,7 +187,7 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
     template <class elem_t>
     void on_delete(elem_t* mem_handle, ze_context_handle_t& ctx) {
         // TODO: ctx
-        device_free_memory(static_cast<void*>(mem_handle), std::shared_ptr<ccl_context> { });
+        device_free_memory(static_cast<void*>(mem_handle), std::shared_ptr<ccl_context>{});
     }
 
     // serialize/deserialize
@@ -185,10 +202,13 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
     virtual size_t serialize(std::vector<uint8_t>& out,
                              size_t from_pos,
                              size_t expected_size) const;
-    std::shared_ptr<ccl_context> get_default_device_context();
+    std::shared_ptr<ccl_context> get_default_context();
 
 private:
-    ccl_device(handle_t h, owner_ptr_t&& parent, std::weak_ptr<ccl_context_holder>&& ctx, std::false_type);
+    ccl_device(handle_t h,
+               owner_ptr_t&& parent,
+               std::weak_ptr<ccl_context_holder>&& ctx,
+               std::false_type);
     void initialize_device_data();
     void* device_alloc_memory(size_t bytes_count,
                               size_t alignment,
@@ -201,8 +221,9 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
                                      const ze_device_mem_alloc_desc_t& mem_descr,
                                      std::shared_ptr<ccl_context> ctx);
 
-    static handle_t get_assoc_device_handle(const void* ptr, const ccl_device_driver* driver,
-                                                            std::shared_ptr<ccl_context> ctx);
+    static handle_t get_assoc_device_handle(const void* ptr,
+                                            const ccl_device_driver* driver,
+                                            std::shared_ptr<ccl_context> ctx);
     void device_free_memory(void* mem_ptr, std::shared_ptr<ccl_context> ctx);
 
     //TODO shared mutex?
@@ -210,6 +231,7 @@ struct ccl_device : public cl_base<ze_device_handle_t, ccl_device_driver, ccl_co
     std::map<ze_command_queue_desc_t, device_queue, command_queue_desc_comparator> cmd_queus;
     std::map<ze_command_queue_handle_t, device_queue_fence> queue_fences;
 
+    std::mutex list_mutex;
     std::map<ze_command_list_desc_t, device_cmd_list, command_list_desc_comparator> cmd_lists;
     sub_devices_container_type sub_devices;
 
diff --git a/include/oneapi/ccl/native_device_api/l0/driver.hpp b/include/oneapi/ccl/native_device_api/l0/driver.hpp
index 5ee420051..60612c510 100644
--- a/include/oneapi/ccl/native_device_api/l0/driver.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/driver.hpp
@@ -14,6 +14,7 @@
  limitations under the License.
 */
 #pragma once
+
 #include <iostream>
 #include <map>
 #include <memory>
@@ -27,8 +28,9 @@ struct ccl_device_driver;
 struct ccl_device;
 struct ccl_context;
 struct ccl_context_holder;
-struct ccl_device_driver : public cl_base<ze_driver_handle_t, ccl_device_platform, ccl_context_holder>,
-                           std::enable_shared_from_this<ccl_device_driver> {
+struct ccl_device_driver
+        : public cl_base<ze_driver_handle_t, ccl_device_platform, ccl_context_holder>,
+          std::enable_shared_from_this<ccl_device_driver> {
     friend std::ostream& operator<<(std::ostream&, const ccl_device_driver&);
 
     using base = cl_base<ze_driver_handle_t, ccl_device_platform, ccl_context_holder>;
@@ -44,10 +46,13 @@ struct ccl_device_driver : public cl_base<ze_driver_handle_t, ccl_device_platfor
     using devices_storage_type = std::map<ccl::index_type, device_ptr>;
     using indexed_driver_handles = indexed_storage<handle_t>;
 
-    ccl_device_driver(handle_t h, uint32_t id, owner_ptr_t&& platform, std::weak_ptr<ccl_context_holder>&& ctx);
+    ccl_device_driver(handle_t h,
+                      uint32_t id,
+                      owner_ptr_t&& platform,
+                      std::weak_ptr<ccl_context_holder>&& ctx);
 
     static indexed_driver_handles get_handles(
-        const ccl::device_indices_t& requested_driver_indexes = ccl::device_indices_t());
+        const ccl::device_indices_type& requested_driver_indexes = ccl::device_indices_type());
     static std::shared_ptr<ccl_device_driver> create(
         handle_t h,
         uint32_t id,
@@ -58,7 +63,7 @@ struct ccl_device_driver : public cl_base<ze_driver_handle_t, ccl_device_platfor
         handle_t h,
         uint32_t id,
         owner_ptr_t&& platform,
-        const ccl::device_indices_t& rank_device_affinity = ccl::device_indices_t());
+        const ccl::device_indices_type& rank_device_affinity = ccl::device_indices_type());
 
     std::shared_ptr<ccl_device_driver> get_ptr() {
         return this->shared_from_this();
@@ -74,6 +79,7 @@ struct ccl_device_driver : public cl_base<ze_driver_handle_t, ccl_device_platfor
     const_device_ptr get_device(const ccl::device_index_type& path) const;
 
     std::shared_ptr<ccl_context> create_context();
+    std::shared_ptr<ccl_context> create_context_from_handle(ccl_context::handle_t);
 
     std::string to_string(const std::string& prefix = std::string()) const;
 
@@ -92,8 +98,8 @@ struct ccl_device_driver : public cl_base<ze_driver_handle_t, ccl_device_platfor
     // utility
     static ccl::device_mask_t create_device_mask(const std::string& str_mask,
                                                  std::ios_base::fmtflags flag = std::ios_base::hex);
-    static ccl::device_indices_t get_device_indices(const ccl::device_mask_t& mask);
-    static ccl::device_mask_t get_device_mask(const ccl::device_indices_t& device_idx);
+    static ccl::device_indices_type get_device_indices(const ccl::device_mask_t& mask);
+    static ccl::device_mask_t get_device_mask(const ccl::device_indices_type& device_idx);
 
     uint32_t driver_id;
 
diff --git a/include/oneapi/ccl/native_device_api/l0/export.hpp b/include/oneapi/ccl/native_device_api/l0/export.hpp
index 8c5540f96..c80283a3b 100644
--- a/include/oneapi/ccl/native_device_api/l0/export.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/export.hpp
@@ -14,21 +14,22 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 
 #define CL_BACKEND_TYPE ccl::cl_backend_type::l0
 
 #include "oneapi/ccl/native_device_api/l0/declarations.hpp"
 
-namespace ccl
-{
+namespace ccl {
 
 template <>
 struct backend_info<CL_BACKEND_TYPE> {
     CCL_API static constexpr ccl::cl_backend_type type() {
-        return CL_BACKEND_TYPE; }
+        return CL_BACKEND_TYPE;
+    }
     CCL_API static constexpr const char* name() {
-        return "CL_INTEL_L0_BACKEND"; }
+        return "CL_INTEL_L0_BACKEND";
+    }
 };
 
 template <>
@@ -38,25 +39,23 @@ struct generic_device_type<CL_BACKEND_TYPE> {
     using ccl_native_t = std::shared_ptr<impl_t>;
 
     generic_device_type(device_index_type id);
+    generic_device_type(ccl_native_t dev);
     device_index_type get_id() const noexcept;
-    ccl_native_t get() noexcept;
+    ccl_native_t& get() noexcept;
+    const ccl_native_t& get() const noexcept;
 
     handle_t device;
 };
 
-#ifndef ze_context_handle_t
-#define ze_context_handle_t void*
-#endif
-
 template <>
-struct generic_device_context_type<CL_BACKEND_TYPE> {
+struct generic_context_type<CL_BACKEND_TYPE> {
     using handle_t = ze_context_handle_t;
     using impl_t = native::ccl_context;
     using ccl_native_t = std::shared_ptr<impl_t>;
 
-    generic_device_context_type();
-    generic_device_context_type(handle_t ctx);
-    ccl_native_t get() noexcept;
+    generic_context_type();
+    generic_context_type(ccl_native_t ctx);
+    ccl_native_t& get() noexcept;
     const ccl_native_t& get() const noexcept;
 
     ccl_native_t context;
@@ -66,20 +65,20 @@ template <>
 struct generic_platform_type<CL_BACKEND_TYPE> {
     using handle_t = native::ccl_device_platform;
     using impl_t = handle_t;
-    using ccl_native_t = std::shared_ptr<impl_t>;
+    using ccl_native_t = impl_t;
 
-    ccl_native_t get() noexcept;
+    ccl_native_t& get() noexcept;
     const ccl_native_t& get() const noexcept;
 };
 
 template <>
 struct generic_stream_type<CL_BACKEND_TYPE> {
     using handle_t = ze_command_queue_handle_t;
-    using impl_t = handle_t;
-    using ccl_native_t = std::shared_ptr<native::ccl_device::device_queue>;
+    using impl_t = native::ccl_device::device_queue;
+    using ccl_native_t = std::shared_ptr<impl_t>;
 
     generic_stream_type(handle_t q);
-    ccl_native_t get() noexcept;
+    ccl_native_t& get() noexcept;
     const ccl_native_t& get() const noexcept;
 
     ccl_native_t queue;
@@ -92,7 +91,7 @@ struct generic_event_type<CL_BACKEND_TYPE> {
     using ccl_native_t = std::shared_ptr<native::ccl_device::device_event>;
 
     generic_event_type(handle_t e);
-    ccl_native_t get() noexcept;
+    ccl_native_t& get() noexcept;
     const ccl_native_t& get() const noexcept;
 
     ccl_native_t event;
@@ -104,4 +103,4 @@ struct generic_event_type<CL_BACKEND_TYPE> {
 API_CLASS_TYPE_INFO(native::ccl_device::device_queue);
 //API_CLASS_TYPE_INFO(ze_command_queue_handle_t);
 API_CLASS_TYPE_INFO(ze_event_handle_t);
-}
+} // namespace ccl
diff --git a/include/oneapi/ccl/native_device_api/l0/platform.hpp b/include/oneapi/ccl/native_device_api/l0/platform.hpp
index ba27309dc..4b625376a 100644
--- a/include/oneapi/ccl/native_device_api/l0/platform.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/platform.hpp
@@ -28,7 +28,7 @@ struct ccl_device_platform : std::enable_shared_from_this<ccl_device_platform> {
     using context_storage_type = std::shared_ptr<ccl_context_holder>;
 
     //void init_drivers(const device_affinity_per_driver& affinities / * = device_affinity_per_driver()* /);
-    void init_drivers(const ccl::device_indices_t& indices = ccl::device_indices_t());
+    void init_drivers(const ccl::device_indices_type& indices = ccl::device_indices_type());
 
     std::shared_ptr<ccl_device_platform> get_ptr() {
         return this->shared_from_this();
@@ -50,12 +50,12 @@ struct ccl_device_platform : std::enable_shared_from_this<ccl_device_platform> {
     void on_delete(ccl_context::handle_t& context, ze_context_handle_t& ctx);
 
     static std::shared_ptr<ccl_device_platform> create(
-        const ccl::device_indices_t& indices = ccl::device_indices_t());
+        const ccl::device_indices_type& indices = ccl::device_indices_type());
     //static std::shared_ptr<ccl_device_platform> create(const device_affinity_per_driver& affinities);
 
-    details::adjacency_matrix calculate_device_access_metric(
-        const ccl::device_indices_t& indices = ccl::device_indices_t(),
-        details::p2p_rating_function func = details::binary_p2p_rating_calculator) const;
+    detail::adjacency_matrix calculate_device_access_metric(
+        const ccl::device_indices_type& indices = ccl::device_indices_type(),
+        detail::p2p_rating_function func = detail::binary_p2p_rating_calculator) const;
 
 private:
     ccl_device_platform();
diff --git a/include/oneapi/ccl/native_device_api/l0/primitives.hpp b/include/oneapi/ccl/native_device_api/l0/primitives.hpp
index d55431b71..ddd61523b 100644
--- a/include/oneapi/ccl/native_device_api/l0/primitives.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/primitives.hpp
@@ -34,6 +34,7 @@ std::string to_string(const ze_device_compute_properties_t& compute_properties,
                       const std::string& prefix = std::string("\n"));
 std::string to_string(const ze_memory_allocation_properties_t& prop);
 std::string to_string(const ze_device_p2p_properties_t& properties);
+std::string to_string(const ze_device_mem_alloc_desc_t& mem_descr);
 std::string to_string(const ze_ipc_mem_handle_t& handle);
 
 /**
@@ -66,13 +67,18 @@ template <class resource_owner, class cl_context>
 using event = cl_base<ze_event_handle_t, resource_owner, cl_context>;
 
 template <class elem_t, class resource_owner, class cl_context>
-struct memory/*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t*, resource_owner, cl_context> {
+struct memory /*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t*,
+                                                                         resource_owner,
+                                                                         cl_context> {
     using base = cl_base<elem_t*, resource_owner, cl_context>;
     using base::get_owner;
     using base::get_ctx;
     using base::handle;
 
-    memory(elem_t* h, size_t count, std::weak_ptr<resource_owner>&& owner, std::weak_ptr<cl_context>&& context);
+    memory(elem_t* h,
+           size_t count,
+           std::weak_ptr<resource_owner>&& owner,
+           std::weak_ptr<cl_context>&& context);
 
     /**
      *  Memory operations
@@ -84,16 +90,18 @@ struct memory/*<elem_t, resource_owner, cl_context>*/ : private cl_base<elem_t*,
     template <int N>
     void enqueue_write_sync(const std::array<elem_t, N>& src);
     void enqueue_write_sync(const elem_t* src, size_t n);
+    void enqueue_write_sync(const elem_t* src, int n);
 
     // async
-    queue_fence<resource_owner, cl_context> enqueue_write_async(const std::vector<elem_t>& src,
-                                                    queue<resource_owner, cl_context>& queue);
+    queue_fence<resource_owner, cl_context> enqueue_write_async(
+        const std::vector<elem_t>& src,
+        queue<resource_owner, cl_context>& queue);
     template <int N>
-    queue_fence<resource_owner, cl_context> enqueue_write_async(const std::array<elem_t, N>& src,
-                                                    queue<resource_owner, cl_context>& queue);
-    queue_fence<resource_owner, cl_context> enqueue_write_async(const elem_t* src,
-                                                    size_t n,
-                                                    queue<resource_owner, cl_context>& queue);
+    queue_fence<resource_owner, cl_context> enqueue_write_async(
+        const std::array<elem_t, N>& src,
+        queue<resource_owner, cl_context>& queue);
+    queue_fence<resource_owner, cl_context>
+    enqueue_write_async(const elem_t* src, size_t n, queue<resource_owner, cl_context>& queue);
 
     // sync memory-copy read
     std::vector<elem_t> enqueue_read_sync(size_t requested_size = 0) const;
diff --git a/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp b/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp
index 7af8fef44..7a3294ecd 100644
--- a/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/primitives_impl.hpp
@@ -34,7 +34,10 @@ void copy_memory_to_device_sync_unsafe(void* dst,
 }
 
 template <TEMPLATE_DECL_ARG>
-memory<TEMPLATE_DEF_ARG>::memory(elem_t* h, size_t count, std::weak_ptr<resource_owner>&& owner, std::weak_ptr<cl_context>&& context)
+memory<TEMPLATE_DEF_ARG>::memory(elem_t* h,
+                                 size_t count,
+                                 std::weak_ptr<resource_owner>&& owner,
+                                 std::weak_ptr<cl_context>&& context)
         : base(h, std::move(owner), std::move(context)),
           elem_count(count) {}
 
@@ -117,6 +120,7 @@ void memory<TEMPLATE_DEF_ARG>::enqueue_write_sync(const std::array<elem_t, N>& s
         throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + "\n" + ex.what());
     }
 }
+
 template <TEMPLATE_DECL_ARG>
 void memory<TEMPLATE_DEF_ARG>::enqueue_write_sync(const elem_t* src, size_t src_elem_count) {
     if (!src) {
@@ -142,6 +146,12 @@ void memory<TEMPLATE_DEF_ARG>::enqueue_write_sync(const elem_t* src, size_t src_
     }
 }
 
+template <TEMPLATE_DECL_ARG>
+void memory<TEMPLATE_DEF_ARG>::enqueue_write_sync(const elem_t* src, int src_elem_count) {
+    // int-count convenience overload: convert explicitly, then defer to the size_t overload.
+    enqueue_write_sync(src, static_cast<size_t>(src_elem_count));
+}
+
 template <TEMPLATE_DECL_ARG>
 std::vector<elem_t> memory<TEMPLATE_DEF_ARG>::enqueue_read_sync(
     size_t src_elem_count /* = 0*/) const {
diff --git a/include/oneapi/ccl/native_device_api/l0/subdevice.hpp b/include/oneapi/ccl/native_device_api/l0/subdevice.hpp
index a7c800c91..35bfe08bb 100644
--- a/include/oneapi/ccl/native_device_api/l0/subdevice.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/subdevice.hpp
@@ -32,13 +32,16 @@ struct ccl_subdevice : public ccl_device {
 
     friend std::ostream& operator<<(std::ostream&, const ccl_subdevice& node);
 
-    ccl_subdevice(handle_t h, owner_ptr_t&& device, base::owner_ptr_t&& driver, base::context_ptr_t&& ctx);
+    ccl_subdevice(handle_t h,
+                  owner_ptr_t&& device,
+                  base::owner_ptr_t&& driver,
+                  base::context_ptr_t&& ctx);
     virtual ~ccl_subdevice();
 
     // factory
     static indexed_handles get_handles(
         const ccl_device& device,
-        const ccl::device_indices_t& requested_indices = ccl::device_indices_t());
+        const ccl::device_indices_type& requested_indices = ccl::device_indices_type());
     static std::shared_ptr<ccl_subdevice> create(handle_t h,
                                                  owner_ptr_t&& device,
                                                  base::owner_ptr_t&& driver);
@@ -64,7 +67,11 @@ struct ccl_subdevice : public ccl_device {
                                                     ccl_device_platform& platform);
 
 private:
-    ccl_subdevice(handle_t h, owner_ptr_t&& device, base::owner_ptr_t&& driver, base::context_ptr_t&& ctx, std::false_type);
+    ccl_subdevice(handle_t h,
+                  owner_ptr_t&& device,
+                  base::owner_ptr_t&& driver,
+                  base::context_ptr_t&& ctx,
+                  std::false_type);
     void initialize_subdevice_data();
     owner_ptr_t parent_device;
 };
diff --git a/include/oneapi/ccl/native_device_api/l0/utils.hpp b/include/oneapi/ccl/native_device_api/l0/utils.hpp
index 4b95ab064..c89e9af82 100644
--- a/include/oneapi/ccl/native_device_api/l0/utils.hpp
+++ b/include/oneapi/ccl/native_device_api/l0/utils.hpp
@@ -16,13 +16,13 @@
 #pragma once
 #include <functional>
 
-#include "oneapi/ccl/ccl_types.hpp"
-//#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+//#include "oneapi/ccl/type_traits.hpp"
 
 namespace native {
 
 struct ccl_device;
-namespace details {
+namespace detail {
 
 /*
  * Boolean matrix represents P2P device capable connectivity 'cross_device_rating'
@@ -51,5 +51,19 @@ using p2p_rating_function =
     std::function<cross_device_rating(const ccl_device&, const ccl_device&)>;
 
 cross_device_rating binary_p2p_rating_calculator(const ccl_device& lhs, const ccl_device& rhs);
-} // namespace details
+
+template <class Lock, class Resource>
+struct unique_accessor { // bundles a std::unique_lock<Lock> with a reference to a Resource
+    unique_accessor(Lock& mutex, Resource& storage) : lock(mutex), inner_data(storage) {}
+    unique_accessor(unique_accessor&& src) = default;
+    // NOTE(review): std::unique_lock lives in <mutex>, not included directly here - confirm.
+    Resource& get() {
+        return inner_data;
+    }
+
+private:
+    std::unique_lock<Lock> lock; // constructed from `mutex`, so it locks it (std::unique_lock)
+    Resource& inner_data; // non-owning reference to the constructor's `storage` argument
+};
+} // namespace detail
 } // namespace native
diff --git a/include/oneapi/ccl/native_device_api/sycl/export.hpp b/include/oneapi/ccl/native_device_api/sycl/export.hpp
index 510e41dbc..17b665e05 100644
--- a/include/oneapi/ccl/native_device_api/sycl/export.hpp
+++ b/include/oneapi/ccl/native_device_api/sycl/export.hpp
@@ -14,24 +14,25 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 
 #define CL_BACKEND_TYPE ccl::cl_backend_type::dpcpp_sycl
 #include <CL/sycl.hpp>
 
-namespace ccl
-{
+namespace ccl {
 template <>
 struct backend_info<CL_BACKEND_TYPE> {
     CCL_API static constexpr ccl::cl_backend_type type() {
-        return CL_BACKEND_TYPE; }
+        return CL_BACKEND_TYPE;
+    }
     CCL_API static constexpr const char* name() {
-        return "CL_DPCPP_BACKEND"; }
+        return "CL_DPCPP_BACKEND";
+    }
 };
 
 template <>
 struct generic_device_type<CL_BACKEND_TYPE> {
-    using handle_t = cl_device_id;//cl::sycl::device;
+    using handle_t = cl_device_id; //cl::sycl::device;
     using impl_t = cl::sycl::device;
     using ccl_native_t = impl_t;
 
@@ -40,18 +41,19 @@ struct generic_device_type<CL_BACKEND_TYPE> {
     generic_device_type(const cl::sycl::device& device);
     device_index_type get_id() const;
     ccl_native_t& get() noexcept;
+    const ccl_native_t& get() const noexcept;
 
     cl::sycl::device device;
 };
 
 template <>
-struct generic_device_context_type<CL_BACKEND_TYPE> {
+struct generic_context_type<CL_BACKEND_TYPE> {
     using handle_t = cl_context;
     using impl_t = cl::sycl::context;
     using ccl_native_t = impl_t;
 
-    generic_device_context_type();
-    generic_device_context_type(ccl_native_t ctx);
+    generic_context_type();
+    generic_context_type(ccl_native_t ctx);
     ccl_native_t& get() noexcept;
     const ccl_native_t& get() const noexcept;
 
@@ -102,4 +104,4 @@ struct generic_event_type<CL_BACKEND_TYPE> {
 API_CLASS_TYPE_INFO(cl_command_queue);
 API_CLASS_TYPE_INFO(cl_context);
 API_CLASS_TYPE_INFO(cl_event)
-}
+} // namespace ccl
diff --git a/include/oneapi/ccl/native_device_api/sycl_l0/export.hpp b/include/oneapi/ccl/native_device_api/sycl_l0/export.hpp
index cc8d49ac6..07d06eaba 100644
--- a/include/oneapi/ccl/native_device_api/sycl_l0/export.hpp
+++ b/include/oneapi/ccl/native_device_api/sycl_l0/export.hpp
@@ -14,24 +14,26 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
+
+#include "oneapi/ccl/types.hpp"
 
 #define CL_BACKEND_TYPE ccl::cl_backend_type::dpcpp_sycl_l0
 #include <CL/sycl.hpp>
 
-namespace ccl
-{
+namespace ccl {
 template <>
 struct backend_info<CL_BACKEND_TYPE> {
     CCL_API static constexpr ccl::cl_backend_type type() {
-        return CL_BACKEND_TYPE; }
+        return CL_BACKEND_TYPE;
+    }
     CCL_API static constexpr const char* name() {
-        return "CL_DPCPP_POWERED_L0_BACKEND"; }
+        return "CL_DPCPP_POWERED_L0_BACKEND";
+    }
 };
 
 template <>
 struct generic_device_type<CL_BACKEND_TYPE> {
-    using handle_t = cl_device_id;//cl::sycl::device;
+    using handle_t = cl_device_id; //cl::sycl::device;
     using impl_t = cl::sycl::device;
     using ccl_native_t = impl_t;
 
@@ -40,18 +42,19 @@ struct generic_device_type<CL_BACKEND_TYPE> {
     generic_device_type(const cl::sycl::device& device);
     device_index_type get_id() const;
     ccl_native_t& get() noexcept;
+    const ccl_native_t& get() const noexcept;
 
     cl::sycl::device device;
 };
 
 template <>
-struct generic_device_context_type<CL_BACKEND_TYPE> {
+struct generic_context_type<CL_BACKEND_TYPE> {
     using handle_t = cl_context;
     using impl_t = cl::sycl::context;
     using ccl_native_t = impl_t;
 
-    generic_device_context_type();
-    generic_device_context_type(ccl_native_t ctx);
+    generic_context_type();
+    generic_context_type(ccl_native_t ctx);
     ccl_native_t& get() noexcept;
     const ccl_native_t& get() const noexcept;
 
@@ -102,4 +105,4 @@ struct generic_event_type<CL_BACKEND_TYPE> {
 API_CLASS_TYPE_INFO(cl_command_queue);
 API_CLASS_TYPE_INFO(cl_context);
 API_CLASS_TYPE_INFO(cl_event)
-}
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_stream.hpp b/include/oneapi/ccl/stream.hpp
similarity index 63%
rename from include/oneapi/ccl/ccl_stream.hpp
rename to include/oneapi/ccl/stream.hpp
index 2dc7f82f9..9fbb07089 100644
--- a/include/oneapi/ccl/ccl_stream.hpp
+++ b/include/oneapi/ccl/stream.hpp
@@ -21,10 +21,18 @@
 
 class ccl_stream;
 namespace ccl {
+namespace detail {
+class environment;
+}
+
+namespace v1 {
+struct ccl_empty_attr;
+class communicator;
+struct impl_dispatch;
 
 /**
  * A stream object is an abstraction over CPU/GPU streams
- * Has no defined public constructor. Use ccl::environment::create_stream
+ * Has no defined public constructor. Use ccl::create_stream
  * for stream objects creation
  */
 /**
@@ -47,8 +55,9 @@ class stream : public ccl_api_base_copyable<stream, direct_access_policy, ccl_st
     /**
      * Declare native stream type
      */
-    using native_t = typename details::ccl_api_type_attr_traits<ccl::stream_attr_id,
-                                                                ccl::stream_attr_id::native_handle>::return_type;
+    using native_t =
+        typename detail::ccl_api_type_attr_traits<stream_attr_id,
+                                                  stream_attr_id::native_handle>::return_type;
 
     ~stream();
 
@@ -60,25 +69,25 @@ class stream : public ccl_api_base_copyable<stream, direct_access_policy, ccl_st
      * Get specific attribute value by @attrId
      */
     template <stream_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<stream_attr_id, attrId>::return_type& get()
+    const typename detail::ccl_api_type_attr_traits<stream_attr_id, attrId>::return_type& get()
         const;
 
     /**
      * Get native stream object
      */
-     native_t& get_native();
-     const native_t& get_native() const;
+    native_t& get_native();
+    const native_t& get_native() const;
+
 private:
-    friend class environment;
-    friend class communicator;
-    friend struct ccl_empty_attr;
-    friend struct impl_dispatch;
+    friend class ccl::detail::environment;
+    friend class ccl::v1::communicator;
+    friend struct ccl::ccl_empty_attr;
+    friend struct ccl::v1::impl_dispatch;
 
     template <class... attr_value_pair_t>
-    friend stream create_stream_from_attr(
-        typename unified_device_type::ccl_native_t device,
-        typename unified_device_context_type::ccl_native_t context,
-        attr_value_pair_t&&... avps);
+    friend stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
+                                          typename unified_context_type::ccl_native_t context,
+                                          attr_value_pair_t&&... avps);
     template <class... attr_value_pair_t>
     friend stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
                                           attr_value_pair_t&&... avps);
@@ -91,12 +100,11 @@ class stream : public ccl_api_base_copyable<stream, direct_access_policy, ccl_st
     template <stream_attr_id attrId,
               class Value/*,
               class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename details::ccl_api_type_attr_traits<stream_attr_id, attrId>::return_type set(const Value& v);
+    typename detail::ccl_api_type_attr_traits<stream_attr_id, attrId>::return_type set(const Value& v);
 
     void build_from_params();
-    stream(
-        const typename details::ccl_api_type_attr_traits<stream_attr_id,
-                                                         stream_attr_id::version>::type& version);
+    stream(const typename detail::ccl_api_type_attr_traits<stream_attr_id,
+                                                           stream_attr_id::version>::type& version);
 
     /**
      *  Factory methods
@@ -115,20 +123,25 @@ class stream : public ccl_api_base_copyable<stream, direct_access_policy, ccl_st
                                           attr_value_pair_t&&... avps);
 
     template <class... attr_value_pair_t>
-    static stream create_stream_from_attr(
-        typename unified_device_type::ccl_native_t device,
-        typename unified_device_context_type::ccl_native_t context,
-        attr_value_pair_t&&... avps);
+    static stream create_stream_from_attr(typename unified_device_type::ccl_native_t device,
+                                          typename unified_context_type::ccl_native_t context,
+                                          attr_value_pair_t&&... avps);
 };
 
-template <stream_attr_id t, class value_type>
-constexpr auto attr_val(value_type v)
-    -> details::attr_value_tripple<stream_attr_id, t, value_type> {
-    return details::attr_value_tripple<stream_attr_id, t, value_type>(v);
-}
-
 /**
  * Declare extern empty attributes
  */
 extern stream default_stream;
+
+template <stream_attr_id t, class value_type> // value_type can be deduced from the argument `v`
+constexpr auto attr_val(value_type v) -> detail::attr_value_triple<stream_attr_id, t, value_type> {
+    return detail::attr_value_triple<stream_attr_id, t, value_type>(v); // built from `v` alone
+}
+
+} // namespace v1
+
+using v1::stream;
+using v1::default_stream;
+using v1::attr_val;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_stream_attr_ids.hpp b/include/oneapi/ccl/stream_attr_ids.hpp
similarity index 93%
rename from include/oneapi/ccl/ccl_stream_attr_ids.hpp
rename to include/oneapi/ccl/stream_attr_ids.hpp
index d86dfedab..17080c98b 100644
--- a/include/oneapi/ccl/ccl_stream_attr_ids.hpp
+++ b/include/oneapi/ccl/stream_attr_ids.hpp
@@ -21,6 +21,9 @@
 
 class ccl_stream;
 namespace ccl {
+
+namespace v1 {
+
 /**
  * Stream attribute ids
  */
@@ -39,4 +42,8 @@ enum class stream_attr_id : int {
     last_value
 };
 
+} // namespace v1
+
+using v1::stream_attr_id;
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/ccl_stream_attr_ids_traits.hpp b/include/oneapi/ccl/stream_attr_ids_traits.hpp
similarity index 92%
rename from include/oneapi/ccl/ccl_stream_attr_ids_traits.hpp
rename to include/oneapi/ccl/stream_attr_ids_traits.hpp
index c50f05075..9d44e8f01 100644
--- a/include/oneapi/ccl/ccl_stream_attr_ids_traits.hpp
+++ b/include/oneapi/ccl/stream_attr_ids_traits.hpp
@@ -20,7 +20,8 @@
 #endif
 
 namespace ccl {
-namespace details {
+
+namespace detail {
 
 /**
  * Traits for stream attributes specializations
@@ -47,8 +48,8 @@ struct ccl_api_type_attr_traits<stream_attr_id, stream_attr_id::device> {
 
 template <>
 struct ccl_api_type_attr_traits<stream_attr_id, stream_attr_id::context> {
-    using type = typename unified_device_context_type::ccl_native_t;
-    using handle_t = typename unified_device_context_type::handle_t;
+    using type = typename unified_context_type::ccl_native_t;
+    using handle_t = typename unified_context_type::handle_t;
     using return_type = type;
 };
 
@@ -82,5 +83,6 @@ struct ccl_api_type_attr_traits<stream_attr_id, stream_attr_id::priority> {
     using return_type = type;
 };
 
-} // namespace details
+} // namespace detail
+
 } // namespace ccl
diff --git a/include/oneapi/ccl/string.hpp b/include/oneapi/ccl/string.hpp
new file mode 100644
index 000000000..072642012
--- /dev/null
+++ b/include/oneapi/ccl/string.hpp
@@ -0,0 +1,161 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <cstring>
+#include <iostream>
+#include <string>
+
+namespace ccl {
+
+namespace v1 {
+
+/**
+ * Owning, null-terminated character string that converts to and from std::string.
+ *
+ * A moved-from object owns no buffer; c_str() then returns "" and length() returns 0,
+ * and the object can still be assigned to, copied, compared, printed and destroyed.
+ * The comparison operators order strings with strcmp().
+ */
+class string {
+public:
+    ~string() {
+        delete[] storage;
+    }
+
+    string() : string("", 0) {}
+
+    /** Copies the null-terminated @str; a null pointer is treated as an empty string. */
+    string(const char* str) : string(str, (str != nullptr) ? strlen(str) : 0) {}
+
+    string(const string& str) : string(str.c_str(), str.len) {}
+
+    /** Takes over the buffer of @str and leaves @str in the moved-from state. */
+    string(string&& str) noexcept : len(str.len), storage(str.storage) {
+        str.len = 0;
+        str.storage = nullptr;
+    }
+
+    string(const std::string& str) : string(str.c_str(), str.length()) {}
+
+    /** Replaces the contents with a copy of @str; self-assignment is a no-op. */
+    string& operator=(const string& str) {
+        if (this != &str) {
+            if (storage == nullptr || len != str.len) {
+                // Allocate before releasing so a failed allocation leaves *this unchanged.
+                char* fresh = new char[str.len + 1];
+                delete[] storage;
+                storage = fresh;
+                len = str.len;
+            }
+            memcpy(storage, str.c_str(), len * sizeof(char));
+            storage[len] = '\0';
+        }
+        return *this;
+    }
+
+    /** Takes over the buffer of @str; self-assignment is a no-op. */
+    string& operator=(string&& str) noexcept {
+        if (this != &str) {
+            // Free the current buffer first; overwriting the pointer would leak it.
+            delete[] storage;
+            len = str.len;
+            storage = str.storage;
+            str.len = 0;
+            str.storage = nullptr;
+        }
+        return *this;
+    }
+
+    /** Returns the number of characters, not counting the terminating null. */
+    size_t length() const {
+        return len;
+    }
+
+    /** Returns the null-terminated contents; never null, even for a moved-from object. */
+    const char* c_str() const {
+        return (storage != nullptr) ? storage : "";
+    }
+
+    operator std::string() const {
+        return std::string(c_str());
+    }
+
+    friend std::ostream& operator<<(std::ostream& out, const string& str) {
+        out << str.c_str();
+        return out;
+    }
+
+    /** Returns a new string holding this string followed by @str (null appends nothing). */
+    string operator+(const char* str) const {
+        return string(c_str(), len, str, (str != nullptr) ? strlen(str) : 0);
+    }
+
+    string operator+(const string& str) const {
+        return (*this + str.c_str());
+    }
+
+    string operator+(const std::string& str) const {
+        return (*this + str.c_str());
+    }
+
+    friend std::string operator+(const std::string& str1, const string& str2) {
+        return (str1 + str2.c_str());
+    }
+
+    friend bool operator>(const string& str1, const string& str2) {
+        return strcmp(str1.c_str(), str2.c_str()) > 0;
+    }
+
+    friend bool operator<=(const string& str1, const string& str2) {
+        return strcmp(str1.c_str(), str2.c_str()) <= 0;
+    }
+
+    friend bool operator<(const string& str1, const string& str2) {
+        return strcmp(str1.c_str(), str2.c_str()) < 0;
+    }
+
+    friend bool operator>=(const string& str1, const string& str2) {
+        return strcmp(str1.c_str(), str2.c_str()) >= 0;
+    }
+
+    friend bool operator==(const string& str1, const string& str2) {
+        return strcmp(str1.c_str(), str2.c_str()) == 0;
+    }
+
+private:
+    /** Builds the string from @head_len bytes of @head followed by @tail_len bytes of @tail. */
+    string(const char* head, size_t head_len, const char* tail = nullptr, size_t tail_len = 0)
+            : len(head_len + tail_len),
+              storage(new char[head_len + tail_len + 1]) {
+        if (head_len > 0) {
+            memcpy(storage, head, head_len * sizeof(char));
+        }
+        if (tail_len > 0) {
+            memcpy(storage + head_len, tail, tail_len * sizeof(char));
+        }
+        storage[len] = '\0';
+    }
+
+    size_t len = 0;
+    char* storage = nullptr;
+};
+
+} // namespace v1
+
+using v1::string;
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/type_traits.hpp b/include/oneapi/ccl/type_traits.hpp
new file mode 100644
index 000000000..43db30e95
--- /dev/null
+++ b/include/oneapi/ccl/type_traits.hpp
@@ -0,0 +1,168 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <tuple>
+#include <type_traits>
+
+#ifdef CCL_ENABLE_SYCL
+#include <CL/sycl.hpp>
+#endif
+
+#include "oneapi/ccl/lp_types.hpp"
+#include "oneapi/ccl/types.hpp"
+
+namespace ccl {
+/**
+ * Base type-trait helpers for "unknown" types
+ */
+template <ccl::datatype type>
+struct type_info {
+    static constexpr bool is_supported = false;
+    static constexpr bool is_class = false;
+};
+
+template <class type>
+struct native_type_info {
+    static constexpr bool is_supported = false;
+    static constexpr bool is_class = false;
+};
+
+#define CCL_TYPE_TRAITS(ccl_type, cpp_type, bytes, str) \
+    template <> \
+    struct type_info<ccl_type> \
+            : public ccl_type_info_export<cpp_type, bytes, ccl_type, false, true> { \
+        static constexpr const char* name() { \
+            return #str; \
+        } \
+    }; \
+    template <> \
+    struct native_type_info<cpp_type> : public type_info<ccl_type> {};
+
+#define CCL_CLASS_TYPE_TRAITS(ccl_type, cpp_type, bytes, str) \
+    template <> \
+    struct native_type_info<cpp_type> \
+            : public ccl_type_info_export<cpp_type, bytes, ccl_type, true, true> { \
+        static constexpr const char* name() { \
+            return #str; \
+        } \
+    };
+
+#define COMMA ,
+
+/**
+ * Enumeration of supported CCL API data types
+ */
+
+CCL_TYPE_TRAITS(ccl::datatype::int8, int8_t, sizeof(int8_t), int8)
+CCL_TYPE_TRAITS(ccl::datatype::uint8, uint8_t, sizeof(uint8_t), uint8)
+CCL_TYPE_TRAITS(ccl::datatype::int16, int16_t, sizeof(int16_t), int16)
+CCL_TYPE_TRAITS(ccl::datatype::uint16, uint16_t, sizeof(uint16_t), uint16)
+CCL_TYPE_TRAITS(ccl::datatype::int32, int32_t, sizeof(int32_t), int32)
+CCL_TYPE_TRAITS(ccl::datatype::uint32, uint32_t, sizeof(uint32_t), uint32)
+CCL_TYPE_TRAITS(ccl::datatype::int64, int64_t, sizeof(int64_t), int64)
+CCL_TYPE_TRAITS(ccl::datatype::uint64, uint64_t, sizeof(uint64_t), uint64)
+//CCL_TYPE_TRAITS(ccl::datatype::float16, float16, sizeof(float16), float16)
+CCL_TYPE_TRAITS(ccl::datatype::float32, float, sizeof(float), float32)
+CCL_TYPE_TRAITS(ccl::datatype::float64, double, sizeof(double), float64)
+CCL_TYPE_TRAITS(ccl::datatype::bfloat16, bfloat16, sizeof(bfloat16), bfloat16)
+
+#ifdef CCL_ENABLE_SYCL
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::int8, cl::sycl::buffer<int8_t COMMA 1>, sizeof(int8_t), int8)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::uint8,
+                      cl::sycl::buffer<uint8_t COMMA 1>,
+                      sizeof(uint8_t),
+                      uint8)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::int16,
+                      cl::sycl::buffer<int16_t COMMA 1>,
+                      sizeof(int16_t),
+                      int16)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::uint16,
+                      cl::sycl::buffer<uint16_t COMMA 1>,
+                      sizeof(uint16_t),
+                      uint16)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::int32,
+                      cl::sycl::buffer<int32_t COMMA 1>,
+                      sizeof(int32_t),
+                      int32)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::uint32,
+                      cl::sycl::buffer<uint32_t COMMA 1>,
+                      sizeof(uint32_t),
+                      uint32)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::int64,
+                      cl::sycl::buffer<int64_t COMMA 1>,
+                      sizeof(int64_t),
+                      int64)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::uint64,
+                      cl::sycl::buffer<uint64_t COMMA 1>,
+                      sizeof(uint64_t),
+                      uint64)
+// CCL_CLASS_TYPE_TRAITS(ccl::datatype::float16,
+//                       cl::sycl::buffer<float16 COMMA 1>,
+//                       sizeof(float16),
+//                       float16)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::float32,
+                      cl::sycl::buffer<float COMMA 1>,
+                      sizeof(float),
+                      float32)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::float64,
+                      cl::sycl::buffer<double COMMA 1>,
+                      sizeof(double),
+                      float64)
+CCL_CLASS_TYPE_TRAITS(ccl::datatype::bfloat16,
+                      cl::sycl::buffer<bfloat16 COMMA 1>,
+                      sizeof(bfloat16),
+                      bfloat16)
+#endif /* CCL_ENABLE_SYCL */
+
+/**
+ * Checks whether @c type (one pointer level stripped, if any) is supported by the ccl API
+ */
+template <class type>
+constexpr bool is_supported() {
+    using bare_type = typename std::remove_pointer<type>::type;
+    //    static_assert(native_type_info<bare_type>::is_supported, "type is not supported by ccl API");
+    return native_type_info<bare_type>::is_supported;
+}
+
+/**
+ * Checks whether @c type is a class.
+ * One pointer level (if any) is stripped before consulting @c native_type_info.
+ */
+template <class type>
+constexpr bool is_class() {
+    return native_type_info<typename std::remove_pointer<type>::type>::is_class;
+}
+
+/**
+ * Predicate for SFINAE-style checks: true when @c type is supported and is not a class
+ */
+template <class type>
+constexpr bool is_native_type_supported() {
+    return is_supported<type>() && !is_class<type>();
+}
+
+/**
+ * Predicate for SFINAE-style checks: true when @c type is supported and is a class
+ */
+template <class type>
+constexpr bool is_class_supported() {
+    return is_supported<type>() && is_class<type>();
+}
+
+} // namespace ccl
+
+#include "oneapi/ccl/device_type_traits.hpp"
diff --git a/include/oneapi/ccl/types.hpp b/include/oneapi/ccl/types.hpp
new file mode 100644
index 000000000..ce7b2bb14
--- /dev/null
+++ b/include/oneapi/ccl/types.hpp
@@ -0,0 +1,195 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "oneapi/ccl/config.h"
+
+#include <bitset>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <set>
+#include <stdexcept>
+#include <vector>
+
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/exception.hpp"
+
+namespace ccl {
+
+namespace v1 {
+
+/**
+ * Supported reduction operations
+ */
+enum class reduction : int {
+    sum = 0,
+    prod,
+    min,
+    max,
+    custom,
+
+    last_value
+};
+
+/**
+ * Supported datatypes
+ */
+enum class datatype : int {
+    int8 = 0,
+    uint8,
+    int16,
+    uint16,
+    int32,
+    uint32,
+    int64,
+    uint64,
+
+    float16,
+    float32,
+    float64,
+
+    bfloat16,
+
+    last_predefined = bfloat16
+};
+
+/**
+ * Supported CL backend types
+ */
+enum class cl_backend_type : int {
+    empty_backend = 0x0,
+    dpcpp_sycl = 0x1,
+    l0 = 0x2,
+    dpcpp_sycl_l0 = 0x3,
+
+    last_value
+};
+
+} // namespace v1
+
+using v1::reduction;
+using v1::datatype;
+using v1::cl_backend_type;
+
+/**
+ * Type traits, which describe how types are interpreted by the ccl API
+ */
+template <class ntype_t,
+          size_t size_of_type,
+          ccl::datatype ccl_type_v,
+          bool iclass = false,
+          bool supported = false>
+struct ccl_type_info_export {
+    using native_type = ntype_t;
+    using ccl_type = std::integral_constant<ccl::datatype, ccl_type_v>;
+    static constexpr size_t size = size_of_type;
+    static constexpr datatype dtype = static_cast<enum datatype>(ccl_type::value);
+    static constexpr bool is_class = iclass;
+    static constexpr bool is_supported = supported;
+};
+
+namespace v1 {
+
+/**
+ * Library version description
+ */
+typedef struct {
+    unsigned int major;
+    unsigned int minor;
+    unsigned int update;
+    const char* product_status;
+    const char* build_date;
+    const char* full;
+    string_class cl_backend_name;
+} library_version;
+
+typedef struct {
+    const char* match_id;
+    const size_t offset;
+} fn_context;
+
+/* in_buf, in_count, inout_buf, out_count, dtype, context */
+typedef void (
+    *reduction_fn)(const void*, size_t, void*, size_t*, ccl::datatype, const ccl::v1::fn_context*);
+
+struct ccl_empty_attr {
+    static ccl::v1::library_version version;
+
+    template <class attr>
+    static attr create_empty();
+};
+
+/**
+ * Sparse coalesce modes
+ * 
+ * Use this variable to set sparse_allreduce coalescing mode:
+ * regular        - run regular coalesce function;
+ * disable        - disables coalesce function in sparse_allreduce,
+ *                  allgathered data is returned;
+ * keep_precision - on every local reduce bf16 data is converted to fp32,
+ *                  reduced and then converted back to bf16.
+ */
+enum class sparse_coalesce_mode : int {
+    regular = 0,
+    disable,
+    keep_precision,
+
+    last_value
+};
+
+/* idx_buf, idx_count, idx_dtype, val_buf, val_count, val_dtype, user_context */
+typedef void (*sparse_allreduce_completion_fn)(const void*,
+                                               size_t,
+                                               ccl::datatype,
+                                               const void*,
+                                               size_t,
+                                               ccl::datatype,
+                                               const void*);
+
+/* idx_count, idx_dtype, val_count, val_dtype, user_context, out_idx_buf, out_val_buf */
+typedef void (*sparse_allreduce_alloc_fn)(size_t,
+                                          ccl::datatype,
+                                          size_t,
+                                          ccl::datatype,
+                                          const void*,
+                                          void**,
+                                          void**);
+} // namespace v1
+
+using v1::library_version;
+using v1::fn_context;
+using v1::reduction_fn;
+using v1::ccl_empty_attr;
+
+using v1::sparse_coalesce_mode;
+using v1::sparse_allreduce_completion_fn;
+using v1::sparse_allreduce_alloc_fn;
+
+/**
+ * API object attributes traits
+ */
+namespace info {
+template <class param_type, param_type value>
+struct param_traits {};
+
+} //namespace info
+} // namespace ccl
+
+#include "oneapi/ccl/device_types.hpp"
diff --git a/include/oneapi/ccl/ccl_types_policy.hpp b/include/oneapi/ccl/types_policy.hpp
similarity index 97%
rename from include/oneapi/ccl/ccl_types_policy.hpp
rename to include/oneapi/ccl/types_policy.hpp
index aebedd25a..4ebd0bffc 100644
--- a/include/oneapi/ccl/ccl_types_policy.hpp
+++ b/include/oneapi/ccl/types_policy.hpp
@@ -188,19 +188,20 @@ class ccl_api_base_movable : protected access_policy_t<impl_t> {
 private:
     impl_value_t pimpl;
 };
-namespace details {
+
+namespace detail {
 template <class attr, attr id>
 struct ccl_api_type_attr_traits {};
 
 template <class attrib_id, attrib_id attrId, class value_type>
-struct attr_value_tripple {
+struct attr_value_triple {
     using type_t = attrib_id;
     using value_t = value_type;
     static constexpr attrib_id idx() {
         return attrId;
     }
 
-    explicit attr_value_tripple(value_t val) : m_val(val) {}
+    explicit attr_value_triple(value_t val) : m_val(val) {}
     const value_type& val() {
         return m_val;
     }
@@ -208,6 +209,6 @@ struct attr_value_tripple {
 private:
     value_t m_val;
 };
+} // namespace detail
 
-} // namespace details
 } // namespace ccl
diff --git a/mpi/bin/hydra_bstrap_proxy b/mpi/bin/hydra_bstrap_proxy
index 6ce0a503d..03d31da92 100755
Binary files a/mpi/bin/hydra_bstrap_proxy and b/mpi/bin/hydra_bstrap_proxy differ
diff --git a/mpi/bin/hydra_nameserver b/mpi/bin/hydra_nameserver
index b6471ef4a..dc1b3e064 100755
Binary files a/mpi/bin/hydra_nameserver and b/mpi/bin/hydra_nameserver differ
diff --git a/mpi/bin/hydra_pmi_proxy b/mpi/bin/hydra_pmi_proxy
index 67b22b917..73978d8d1 100755
Binary files a/mpi/bin/hydra_pmi_proxy and b/mpi/bin/hydra_pmi_proxy differ
diff --git a/mpi/bin/mpiexec b/mpi/bin/mpiexec
deleted file mode 120000
index 482a69296..000000000
--- a/mpi/bin/mpiexec
+++ /dev/null
@@ -1 +0,0 @@
-mpiexec.hydra
\ No newline at end of file
diff --git a/mpi/bin/mpiexec b/mpi/bin/mpiexec
new file mode 100755
index 000000000..423fc1a36
Binary files /dev/null and b/mpi/bin/mpiexec differ
diff --git a/mpi/bin/mpiexec.hydra b/mpi/bin/mpiexec.hydra
index 3e67f050e..423fc1a36 100755
Binary files a/mpi/bin/mpiexec.hydra and b/mpi/bin/mpiexec.hydra differ
diff --git a/mpi/bin/mpigcc b/mpi/bin/mpigcc
index 1be1d3ef5..ad2cc08a6 100755
--- a/mpi/bin/mpigcc
+++ b/mpi/bin/mpigcc
@@ -104,7 +104,7 @@ CFLAGS=""
 CPPFLAGS=""
 LDFLAGS=" -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl "
 LIBS="-lm   -lpthread  -lfabric -lrt "
-MPIVERSION="2021.1-beta10"
+MPIVERSION="2021.1"
 MPILIBNAME="mpi"                           
 
 
diff --git a/mpi/bin/mpigxx b/mpi/bin/mpigxx
index 2d3a8ad14..5524e9a49 100755
--- a/mpi/bin/mpigxx
+++ b/mpi/bin/mpigxx
@@ -101,7 +101,7 @@ MPICH_VERSION="3.3"
 CXXFLAGS=""
 LDFLAGS=" -Wl,-z,now -Wl,-z,relro -Wl,-z,noexecstack -Xlinker --enable-new-dtags -ldl "
 LIBS="-lm   -lpthread  -lfabric -lrt "
-MPIVERSION="2021.1-beta10"
+MPIVERSION="2021.1"
 MPILIBNAME="mpi"
 MPICXXLIBNAME="mpicxx"
 
diff --git a/mpi/bin/mpiicc b/mpi/bin/mpiicc
index b938fe315..aff4c8024 100755
--- a/mpi/bin/mpiicc
+++ b/mpi/bin/mpiicc
@@ -122,7 +122,7 @@ MPILIBNAME="mpi"
 PMPILIBNAME="pmpi"
 
 # MPIVERSION is the version of the MPICH2 library that mpicc is intended for
-MPIVERSION="2021.1-beta10"
+MPIVERSION="2021.1"
 #
 # Internal variables
 # Show is set to echo to cause the compilation command to be echoed instead
diff --git a/mpi/bin/mpiicpc b/mpi/bin/mpiicpc
index 62ce4df55..1e172575b 100755
--- a/mpi/bin/mpiicpc
+++ b/mpi/bin/mpiicpc
@@ -121,7 +121,7 @@ PMPILIBNAME="pmpi"
 MPICXXLIBNAME="mpicxx"
 
 # MPIVERSION is the version of the Intel(R) MPI Library that mpiicpc is intended for
-MPIVERSION="2021.1-beta10"
+MPIVERSION="2021.1"
 #
 # Internal variables
 # Show is set to echo to cause the compilation command to be echoed instead
diff --git a/mpi/etc/tuning_clx-ap_shm-ofi.dat b/mpi/etc/tuning_clx-ap_shm-ofi.dat
index 841f2d778..ce04f0d57 100755
Binary files a/mpi/etc/tuning_clx-ap_shm-ofi.dat and b/mpi/etc/tuning_clx-ap_shm-ofi.dat differ
diff --git a/mpi/include/mpi.h b/mpi/include/mpi.h
index 301e0e9eb..de4cba113 100755
--- a/mpi/include/mpi.h
+++ b/mpi/include/mpi.h
@@ -595,8 +595,8 @@ typedef int (MPI_Delete_function) ( MPI_Comm, int, void *, void * );
  * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So,
  * 2019.0.0b0 will have the numeric version 20190000100.
  */
-#define I_MPI_VERSION "2021.1.0b10"
-#define I_MPI_NUMVERSION 20210100110
+#define I_MPI_VERSION "2021.1.0"
+#define I_MPI_NUMVERSION 20210100300
 
 /* for the datatype decoders */
 enum MPIR_Combiner_enum {
diff --git a/mpi/lib/libmpi.so b/mpi/lib/libmpi.so
deleted file mode 120000
index 9e4b9f431..000000000
--- a/mpi/lib/libmpi.so
+++ /dev/null
@@ -1 +0,0 @@
-libmpi.so.12.0
\ No newline at end of file
diff --git a/mpi/lib/libmpi.so b/mpi/lib/libmpi.so
new file mode 100755
index 000000000..caeb9d1ac
Binary files /dev/null and b/mpi/lib/libmpi.so differ
diff --git a/mpi/lib/libmpi.so.12 b/mpi/lib/libmpi.so.12
deleted file mode 120000
index 5a0e391d4..000000000
--- a/mpi/lib/libmpi.so.12
+++ /dev/null
@@ -1 +0,0 @@
-libmpi.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpi.so.12 b/mpi/lib/libmpi.so.12
new file mode 100755
index 000000000..caeb9d1ac
Binary files /dev/null and b/mpi/lib/libmpi.so.12 differ
diff --git a/mpi/lib/libmpi.so.12.0 b/mpi/lib/libmpi.so.12.0
deleted file mode 120000
index 5a0e391d4..000000000
--- a/mpi/lib/libmpi.so.12.0
+++ /dev/null
@@ -1 +0,0 @@
-libmpi.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpi.so.12.0 b/mpi/lib/libmpi.so.12.0
new file mode 100755
index 000000000..caeb9d1ac
Binary files /dev/null and b/mpi/lib/libmpi.so.12.0 differ
diff --git a/mpi/lib/libmpi.so.12.0.0 b/mpi/lib/libmpi.so.12.0.0
index 54cbc716e..caeb9d1ac 100755
Binary files a/mpi/lib/libmpi.so.12.0.0 and b/mpi/lib/libmpi.so.12.0.0 differ
diff --git a/mpi/lib/libmpicxx.so b/mpi/lib/libmpicxx.so
deleted file mode 120000
index 9e27e2a69..000000000
--- a/mpi/lib/libmpicxx.so
+++ /dev/null
@@ -1 +0,0 @@
-libmpicxx.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpicxx.so b/mpi/lib/libmpicxx.so
new file mode 100755
index 000000000..ee69659ef
Binary files /dev/null and b/mpi/lib/libmpicxx.so differ
diff --git a/mpi/lib/libmpicxx.so.12 b/mpi/lib/libmpicxx.so.12
deleted file mode 120000
index 9e27e2a69..000000000
--- a/mpi/lib/libmpicxx.so.12
+++ /dev/null
@@ -1 +0,0 @@
-libmpicxx.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpicxx.so.12 b/mpi/lib/libmpicxx.so.12
new file mode 100755
index 000000000..ee69659ef
Binary files /dev/null and b/mpi/lib/libmpicxx.so.12 differ
diff --git a/mpi/lib/libmpicxx.so.12.0 b/mpi/lib/libmpicxx.so.12.0
deleted file mode 120000
index 9e27e2a69..000000000
--- a/mpi/lib/libmpicxx.so.12.0
+++ /dev/null
@@ -1 +0,0 @@
-libmpicxx.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpicxx.so.12.0 b/mpi/lib/libmpicxx.so.12.0
new file mode 100755
index 000000000..ee69659ef
Binary files /dev/null and b/mpi/lib/libmpicxx.so.12.0 differ
diff --git a/mpi/lib/libmpifort.so b/mpi/lib/libmpifort.so
deleted file mode 120000
index 3dc64470d..000000000
--- a/mpi/lib/libmpifort.so
+++ /dev/null
@@ -1 +0,0 @@
-libmpifort.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpifort.so b/mpi/lib/libmpifort.so
new file mode 100755
index 000000000..7e12bff9b
Binary files /dev/null and b/mpi/lib/libmpifort.so differ
diff --git a/mpi/lib/libmpifort.so.12 b/mpi/lib/libmpifort.so.12
deleted file mode 120000
index 3dc64470d..000000000
--- a/mpi/lib/libmpifort.so.12
+++ /dev/null
@@ -1 +0,0 @@
-libmpifort.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpifort.so.12 b/mpi/lib/libmpifort.so.12
new file mode 100755
index 000000000..7e12bff9b
Binary files /dev/null and b/mpi/lib/libmpifort.so.12 differ
diff --git a/mpi/lib/libmpifort.so.12.0 b/mpi/lib/libmpifort.so.12.0
deleted file mode 120000
index 3dc64470d..000000000
--- a/mpi/lib/libmpifort.so.12.0
+++ /dev/null
@@ -1 +0,0 @@
-libmpifort.so.12.0.0
\ No newline at end of file
diff --git a/mpi/lib/libmpifort.so.12.0 b/mpi/lib/libmpifort.so.12.0
new file mode 100755
index 000000000..7e12bff9b
Binary files /dev/null and b/mpi/lib/libmpifort.so.12.0 differ
diff --git a/mpi/lib/libmpifort.so.12.0.0 b/mpi/lib/libmpifort.so.12.0.0
index e8bf6822d..7e12bff9b 100755
Binary files a/mpi/lib/libmpifort.so.12.0.0 and b/mpi/lib/libmpifort.so.12.0.0 differ
diff --git a/mpi/licensing/license.txt b/mpi/licensing/license.txt
old mode 100644
new mode 100755
index 6ca7736f1..ffffdc860
--- a/mpi/licensing/license.txt
+++ b/mpi/licensing/license.txt
@@ -1,77 +1,77 @@
-Intel Simplified Software License (Version April 2018)
+Intel Simplified Software License (Version February 2020)
 
-Copyright (c) 2018 Intel Corporation.
-
-Use and Redistribution.  You may use and redistribute the software (the
+Use and Redistribution. You may use and redistribute the software (the 
 "Software"), without modification, provided the following conditions are met:
 
-* Redistributions must reproduce the above copyright notice and the following
-  terms of use in the Software and in the documentation and/or other materials
+* Redistributions must reproduce the above copyright notice and the following 
+  terms of use in the Software and in the documentation and/or other materials 
   provided with the distribution.
-* Neither the name of Intel nor the names of its suppliers may be used to
-  endorse or promote products derived from this Software without specific prior
+* Neither the name of Intel nor the names of its suppliers may be used to 
+  endorse or promote products derived from this Software without specific prior 
   written permission.
-* No reverse engineering, decompilation, or disassembly of this Software is
+* No reverse engineering, decompilation, or disassembly of this Software is 
   permitted.
 
-Limited patent license.  Intel grants you a world-wide, royalty-free,
-non-exclusive license under patents it now or hereafter owns or controls to
-make, have made, use, import, offer to sell and sell ("Utilize") this Software,
-but solely to the extent that any such patent is necessary to Utilize the
-Software alone. The patent license shall not apply to any combinations which
+Limited patent license. Intel grants you a world-wide, royalty-free, 
+non-exclusive license under patents it now or hereafter owns or controls to 
+make, have made, use, import, offer to sell and sell ("Utilize") this Software, 
+but solely to the extent that any such patent is necessary to Utilize the 
+Software alone. The patent license shall not apply to any combinations which 
 include this software. No hardware per se is licensed hereunder.
 
-Third party and other Intel programs.  "Third Party Programs" are the files
-listed in the "third-party-programs.txt" text file that is included with the
-Software and may include Intel programs under separate license terms. Third
-Party Programs, even if included with the distribution of the Materials, are
-governed by separate license terms and those license terms solely govern your
-use of those programs.
+Third party programs. The Software may contain Third Party Programs. "Third 
+Party Programs" are third party software, open source software or other Intel 
+software listed in the "third-party-programs.txt"  or other similarly named text 
+file that is included with the Software. Third Party Programs, even if included 
+with the distribution of the Software, may be governed by separate license 
+terms, including without limitation, third party license terms, open source 
+software notices and terms, and/or other Intel software license terms. These 
+separate license terms may govern your use of the Third Party Programs.  
 
-DISCLAIMER.  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE
-DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS
-WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE
-THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND
-ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT
+DISCLAIMER. THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED 
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE 
+DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS 
+WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE 
+THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND 
+ATTORNEYS' FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT 
 INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE MATERIALS.
 
-LIMITATION OF LIABILITY.  IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
-ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD INTEL
-HARMLESS AGAINST ANY CLAIMS AND EXPENSES RESULTING FROM YOUR USE OR UNAUTHORIZED
-USE OF THE SOFTWARE.
+LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT, 
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD 
+INTEL HARMLESS AGAINST ANY CLAIMS AND EXPENSES RESULTING FROM YOUR USE OR 
+UNAUTHORIZED USE OF THE SOFTWARE.
 
-No support.  Intel may make changes to the Software, at any time without notice,
-and is not obligated to support, update or provide training for the Software.
+No support. Intel may make changes to the Software, at any time without notice, 
+and is not obligated to support, update or provide training for the Software. 
 
-Termination.  Intel may terminate your right to use the Software in the event of
-your breach of this Agreement and you fail to cure the breach within a
+Termination. Intel may terminate your right to use the Software in the event of 
+your breach of this Agreement and you fail to cure the breach within a 
 reasonable period of time.
 
-Feedback.  Should you provide Intel with comments, modifications, corrections,
-enhancements or other input ("Feedback") related to the Software Intel will be
-free to use, disclose, reproduce, license or otherwise distribute or exploit the
-Feedback in its sole discretion without any obligations or restrictions of any
-kind, including without limitation, intellectual property rights or licensing
+Feedback. Should you provide Intel with comments, modifications, corrections, 
+enhancements or other input ("Feedback") related to the Software Intel will be 
+free to use, disclose, reproduce, license or otherwise distribute or exploit the 
+Feedback in its sole discretion without any obligations or restrictions of any 
+kind, including without limitation, intellectual property rights or licensing 
 obligations.
 
-Compliance with laws.  You agree to comply with all relevant laws and
-regulations governing your use, transfer, import or export (or prohibition
-thereof) of the Software.
+Compliance with laws. You agree to comply with all relevant laws and regulations 
+governing your use, transfer, import or export (or prohibition thereof) of the 
+Software.
 
-Governing law.  All disputes will be governed by the laws of the United States
-of America and the State of Delaware without reference to conflict of law
-principles and subject to the exclusive jurisdiction of the state or federal
-courts sitting in the State of Delaware, and each party agrees that it submits
-to the personal jurisdiction and venue of those courts and waives any
-objections. The United Nations Convention on Contracts for the International
-Sale of Goods (1980) is specifically excluded and will not apply to the
+Governing law. All disputes will be governed by the laws of the United States of 
+America and the State of Delaware without reference to conflict of law 
+principles and subject to the exclusive jurisdiction of the state or federal 
+courts sitting in the State of Delaware, and each party agrees that it submits 
+to the personal jurisdiction and venue of those courts and waives any 
+objections. The United Nations Convention on Contracts for the International 
+Sale of Goods (1980) is specifically excluded and will not apply to the 
 Software.
 
 *Other names and brands may be claimed as the property of others.
diff --git a/mpi/licensing/third-party-programs.txt b/mpi/licensing/third-party-programs.txt
index 34278eb40..0dbbc92ff 100755
--- a/mpi/licensing/third-party-programs.txt
+++ b/mpi/licensing/third-party-programs.txt
@@ -1,4 +1,4 @@
-Intel(R) MPI Library 2021.1-beta10 Third Party Programs File
+Intel(R) MPI Library 2021.1 Third Party Programs File
 
 This file is the "third-party-programs.txt" file specified in the associated 
 Intel end user license agreement for the Intel software you are licensing.
diff --git a/ofi/bin/fi_info b/ofi/bin/fi_info
index d50d3dfba..463945e25 100755
Binary files a/ofi/bin/fi_info and b/ofi/bin/fi_info differ
diff --git a/ofi/include/rdma/fabric.h b/ofi/include/rdma/fabric.h
index 9e53861e2..ce0b91805 100644
--- a/ofi/include/rdma/fabric.h
+++ b/ofi/include/rdma/fabric.h
@@ -16,6 +16,7 @@
 /*
  * Copyright (c) 2013-2017 Intel Corporation. All rights reserved.
  * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -53,6 +54,7 @@
 #include <stddef.h>
 #include <sys/types.h>
 #include <sys/uio.h>
+#include <rdma/fi_errno.h>
 
 #ifdef __GNUC__
 #define FI_DEPRECATED_FUNC __attribute__((deprecated))
@@ -92,8 +94,8 @@ extern "C" {
 #endif
 
 #define FI_MAJOR_VERSION 1
-#define FI_MINOR_VERSION 10
-#define FI_REVISION_VERSION 1
+#define FI_MINOR_VERSION 11
+#define FI_REVISION_VERSION 0
 
 enum {
 	FI_PATH_MAX		= 256,
@@ -167,6 +169,7 @@ typedef struct fid *fid_t;
 #define FI_PEEK			(1ULL << 19)
 #define FI_TRIGGER		(1ULL << 20)
 #define FI_FENCE		(1ULL << 21)
+#define FI_PRIORITY		(1ULL << 22)
 
 #define FI_COMPLETION		(1ULL << 24)
 #define FI_EVENT		FI_COMPLETION
@@ -545,6 +548,8 @@ struct fi_ops {
 	int	(*ops_open)(struct fid *fid, const char *name,
 			    uint64_t flags, void **ops, void *context);
 	int	(*tostr)(const struct fid *fid, char *buf, size_t len);
+	int	(*ops_set)(struct fid *fid, const char *name, uint64_t flags,
+			   void *ops, void *context);
 };
 
 /* All fabric interface descriptors must start with this structure */
@@ -664,6 +669,14 @@ fi_open_ops(struct fid *fid, const char *name, uint64_t flags,
 	return fid->ops->ops_open(fid, name, flags, ops, context);
 }
 
+static inline int
+fi_set_ops(struct fid *fid, const char *name, uint64_t flags,
+	   void *ops, void *context)
+{
+	return FI_CHECK_OP(fid->ops, struct fi_ops, ops_set) ?
+		fid->ops->ops_set(fid, name, flags, ops, context) : -FI_ENOSYS;
+}
+
 enum fi_type {
 	FI_TYPE_INFO,
 	FI_TYPE_EP_TYPE,
@@ -690,6 +703,7 @@ enum fi_type {
 	FI_TYPE_OP_TYPE,
 	FI_TYPE_FID,
 	FI_TYPE_COLLECTIVE_OP,
+	FI_TYPE_HMEM_IFACE,
 };
 
 char *fi_tostr(const void *data, enum fi_type datatype);
diff --git a/ofi/include/rdma/fi_domain.h b/ofi/include/rdma/fi_domain.h
index 3682b019a..99cde56cc 100644
--- a/ofi/include/rdma/fi_domain.h
+++ b/ofi/include/rdma/fi_domain.h
@@ -15,6 +15,7 @@
 */
 /*
  * Copyright (c) 2013-2017 Intel Corporation. All rights reserved.
+ * (C) Copyright 2020 Hewlett Packard Enterprise Development LP
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -130,6 +131,8 @@ struct fid_mr {
 enum fi_hmem_iface {
 	FI_HMEM_SYSTEM	= 0,
 	FI_HMEM_CUDA,
+	FI_HMEM_ROCR,
+	FI_HMEM_ZE,
 };
 
 struct fi_mr_attr {
@@ -145,6 +148,7 @@ struct fi_mr_attr {
 	union {
 		uint64_t	reserved;
 		int		cuda;
+		int		ze;
 	} device;
 };
 
@@ -153,6 +157,23 @@ struct fi_mr_modify {
 	struct fi_mr_attr	attr;
 };
 
+#define FI_SET_OPS_HMEM_OVERRIDE "hmem_override_ops"
+
+struct fi_hmem_override_ops {
+	size_t	size;
+
+	ssize_t	(*copy_from_hmem_iov)(void *dest, size_t size,
+				      enum fi_hmem_iface iface, uint64_t device,
+				      const struct iovec *hmem_iov,
+				      size_t hmem_iov_count,
+				      uint64_t hmem_iov_offset);
+
+	ssize_t (*copy_to_hmem_iov)(enum fi_hmem_iface iface, uint64_t device,
+				    const struct iovec *hmem_iov,
+				    size_t hmem_iov_count,
+				    uint64_t hmem_iov_offset, const void *src,
+				    size_t size);
+};
 
 #ifdef FABRIC_DIRECT
 #include <rdma/fi_direct_atomic_def.h>
@@ -258,8 +279,9 @@ struct fi_ops_domain {
 	int	(*query_atomic)(struct fid_domain *domain,
 			enum fi_datatype datatype, enum fi_op op,
 			struct fi_atomic_attr *attr, uint64_t flags);
-	int (*query_collective)(struct fid_domain *domain, enum fi_collective_op coll,
-				struct fi_collective_attr *attr, uint64_t flags);
+	int	(*query_collective)(struct fid_domain *domain,
+			enum fi_collective_op coll,
+			struct fi_collective_attr *attr, uint64_t flags);
 };
 
 /* Memory registration flags */
diff --git a/ofi/lib/libfabric.so b/ofi/lib/libfabric.so
deleted file mode 120000
index 878a6164e..000000000
--- a/ofi/lib/libfabric.so
+++ /dev/null
@@ -1 +0,0 @@
-libfabric.so.1
\ No newline at end of file
diff --git a/ofi/lib/libfabric.so b/ofi/lib/libfabric.so
new file mode 100755
index 000000000..5a7dcef2a
Binary files /dev/null and b/ofi/lib/libfabric.so differ
diff --git a/ofi/lib/libfabric.so.1 b/ofi/lib/libfabric.so.1
index e4771f8ba..5a7dcef2a 100755
Binary files a/ofi/lib/libfabric.so.1 and b/ofi/lib/libfabric.so.1 differ
diff --git a/ofi/lib/prov/libpsmx2-fi.so b/ofi/lib/prov/libpsmx2-fi.so
index 33f555375..a925b003b 100755
Binary files a/ofi/lib/prov/libpsmx2-fi.so and b/ofi/lib/prov/libpsmx2-fi.so differ
diff --git a/ofi/lib/prov/librxm-fi.so b/ofi/lib/prov/librxm-fi.so
index 5f3a5eb2b..989b2e7f7 100755
Binary files a/ofi/lib/prov/librxm-fi.so and b/ofi/lib/prov/librxm-fi.so differ
diff --git a/ofi/lib/prov/libshm-fi.so b/ofi/lib/prov/libshm-fi.so
index 3cc3e8885..7128a4f80 100755
Binary files a/ofi/lib/prov/libshm-fi.so and b/ofi/lib/prov/libshm-fi.so differ
diff --git a/ofi/lib/prov/libsockets-fi.so b/ofi/lib/prov/libsockets-fi.so
index a0955552b..44ee76c31 100755
Binary files a/ofi/lib/prov/libsockets-fi.so and b/ofi/lib/prov/libsockets-fi.so differ
diff --git a/ofi/lib/prov/libtcp-fi.so b/ofi/lib/prov/libtcp-fi.so
index 741e90301..fbeaf72cb 100755
Binary files a/ofi/lib/prov/libtcp-fi.so and b/ofi/lib/prov/libtcp-fi.so differ
diff --git a/ofi/lib/prov/libverbs-fi.so b/ofi/lib/prov/libverbs-fi.so
index fb1dc7cd4..0803b1680 100755
Binary files a/ofi/lib/prov/libverbs-fi.so and b/ofi/lib/prov/libverbs-fi.so differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 737ab770c..d5a25c0dd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -19,8 +19,6 @@ set (EXTENSIONS_SRC)
 
 if (CCL_ENABLE_SYCL)
     list (APPEND EXTENSIONS_SRC
-                    ccl_cpp_gpu_api.cpp
-
                     native_device_api/l0/utils.cpp
                     native_device_api/sycl/export.cpp
                     native_device_api/interop_utils.cpp
@@ -32,7 +30,6 @@ endif(CCL_ENABLE_SYCL)
 
 if (MULTI_GPU_SUPPORT)
 list (APPEND EXTENSIONS_SRC
-                    ccl_cpp_gpu_api.cpp
                     ccl_gpu_modules.cpp
                     ccl_cpp_utils.cpp
 
@@ -82,15 +79,16 @@ list (APPEND EXTENSIONS_SRC
 endif(MULTI_GPU_SUPPORT)
 
 set(CCL_SRC
-    ccl.cpp
-    ccl_cpp_api.cpp
     ccl_cpp_communicator.cpp
     ccl_cpp_environment.cpp
     ccl_api_functions.cpp
     ccl_app_api_coll_attr.cpp
+    ccl_app_api_comm_attr.cpp
     ccl_app_api_comm_split_attr.cpp
     ccl_app_api_datatype_attr.cpp
+    ccl_app_api_kvs_attr.cpp
     ccl_app_api_event.cpp
+    ccl_app_api_init_attr.cpp
     ccl_cpp_kvs.cpp
     ccl_cpp_device.cpp
     ccl_cpp_stream.cpp
@@ -98,6 +96,10 @@ set(CCL_SRC
     ccl_cpp_utils.cpp
     ccl_empty_attr.cpp
     ccl_empty_coll_attr.cpp
+    ccl_empty_comm_attr.cpp
+    ccl_empty_init_attr.cpp
+    ccl_empty_comm_split_attr.cpp
+    ccl_empty_kvs_attr.cpp
     ccl_empty_stream.cpp
     native_device_api/sycl_l0/export.cpp
     native_device_api/empty/export.cpp
@@ -108,12 +110,12 @@ set(CCL_SRC
     atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp
-    atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.c
+    atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
-    atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.c
-    atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.c
+    atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.cpp
+    atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
-    atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.c
+    atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.cpp
         atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp
     atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp
     atl/util/pm/pmi_rt/pmi_simple.cpp
@@ -165,6 +167,7 @@ set(CCL_SRC
     sched/entry/coll/coll_entry_helper.cpp
     sched/entry/entry.cpp
     sched/entry/factory/chunked_entry_factory.cpp
+    sched/entry/sycl_entry_helper.cpp
     exec/exec.cpp
     exec/thread/base_thread.cpp
     exec/thread/listener.cpp
@@ -173,6 +176,7 @@ set(CCL_SRC
     fusion/fusion.cpp
     parallelizer/parallelizer.cpp
     unordered_coll/unordered_coll.cpp
+
     common/comm/atl_tag.cpp
     common/comm/comm.cpp
     common/comm/comm_interface.cpp
@@ -182,7 +186,6 @@ set(CCL_SRC
     common/datatype/datatype.cpp
     common/device/device.cpp
     common/event/ccl_event.cpp
-    common/event/event_internal/event_internal.cpp
     common/stream/stream.cpp
 
     common/env/env.cpp
@@ -192,7 +195,9 @@ set(CCL_SRC
     common/event/impls/native_event.cpp
     common/request/request.cpp
     common/utils/spinlock.cpp
+    common/utils/version.cpp
     common/utils/yield.cpp
+
     ${EXTENSIONS_SRC})
 
 list(APPEND CCL_INC_DIRS
@@ -248,7 +253,7 @@ install(FILES
 add_library(ccl-static STATIC $<TARGET_OBJECTS:ccl-objects>)
 set_target_properties(ccl-static PROPERTIES OUTPUT_NAME ccl)
 set_target_properties(ccl-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
-install(TARGETS ccl-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB})
+install(TARGETS ccl-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB} OPTIONAL)
 
 if(MULTI_GPU_SUPPORT)
     message("Turn on L0 multi-gpu unit tests")
diff --git a/src/atl/CMakeLists.txt b/src/atl/CMakeLists.txt
index bbf54f42c..e80b7b4c1 100644
--- a/src/atl/CMakeLists.txt
+++ b/src/atl/CMakeLists.txt
@@ -13,54 +13,54 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-#builds ccl_atl_ofi
-
-add_subdirectory(mpi)
-
-add_subdirectory(util/pm/pmi_rt/pmi)
-add_subdirectory(util/pm/pmi_resizable_rt/pmi_resizable)
-
-set(OFI_SRC
-    ofi/atl_ofi.c
-    util/pm/pmi_rt/pmi_rt.c
-    util/pm/pmi_resizable_rt/pmi_resizable_rt.c)
-
-set(COMMON_OFI_INC_DIRS
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${CMAKE_CURRENT_SOURCE_DIR}/util/pm
-    ${CMAKE_CURRENT_SOURCE_DIR}/util/pm/codec
-    ${CMAKE_CURRENT_SOURCE_DIR}/util/pm/pmi_rt
-    ${CMAKE_CURRENT_SOURCE_DIR}/util/pm/pmi_resizable_rt
-    ${LIBFABRIC_INCLUDE_DIR})
-
-#special library that holds objects only
-add_library(ccl_atl_ofi-objects OBJECT ${OFI_SRC})
-set_target_properties(ccl_atl_ofi-objects PROPERTIES POSITION_INDEPENDENT_CODE 1)
-target_include_directories(ccl_atl_ofi-objects PRIVATE ${COMMON_OFI_INC_DIRS})
-target_include_directories(ccl_atl_ofi-objects PRIVATE $<TARGET_PROPERTY:pmi,INTERFACE_INCLUDE_DIRECTORIES>)
-target_include_directories(ccl_atl_ofi-objects PRIVATE $<TARGET_PROPERTY:resizable_pmi,INTERFACE_INCLUDE_DIRECTORIES>)
-
-#add library search directory
-
-#shared
-add_library(ccl_atl_ofi SHARED $<TARGET_OBJECTS:ccl_atl_ofi-objects>)
-target_include_directories(ccl_atl_ofi PRIVATE ${COMMON_OFI_INC_DIRS})
-
-target_link_libraries(ccl_atl_ofi PRIVATE pmi)
-target_link_libraries(ccl_atl_ofi PRIVATE resizable_pmi)
-target_link_libraries(ccl_atl_ofi PRIVATE fabric m)
-
-if (NOT LIB_ATL_OFI_SO_VERSION AND NOT LIB_ATL_OFI_MAJOR_VERSION)
-        set_target_properties(ccl_atl_ofi PROPERTIES VERSION 1 SOVERSION 1.0)
-else()
-        set_target_properties(ccl_atl_ofi PROPERTIES VERSION ${LIB_ATL_OFI_SO_VERSION} SOVERSION ${LIB_ATL_OFI_MAJOR_VERSION})
-endif()
-
-set_target_properties(ccl_atl_ofi PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
-install(TARGETS ccl_atl_ofi LIBRARY DESTINATION ${CCL_INSTALL_LIB})
-
-#static
-add_library(ccl_atl_ofi-static STATIC $<TARGET_OBJECTS:ccl_atl_ofi-objects>)
-set_target_properties(ccl_atl_ofi-static PROPERTIES OUTPUT_NAME ccl_atl_ofi)
-set_target_properties(ccl_atl_ofi-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
-install(TARGETS ccl_atl_ofi-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB})
+# Builds the ccl_atl_ofi library (shared and static variants).
+
+add_subdirectory(mpi)
+
+add_subdirectory(util/pm/pmi_rt/pmi)
+add_subdirectory(util/pm/pmi_resizable_rt/pmi_resizable)
+
+set(OFI_SRC
+    ofi/atl_ofi.c
+    util/pm/pmi_rt/pmi_rt.c
+    util/pm/pmi_resizable_rt/pmi_resizable_rt.c)
+
+set(COMMON_OFI_INC_DIRS
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/util/pm
+    ${CMAKE_CURRENT_SOURCE_DIR}/util/pm/codec
+    ${CMAKE_CURRENT_SOURCE_DIR}/util/pm/pmi_rt
+    ${CMAKE_CURRENT_SOURCE_DIR}/util/pm/pmi_resizable_rt
+    ${LIBFABRIC_INCLUDE_DIR})
+
+# Object library: sources are compiled once and reused by the shared and static targets below.
+add_library(ccl_atl_ofi-objects OBJECT ${OFI_SRC})
+set_target_properties(ccl_atl_ofi-objects PROPERTIES POSITION_INDEPENDENT_CODE 1)
+target_include_directories(ccl_atl_ofi-objects PRIVATE ${COMMON_OFI_INC_DIRS})
+target_include_directories(ccl_atl_ofi-objects PRIVATE $<TARGET_PROPERTY:pmi,INTERFACE_INCLUDE_DIRECTORIES>)
+target_include_directories(ccl_atl_ofi-objects PRIVATE $<TARGET_PROPERTY:resizable_pmi,INTERFACE_INCLUDE_DIRECTORIES>)
+
+#add library search directory
+
+#shared
+add_library(ccl_atl_ofi SHARED $<TARGET_OBJECTS:ccl_atl_ofi-objects>)
+target_include_directories(ccl_atl_ofi PRIVATE ${COMMON_OFI_INC_DIRS})
+
+target_link_libraries(ccl_atl_ofi PRIVATE pmi)
+target_link_libraries(ccl_atl_ofi PRIVATE resizable_pmi)
+target_link_libraries(ccl_atl_ofi PRIVATE fabric m)
+
+if (NOT LIB_ATL_OFI_SO_VERSION AND NOT LIB_ATL_OFI_MAJOR_VERSION)
+        set_target_properties(ccl_atl_ofi PROPERTIES VERSION 1 SOVERSION 1.0)
+else()
+        set_target_properties(ccl_atl_ofi PROPERTIES VERSION ${LIB_ATL_OFI_SO_VERSION} SOVERSION ${LIB_ATL_OFI_MAJOR_VERSION})
+endif()
+
+set_target_properties(ccl_atl_ofi PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
+install(TARGETS ccl_atl_ofi LIBRARY DESTINATION ${CCL_INSTALL_LIB})
+
+#static
+add_library(ccl_atl_ofi-static STATIC $<TARGET_OBJECTS:ccl_atl_ofi-objects>)
+set_target_properties(ccl_atl_ofi-static PROPERTIES OUTPUT_NAME ccl_atl_ofi)
+set_target_properties(ccl_atl_ofi-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
+install(TARGETS ccl_atl_ofi-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB})
diff --git a/src/atl/atl.cpp b/src/atl/atl.cpp
index 71776f9db..5b95447d0 100644
--- a/src/atl/atl.cpp
+++ b/src/atl/atl.cpp
@@ -26,7 +26,7 @@
 #define ATL_LIB_PREFIX "libccl_atl_"
 
 static int initialized = 0;
-static int is_main_addr_reserv = 0;
+static int should_reserve_addr = 0;
 
 static int atl_lib_filter(const struct dirent* entry) {
     size_t entry_len = strlen(entry->d_name);
@@ -106,8 +106,8 @@ static void atl_ini_dir(const char* transport_name,
                 continue;
             }
 
-            if (is_main_addr_reserv) {
-                ret = transport.main_addr_reserv(const_cast<char*>(main_addr));
+            if (should_reserve_addr) {
+                ret = transport.reserve_addr(const_cast<char*>(main_addr));
             }
             else {
                 ret = transport.init(argc, argv, attr, ctx, main_addr);
@@ -228,8 +228,8 @@ atl_status_t atl_init(const char* transport_name,
     return ATL_STATUS_FAILURE;
 }
 
-void atl_main_addr_reserv(char* main_addr) {
-    is_main_addr_reserv = 1;
+void atl_main_addr_reserve(char* main_addr) {
+    should_reserve_addr = 1;
     atl_init("ofi", NULL, NULL, NULL, NULL, main_addr);
-    is_main_addr_reserv = 0;
+    should_reserve_addr = 0;
 }
diff --git a/src/atl/atl.h b/src/atl/atl.h
index 006bf3a48..2fa6a4e93 100644
--- a/src/atl/atl.h
+++ b/src/atl/atl.h
@@ -29,7 +29,7 @@ atl_status_t atl_init(const char* transport_name,
                       atl_ctx_t** ctx,
                       const char* main_addr);
 
-void atl_main_addr_reserv(char* main_addr);
+void atl_main_addr_reserve(char* main_addr);
 
 static inline atl_status_t atl_finalize(atl_ctx_t* ctx) {
     return ctx->ops->finalize(ctx);
@@ -70,7 +70,7 @@ static inline atl_status_t atl_mr_dereg(atl_ctx_t* ctx, atl_mr_t* mr) {
 static inline atl_status_t atl_ep_send(atl_ep_t* ep,
                                        const void* buf,
                                        size_t len,
-                                       size_t dst_proc_idx,
+                                       int dst_proc_idx,
                                        uint64_t tag,
                                        atl_req_t* req) {
     return ep->p2p_ops->send(ep, buf, len, dst_proc_idx, tag, req);
@@ -79,14 +79,14 @@ static inline atl_status_t atl_ep_send(atl_ep_t* ep,
 static inline atl_status_t atl_ep_recv(atl_ep_t* ep,
                                        void* buf,
                                        size_t len,
-                                       size_t src_proc_idx,
+                                       int src_proc_idx,
                                        uint64_t tag,
                                        atl_req_t* req) {
     return ep->p2p_ops->recv(ep, buf, len, src_proc_idx, tag, req);
 }
 
 static inline atl_status_t atl_ep_probe(atl_ep_t* ep,
-                                        size_t src_proc_idx,
+                                        int src_proc_idx,
                                         uint64_t tag,
                                         int* found,
                                         size_t* recv_len) {
@@ -140,7 +140,7 @@ static inline atl_status_t atl_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
 static inline atl_status_t atl_ep_bcast(atl_ep_t* ep,
                                         void* buf,
                                         size_t len,
-                                        size_t root,
+                                        int root,
                                         atl_req_t* req) {
     return ep->coll_ops->bcast(ep, buf, len, root, req);
 }
@@ -149,7 +149,7 @@ static inline atl_status_t atl_ep_reduce(atl_ep_t* ep,
                                          const void* send_buf,
                                          void* recv_buf,
                                          size_t len,
-                                         size_t root,
+                                         int root,
                                          atl_datatype_t dtype,
                                          atl_reduction_t op,
                                          atl_req_t* req) {
@@ -162,7 +162,7 @@ static inline atl_status_t atl_ep_read(atl_ep_t* ep,
                                        atl_mr_t* mr,
                                        uint64_t addr,
                                        uintptr_t remote_key,
-                                       size_t dst_proc_idx,
+                                       int dst_proc_idx,
                                        atl_req_t* req) {
     return ep->rma_ops->read(ep, buf, len, mr, addr, remote_key, dst_proc_idx, req);
 }
@@ -173,7 +173,7 @@ static inline atl_status_t atl_ep_write(atl_ep_t* ep,
                                         atl_mr_t* mr,
                                         uint64_t addr,
                                         uintptr_t remote_key,
-                                        size_t dst_proc_idx,
+                                        int dst_proc_idx,
                                         atl_req_t* req) {
     return ep->rma_ops->write(ep, buf, len, mr, addr, remote_key, dst_proc_idx, req);
 }
@@ -226,19 +226,19 @@ class iatl {
     virtual atl_status_t atl_ep_send(atl_ep_t* ep,
                                      const void* buf,
                                      size_t len,
-                                     size_t dst_proc_idx,
+                                     int dst_proc_idx,
                                      uint64_t tag,
                                      atl_req_t* req) = 0;
 
     virtual atl_status_t atl_ep_recv(atl_ep_t* ep,
                                      void* buf,
                                      size_t len,
-                                     size_t src_proc_idx,
+                                     int src_proc_idx,
                                      uint64_t tag,
                                      atl_req_t* req) = 0;
 
     virtual atl_status_t atl_ep_probe(atl_ep_t* ep,
-                                      size_t src_proc_idx,
+                                      int src_proc_idx,
                                       uint64_t tag,
                                       int* found,
                                       size_t* recv_len) = 0;
@@ -279,14 +279,14 @@ class iatl {
     virtual atl_status_t atl_ep_bcast(atl_ep_t* ep,
                                       void* buf,
                                       size_t len,
-                                      size_t root,
+                                      int root,
                                       atl_req_t* req) = 0;
 
     virtual atl_status_t atl_ep_reduce(atl_ep_t* ep,
                                        const void* send_buf,
                                        void* recv_buf,
                                        size_t len,
-                                       size_t root,
+                                       int root,
                                        atl_datatype_t dtype,
                                        atl_reduction_t op,
                                        atl_req_t* req) = 0;
@@ -305,7 +305,7 @@ class iatl {
                                      atl_mr_t* mr,
                                      uint64_t addr,
                                      uintptr_t remote_key,
-                                     size_t dst_proc_idx,
+                                     int dst_proc_idx,
                                      atl_req_t* req) = 0;
 
     virtual atl_status_t atl_ep_write(atl_ep_t* ep,
@@ -314,7 +314,7 @@ class iatl {
                                       atl_mr_t* mr,
                                       uint64_t addr,
                                       uintptr_t remote_key,
-                                      size_t dst_proc_idx,
+                                      int dst_proc_idx,
                                       atl_req_t* req) = 0;
 
     virtual atl_status_t atl_ep_wait(atl_ep_t* ep, atl_req_t* req) = 0;
@@ -326,5 +326,6 @@ class iatl {
     virtual atl_status_t atl_ep_poll(atl_ep_t* ep) = 0;
 
     virtual atl_status_t atl_ep_check(atl_ep_t* ep, int* is_completed, atl_req_t* req) = 0;
+    virtual bool is_inited() = 0;
 };
 #endif
diff --git a/src/atl/atl_def.h b/src/atl/atl_def.h
index 1d5f5cc9a..b068b7019 100644
--- a/src/atl/atl_def.h
+++ b/src/atl/atl_def.h
@@ -54,7 +54,7 @@ typedef enum { ATL_PROGRESS_POLL, ATL_PROGRESS_CHECK } atl_progress_mode_t;
 
 typedef enum { ATL_RA_WAIT, ATL_RA_RUN, ATL_RA_FINALIZE } atl_resize_action_t;
 
-typedef atl_resize_action_t (*atl_resize_fn_t)(size_t size);
+typedef atl_resize_action_t (*atl_resize_fn_t)(int size);
 
 typedef enum {
     ATL_STATUS_SUCCESS,
@@ -116,10 +116,10 @@ typedef struct {
 } atl_mr_t;
 
 typedef struct {
-    size_t global_idx;
-    size_t global_count;
-    size_t local_idx;
-    size_t local_count;
+    int global_idx;
+    int global_count;
+    int local_idx;
+    int local_count;
 } atl_proc_coord_t;
 
 typedef struct {
@@ -132,7 +132,7 @@ typedef struct {
     const char* name;
     atl_status_t (
         *init)(int* argc, char*** argv, atl_attr_t* attr, atl_ctx_t** ctx, const char* main_addr);
-    atl_status_t (*main_addr_reserv)(char* main_addr);
+    atl_status_t (*reserve_addr)(char* main_addr);
 } atl_transport_t;
 
 typedef struct {
@@ -165,17 +165,13 @@ typedef struct {
     atl_status_t (*send)(atl_ep_t* ep,
                          const void* buf,
                          size_t len,
-                         size_t dst_proc_idx,
-                         uint64_t tag,
-                         atl_req_t* req);
-    atl_status_t (*recv)(atl_ep_t* ep,
-                         void* buf,
-                         size_t len,
-                         size_t src_proc_idx,
+                         int dst_proc_idx,
                          uint64_t tag,
                          atl_req_t* req);
     atl_status_t (
-        *probe)(atl_ep_t* ep, size_t src_proc_idx, uint64_t tag, int* found, size_t* recv_len);
+        *recv)(atl_ep_t* ep, void* buf, size_t len, int src_proc_idx, uint64_t tag, atl_req_t* req);
+    atl_status_t (
+        *probe)(atl_ep_t* ep, int src_proc_idx, uint64_t tag, int* found, size_t* recv_len);
 } atl_p2p_ops_t;
 
 typedef struct {
@@ -205,12 +201,12 @@ typedef struct {
                               const int* recv_offsets,
                               atl_req_t* req);
     atl_status_t (*barrier)(atl_ep_t* ep, atl_req_t* req);
-    atl_status_t (*bcast)(atl_ep_t* ep, void* buf, size_t len, size_t root, atl_req_t* req);
+    atl_status_t (*bcast)(atl_ep_t* ep, void* buf, size_t len, int root, atl_req_t* req);
     atl_status_t (*reduce)(atl_ep_t* ep,
                            const void* send_buf,
                            void* recv_buf,
                            size_t count,
-                           size_t root,
+                           int root,
                            atl_datatype_t dtype,
                            atl_reduction_t op,
                            atl_req_t* req);
@@ -230,7 +226,7 @@ typedef struct {
                          atl_mr_t* mr,
                          uint64_t addr,
                          uintptr_t remote_key,
-                         size_t dst_proc_idx,
+                         int dst_proc_idx,
                          atl_req_t* req);
     atl_status_t (*write)(atl_ep_t* ep,
                           const void* buf,
@@ -238,7 +234,7 @@ typedef struct {
                           atl_mr_t* mr,
                           uint64_t addr,
                           uintptr_t remote_key,
-                          size_t dst_proc_idx,
+                          int dst_proc_idx,
                           atl_req_t* req);
 } atl_rma_ops_t;
 
diff --git a/src/atl/atl_wrapper.cpp b/src/atl/atl_wrapper.cpp
index 7f89d24ba..df2878ad8 100644
--- a/src/atl/atl_wrapper.cpp
+++ b/src/atl/atl_wrapper.cpp
@@ -35,8 +35,7 @@ atl_attr_t atl_wrapper::attr = {
     0 /* extra_ep */
 };
 
-void atl_wrapper::set_internal_env(const atl_attr_t& attr)
-{
+void atl_wrapper::set_internal_env(const atl_attr_t& attr) {
     auto transport_type = ccl::global_data::env().atl_transport;
 
     if (transport_type == ccl_atl_mpi)
@@ -46,12 +45,10 @@ void atl_wrapper::set_internal_env(const atl_attr_t& attr)
 }
 
 atl_wrapper::atl_wrapper() {
-
     auto transport_type = ccl::global_data::env().atl_transport;
 
     char* pm_type_str;
-    switch (transport_type)
-    {
+    switch (transport_type) {
         case ccl_atl_ofi:
             pm_type_str = getenv(PM_TYPE);
             if (pm_type_str) {
@@ -71,24 +68,18 @@ atl_wrapper::atl_wrapper() {
             }
             transport = std::shared_ptr<iatl>(new atl_ofi());
             break;
-        case ccl_atl_mpi:
-            transport = std::shared_ptr<iatl>(new atl_mpi());
-            break;
-        default:
-            LOG_ERROR("Unsupported yet");
-            break;
+        case ccl_atl_mpi: transport = std::shared_ptr<iatl>(new atl_mpi()); break;
+        default: LOG_ERROR("Unsupported yet"); break;
     }
 
     init_transport();
 }
 
 atl_wrapper::atl_wrapper(std::shared_ptr<ikvs_wrapper> k) {
-
     auto transport_type = ccl::global_data::env().atl_transport;
 
     char* pm_type_str;
-    switch (transport_type)
-    {
+    switch (transport_type) {
         case ccl_atl_ofi:
             pm_type_str = getenv(PM_TYPE);
             if (pm_type_str) {
@@ -107,70 +98,80 @@ atl_wrapper::atl_wrapper(std::shared_ptr<ikvs_wrapper> k) {
             }
             transport = std::shared_ptr<iatl>(new atl_ofi());
             break;
-        case ccl_atl_mpi:
-            transport = std::shared_ptr<iatl>(new atl_mpi());
-            break;
-        default:
-            LOG_ERROR("Unsupported yet");
-            break;
+        case ccl_atl_mpi: transport = std::shared_ptr<iatl>(new atl_mpi()); break;
+        default: LOG_ERROR("Unsupported yet"); break;
     }
 
     init_transport();
 }
 
-atl_wrapper::atl_wrapper(size_t dev_count,
-                         const std::vector<size_t> &ranks,
+atl_wrapper::atl_wrapper(int total_rank_count,
+                         const std::vector<int>& ranks,
                          std::shared_ptr<ikvs_wrapper> k) {
     auto transport_type = ccl::global_data::env().atl_transport;
 
-    switch (transport_type)
-    {
-        case ccl_atl_ofi:
-            pmi = std::unique_ptr<ipmi>(new pmi_resizable_simple(dev_count, ranks, k));
+    switch (transport_type) {
+        case ccl_atl_ofi: {
+            // The wait loop below relies on another thread appending to `transports`, so
+            // every access to the list in this constructor is serialized through this mutex.
+            static std::mutex memory_mutex;
+            size_t transports_count;
+            {
+                std::lock_guard<std::mutex> lock(memory_mutex);
+                transports_count = transports.size();
+            }
+            pmi = std::unique_ptr<ipmi>(new pmi_resizable_simple(total_rank_count, ranks, k));
 
-            if (pmi->get_thread() == 0) {
+            if (pmi->get_local_thread_idx() == 0) {
+                std::lock_guard<std::mutex> lock(memory_mutex);
                 transports.push_back(std::shared_ptr<iatl>(new atl_ofi()));
             }
-            pmi->pmrt_barrier();
-            static std::mutex memory_mutex;
+            // TODO: replace this polling loop with a proper barrier.
+            // Spin (yielding between polls) until a new transport appears in `transports`.
+            while (true) {
+                {
+                    std::lock_guard<std::mutex> lock(memory_mutex);
+                    if (transports_count != transports.size())
+                        break;
+                }
+                ccl_yield(ccl::global_data::env().yield_type);
+            }
             {
                 std::lock_guard<std::mutex> lock(memory_mutex);
                 transport = transports.back();
             }
-            break;
-        case ccl_atl_mpi:
-             transport = std::shared_ptr<iatl>(new atl_mpi());
-             break;
-        default:
-            LOG_ERROR("Unsupported yet");
-            break;
+        } break;
+        case ccl_atl_mpi: transport = std::shared_ptr<iatl>(new atl_mpi()); break;
+        default: LOG_ERROR("Unsupported yet"); break;
     }
 
     init_transport();
 }
 void atl_wrapper::init_transport() {
-
     LOG_INFO("init ATL, requested ep_count ", attr.ep_count);
-
-    transport->atl_init(nullptr, nullptr, &attr, nullptr, pmi);
+    static std::mutex memory_mutex;
+    {
+        std::lock_guard<std::mutex> lock(memory_mutex);
+        if (!transport->is_inited())
+            transport->atl_init(nullptr, nullptr, &attr, nullptr, pmi);
+    }
     eps = transport->atl_get_eps();
     tag = std::unique_ptr<ccl_atl_tag>(new ccl_atl_tag(attr.tag_bits, attr.max_tag));
 
     if (pmi) {
-        threads_count = pmi->get_threads_count();
-        devices_per_rank_count = pmi->get_devices_per_rank_count();
+        threads_per_process = pmi->get_threads_per_process();
+        ranks_per_process = pmi->get_ranks_per_process();
         rank = pmi->get_rank();
         size = pmi->get_size();
     }
     else {
-        threads_count = 1;
-        devices_per_rank_count = 1;
-        rank = static_cast<atl_mpi *>(transport.get())->get_rank();
-        size = static_cast<atl_mpi *>(transport.get())->get_size();
+        threads_per_process = 1;
+        ranks_per_process = 1;
+        rank = static_cast<atl_mpi*>(transport.get())->get_rank();
+        size = static_cast<atl_mpi*>(transport.get())->get_size();
     }
 
-    if (rank == 0)
-    {
+    if (rank == 0) {
         tag->print();
 
         LOG_INFO("\n",
diff --git a/src/atl/atl_wrapper.h b/src/atl/atl_wrapper.h
index 4886a6de8..6891a53f2 100644
--- a/src/atl/atl_wrapper.h
+++ b/src/atl/atl_wrapper.h
@@ -28,14 +28,13 @@
 
 class atl_wrapper {
 public:
-
     static void set_internal_env(const atl_attr_t& attr);
 
     ~atl_wrapper();
     atl_wrapper();
     atl_wrapper(std::shared_ptr<ikvs_wrapper> k);
-    atl_wrapper(size_t dev_count,
-                const std::vector<size_t>& ranks,
+    atl_wrapper(int total_rank_count,
+                const std::vector<int>& ranks,
                 std::shared_ptr<ikvs_wrapper> k);
 
     //    atl_status_t
@@ -46,11 +45,11 @@ class atl_wrapper {
     //        return transport->atl_init(argc, argv, att, main_addr, pmi);
     //    }
 
-    atl_status_t atl_main_addr_reserv(char* main_addr) {
+    atl_status_t atl_main_addr_reserve(char* main_addr) {
         if (!pmi)
             return ATL_STATUS_UNSUPPORTED;
 
-        return pmi->pmrt_main_addr_reserv(main_addr);
+        return pmi->pmrt_main_addr_reserve(main_addr);
         ;
     }
 
@@ -98,7 +97,7 @@ class atl_wrapper {
     atl_status_t atl_ep_send(size_t ep_idx,
                              const void* buf,
                              size_t len,
-                             size_t dst_proc_idx,
+                             int dst_proc_idx,
                              uint64_t tag,
                              atl_req_t* req) {
         return transport->atl_ep_send(eps[ep_idx], buf, len, dst_proc_idx, tag, req);
@@ -107,14 +106,14 @@ class atl_wrapper {
     atl_status_t atl_ep_recv(size_t ep_idx,
                              void* buf,
                              size_t len,
-                             size_t src_proc_idx,
+                             int src_proc_idx,
                              uint64_t tag,
                              atl_req_t* req) {
         return transport->atl_ep_recv(eps[ep_idx], buf, len, src_proc_idx, tag, req);
     }
 
     atl_status_t atl_ep_probe(size_t ep_idx,
-                              size_t src_proc_idx,
+                              int src_proc_idx,
                               uint64_t tag,
                               int* found,
                               size_t* recv_len) {
@@ -166,7 +165,7 @@ class atl_wrapper {
         return transport->atl_ep_barrier(eps[ep_idx], req);
     }
 
-    atl_status_t atl_ep_bcast(size_t ep_idx, void* buf, size_t len, size_t root, atl_req_t* req) {
+    atl_status_t atl_ep_bcast(size_t ep_idx, void* buf, size_t len, int root, atl_req_t* req) {
         return transport->atl_ep_bcast(eps[ep_idx], buf, len, root, req);
     }
 
@@ -174,7 +173,7 @@ class atl_wrapper {
                                const void* send_buf,
                                void* recv_buf,
                                size_t len,
-                               size_t root,
+                               int root,
                                atl_datatype_t dtype,
                                atl_reduction_t op,
                                atl_req_t* req) {
@@ -188,7 +187,8 @@ class atl_wrapper {
                                        atl_datatype_t dtype,
                                        atl_reduction_t op,
                                        atl_req_t* req) {
-        return transport->atl_ep_reduce_scatter(eps[ep_idx], send_buf, recv_buf, recv_len, dtype, op, req);
+        return transport->atl_ep_reduce_scatter(
+            eps[ep_idx], send_buf, recv_buf, recv_len, dtype, op, req);
     }
 
     atl_status_t atl_ep_read(size_t ep_idx,
@@ -197,7 +197,7 @@ class atl_wrapper {
                              atl_mr_t* mr,
                              uint64_t addr,
                              uintptr_t remote_key,
-                             size_t dst_proc_idx,
+                             int dst_proc_idx,
                              atl_req_t* req) {
         return transport->atl_ep_read(
             eps[ep_idx], buf, len, mr, addr, remote_key, dst_proc_idx, req);
@@ -209,7 +209,7 @@ class atl_wrapper {
                               atl_mr_t* mr,
                               uint64_t addr,
                               uintptr_t remote_key,
-                              size_t dst_proc_idx,
+                              int dst_proc_idx,
                               atl_req_t* req) {
         return transport->atl_ep_write(
             eps[ep_idx], buf, len, mr, addr, remote_key, dst_proc_idx, req);
@@ -235,22 +235,30 @@ class atl_wrapper {
         return transport->atl_ep_check(eps[ep_idx], is_completed, req);
     }
 
-    size_t get_threads_count() {
-        return threads_count;
+    size_t get_threads_per_process() {
+        return threads_per_process;
     }
 
-    size_t get_devices_per_rank_count() {
-        return devices_per_rank_count;
+    size_t get_ranks_per_process() {
+        return ranks_per_process;
     }
 
-    size_t get_rank() {
+    int get_rank() {
         return rank;
     }
 
-    size_t get_size() {
+    int get_size() {
         return size;
     }
 
+    /*
+     * TODO: temporary stub that always returns 0.
+     * A correct unique id still needs to be defined.
+     */
+    size_t get_id() {
+        return 0;
+    }
+
     /* static ATL attr for all transport instances
        actual values generated by executor */
     static atl_attr_t attr;
@@ -258,15 +266,15 @@ class atl_wrapper {
     std::unique_ptr<ccl_atl_tag> tag;
 
 private:
+    int rank;
+    int size;
+
+    size_t threads_per_process;
+    size_t ranks_per_process;
 
     std::shared_ptr<iatl> transport;
     std::unique_ptr<ipmi> pmi;
-    
-
     atl_ep_t** eps = nullptr;
-    size_t threads_count;
-    size_t devices_per_rank_count;
-    size_t rank;
-    size_t size;
+
     void init_transport();
 };
diff --git a/src/atl/mpi/CMakeLists.txt b/src/atl/mpi/CMakeLists.txt
index decf51c3e..ab733cbed 100644
--- a/src/atl/mpi/CMakeLists.txt
+++ b/src/atl/mpi/CMakeLists.txt
@@ -13,42 +13,42 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
-set(MPI_SRC
-        atl_mpi.c)
-
-set(COMMON_MPI_INC_DIRS
-    ${CMAKE_CURRENT_SOURCE_DIR}/../
-    ${PROJECT_SOURCE_DIR}/mpi/include/
-    ${PROJECT_SOURCE_DIR}/src/)
-
-# special library that holds objects only
-add_library(ccl_atl_mpi-objects OBJECT ${MPI_SRC})
-set_target_properties(ccl_atl_mpi-objects PROPERTIES POSITION_INDEPENDENT_CODE 1)
-target_include_directories(ccl_atl_mpi-objects PRIVATE ${COMMON_MPI_INC_DIRS})
-
-# add library search directory
-link_directories(${PROJECT_SOURCE_DIR}/mpi/lib)
-
-# shared
-add_library(ccl_atl_mpi SHARED $<TARGET_OBJECTS:ccl_atl_mpi-objects>)
-target_include_directories(ccl_atl_mpi PRIVATE ${COMMON_MPI_INC_DIRS})
-
-target_link_libraries(ccl_atl_mpi PRIVATE mpi)
-
-# link with release_mt libmpi.so for oneAPI Base toolkit
-set_target_properties(ccl_atl_mpi PROPERTIES LINK_FLAGS "-Wl,-rpath,../../../../mpi/latest/lib/release_mt/")
-if (NOT LIB_ATL_MPI_SO_VERSION AND NOT LIB_ATL_MPI_MAJOR_VERSION)
-        set_target_properties(ccl_atl_mpi PROPERTIES VERSION 1 SOVERSION 1.0)
-else()
-        set_target_properties(ccl_atl_mpi PROPERTIES VERSION ${LIB_ATL_MPI_SO_VERSION} SOVERSION ${LIB_ATL_MPI_MAJOR_VERSION})
-endif()
-
-set_target_properties(ccl_atl_mpi PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
-install(TARGETS ccl_atl_mpi LIBRARY DESTINATION ${CCL_INSTALL_LIB})
-
-# static
-add_library(ccl_atl_mpi-static STATIC $<TARGET_OBJECTS:ccl_atl_mpi-objects>)
-set_target_properties(ccl_atl_mpi-static PROPERTIES OUTPUT_NAME ccl_atl_mpi)
-set_target_properties(ccl_atl_mpi-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
-install(TARGETS ccl_atl_mpi-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB})
+
+set(MPI_SRC
+        atl_mpi.c)
+
+set(COMMON_MPI_INC_DIRS
+    ${CMAKE_CURRENT_SOURCE_DIR}/../
+    ${PROJECT_SOURCE_DIR}/mpi/include/
+    ${PROJECT_SOURCE_DIR}/src/)
+
+# special library that holds objects only
+add_library(ccl_atl_mpi-objects OBJECT ${MPI_SRC})
+set_target_properties(ccl_atl_mpi-objects PROPERTIES POSITION_INDEPENDENT_CODE 1)
+target_include_directories(ccl_atl_mpi-objects PRIVATE ${COMMON_MPI_INC_DIRS})
+
+# add library search directory
+link_directories(${PROJECT_SOURCE_DIR}/mpi/lib)
+
+# shared
+add_library(ccl_atl_mpi SHARED $<TARGET_OBJECTS:ccl_atl_mpi-objects>)
+target_include_directories(ccl_atl_mpi PRIVATE ${COMMON_MPI_INC_DIRS})
+
+target_link_libraries(ccl_atl_mpi PRIVATE mpi)
+
+# link with release_mt libmpi.so for oneAPI Base toolkit
+set_target_properties(ccl_atl_mpi PROPERTIES LINK_FLAGS "-Wl,-rpath,../../../../mpi/latest/lib/release_mt/")
+if (NOT LIB_ATL_MPI_SO_VERSION AND NOT LIB_ATL_MPI_MAJOR_VERSION)
+        set_target_properties(ccl_atl_mpi PROPERTIES VERSION 1 SOVERSION 1.0)
+else()
+        set_target_properties(ccl_atl_mpi PROPERTIES VERSION ${LIB_ATL_MPI_SO_VERSION} SOVERSION ${LIB_ATL_MPI_MAJOR_VERSION})
+endif()
+
+set_target_properties(ccl_atl_mpi PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
+install(TARGETS ccl_atl_mpi LIBRARY DESTINATION ${CCL_INSTALL_LIB})
+
+# static
+add_library(ccl_atl_mpi-static STATIC $<TARGET_OBJECTS:ccl_atl_mpi-objects>)
+set_target_properties(ccl_atl_mpi-static PROPERTIES OUTPUT_NAME ccl_atl_mpi)
+set_target_properties(ccl_atl_mpi-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
+install(TARGETS ccl_atl_mpi-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB})
diff --git a/src/atl/mpi/atl_mpi.c b/src/atl/mpi/atl_mpi.c
index 9c530d1aa..65570c124 100644
--- a/src/atl/mpi/atl_mpi.c
+++ b/src/atl/mpi/atl_mpi.c
@@ -182,9 +182,9 @@ static inline void atl_mpi_check_op_params(void* in_buf,
 }
 
 static void INLINE_TARGET_ATTRIBUTE_ALL atl_mpi_bf16_base_op(void* in,
-                                                              void* inout,
-                                                              int* length,
-                                                              ccl_bf16_reduction_func_ptr op) {
+                                                             void* inout,
+                                                             int* length,
+                                                             ccl_bf16_reduction_func_ptr op) {
     unsigned short* in_buf = (unsigned short*)in;
     unsigned short* inout_buf = (unsigned short*)inout;
 
@@ -194,33 +194,33 @@ static void INLINE_TARGET_ATTRIBUTE_ALL atl_mpi_bf16_base_op(void* in,
 
 // MPI BF16 operation definitions
 static void TARGET_ATTRIBUTE_ALL atl_mpi_bf16_sum_op(void* in,
-                                                      void* inout,
-                                                      int* length,
-                                                      MPI_Datatype* datatype) {
+                                                     void* inout,
+                                                     int* length,
+                                                     MPI_Datatype* datatype) {
     atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
     atl_mpi_bf16_base_op(in, inout, length, &sum_wrap);
 }
 
 static void TARGET_ATTRIBUTE_ALL atl_mpi_bf16_prod_op(void* in,
-                                                       void* inout,
-                                                       int* length,
-                                                       MPI_Datatype* datatype) {
+                                                      void* inout,
+                                                      int* length,
+                                                      MPI_Datatype* datatype) {
     atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
     atl_mpi_bf16_base_op(in, inout, length, &prod_wrap);
 }
 
 static void TARGET_ATTRIBUTE_ALL atl_mpi_bf16_min_op(void* in,
-                                                      void* inout,
-                                                      int* length,
-                                                      MPI_Datatype* datatype) {
+                                                     void* inout,
+                                                     int* length,
+                                                     MPI_Datatype* datatype) {
     atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
     atl_mpi_bf16_base_op(in, inout, length, &min_wrap);
 }
 
 static void TARGET_ATTRIBUTE_ALL atl_mpi_bf16_max_op(void* in,
-                                                      void* inout,
-                                                      int* length,
-                                                      MPI_Datatype* datatype) {
+                                                     void* inout,
+                                                     int* length,
+                                                     MPI_Datatype* datatype) {
     atl_mpi_check_op_params(in, inout, length, datatype, __FUNCTION__);
     atl_mpi_bf16_base_op(in, inout, length, &max_wrap);
 }
@@ -509,7 +509,6 @@ atl_mpi_lib_type_t atl_mpi_get_lib_type() {
 }
 
 atl_status_t atl_mpi_set_env(const atl_attr_t& attr) {
-
     char mpi_ep_count_str[EP_IDX_MAX_STR_LEN] = { 0 };
 
     /* we have endpoints on MPI and ATL levels */
@@ -598,7 +597,7 @@ static atl_status_t atl_mpi_mr_dereg(atl_ctx_t* ctx, atl_mr_t* mr) {
 static atl_status_t atl_mpi_ep_send(atl_ep_t* ep,
                                     const void* buf,
                                     size_t len,
-                                    size_t dest_proc_idx,
+                                    int dst_proc_idx,
                                     uint64_t tag,
                                     atl_req_t* req) {
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
@@ -606,7 +605,7 @@ static atl_status_t atl_mpi_ep_send(atl_ep_t* ep,
     mpi_req->comp_state = ATL_MPI_COMP_POSTED;
 
     int ret = MPI_Isend(
-        buf, len, MPI_CHAR, dest_proc_idx, (int)tag, mpi_ep->mpi_comm, &mpi_req->native_req);
+        buf, len, MPI_CHAR, dst_proc_idx, (int)tag, mpi_ep->mpi_comm, &mpi_req->native_req);
 
 #if 0
 //#ifdef ENABLE_DEBUG
@@ -641,7 +640,7 @@ static atl_status_t atl_mpi_ep_send(atl_ep_t* ep,
 static atl_status_t atl_mpi_ep_recv(atl_ep_t* ep,
                                     void* buf,
                                     size_t len,
-                                    size_t src_proc_idx,
+                                    int src_proc_idx,
                                     uint64_t tag,
                                     atl_req_t* req) {
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
@@ -682,7 +681,7 @@ static atl_status_t atl_mpi_ep_recv(atl_ep_t* ep,
 }
 
 static atl_status_t atl_mpi_ep_probe(atl_ep_t* ep,
-                                     size_t src_proc_idx,
+                                     int src_proc_idx,
                                      uint64_t tag,
                                      int* found,
                                      size_t* recv_len) {
@@ -716,8 +715,7 @@ static atl_status_t atl_mpi_ep_allgatherv(atl_ep_t* ep,
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
     atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
 
-    if (global_data.sync_coll)
-    {
+    if (global_data.sync_coll) {
         ret = MPI_Allgatherv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                              send_len,
                              MPI_CHAR,
@@ -729,8 +727,7 @@ static atl_status_t atl_mpi_ep_allgatherv(atl_ep_t* ep,
         mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
-    else
-    {
+    else {
         ret = MPI_Iallgatherv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                               send_len,
                               MPI_CHAR,
@@ -761,8 +758,7 @@ static atl_status_t atl_mpi_ep_allreduce(atl_ep_t* ep,
     MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
     MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
 
-    if (global_data.sync_coll)
-    {
+    if (global_data.sync_coll) {
         ret = MPI_Allreduce((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                             recv_buf,
                             count,
@@ -772,8 +768,7 @@ static atl_status_t atl_mpi_ep_allreduce(atl_ep_t* ep,
         mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
-    else
-    {
+    else {
         ret = MPI_Iallreduce((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                              recv_buf,
                              count,
@@ -825,8 +820,7 @@ static atl_status_t atl_mpi_ep_alltoall(atl_ep_t* ep,
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
     atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
 
-    if (global_data.sync_coll)
-    {
+    if (global_data.sync_coll) {
         ret = MPI_Alltoall((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                            len,
                            MPI_CHAR,
@@ -837,8 +831,7 @@ static atl_status_t atl_mpi_ep_alltoall(atl_ep_t* ep,
         mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
-    else
-    {
+    else {
         ret = MPI_Ialltoall((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                             len,
                             MPI_CHAR,
@@ -866,8 +859,7 @@ static atl_status_t atl_mpi_ep_alltoallv(atl_ep_t* ep,
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
     atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
 
-    if (global_data.sync_coll)
-    {
+    if (global_data.sync_coll) {
         ret = MPI_Alltoallv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                             send_lens,
                             send_offsets,
@@ -880,8 +872,7 @@ static atl_status_t atl_mpi_ep_alltoallv(atl_ep_t* ep,
         mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
-    else
-    {
+    else {
         ret = MPI_Ialltoallv((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
                              send_lens,
                              send_offsets,
@@ -899,20 +890,17 @@ static atl_status_t atl_mpi_ep_alltoallv(atl_ep_t* ep,
 }
 
 static atl_status_t atl_mpi_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
-
     int ret = MPI_SUCCESS;
 
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
     atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
 
-    if (global_data.sync_coll)
-    {
+    if (global_data.sync_coll) {
         ret = MPI_Barrier(mpi_ep->mpi_comm);
         mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
-    else
-    {
+    else {
         ret = MPI_Ibarrier(mpi_ep->mpi_comm, &mpi_req->native_req);
         mpi_req->comp_state = ATL_MPI_COMP_POSTED;
     }
@@ -923,21 +911,19 @@ static atl_status_t atl_mpi_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
 static atl_status_t atl_mpi_ep_bcast(atl_ep_t* ep,
                                      void* buf,
                                      size_t len,
-                                     size_t root,
+                                     int root,
                                      atl_req_t* req) {
     int ret = MPI_SUCCESS;
 
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
     atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
 
-    if (global_data.sync_coll)
-    {
+    if (global_data.sync_coll) {
         ret = MPI_Bcast(buf, len, MPI_CHAR, root, mpi_ep->mpi_comm);
         mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
-    else
-    {
+    else {
         ret = MPI_Ibcast(buf, len, MPI_CHAR, root, mpi_ep->mpi_comm, &mpi_req->native_req);
         mpi_req->comp_state = ATL_MPI_COMP_POSTED;
     }
@@ -949,7 +935,7 @@ static atl_status_t atl_mpi_ep_reduce(atl_ep_t* ep,
                                       const void* send_buf,
                                       void* recv_buf,
                                       size_t count,
-                                      size_t root,
+                                      int root,
                                       atl_datatype_t dtype,
                                       atl_reduction_t op,
                                       atl_req_t* req) {
@@ -958,25 +944,23 @@ static atl_status_t atl_mpi_ep_reduce(atl_ep_t* ep,
     atl_mpi_ep_t* mpi_ep = container_of(ep, atl_mpi_ep_t, ep);
     atl_mpi_req_t* mpi_req = ((atl_mpi_req_t*)req->internal);
 
-    size_t my_proc_idx = ep->ctx->coord.global_idx;
+    int my_proc_idx = ep->ctx->coord.global_idx;
     MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
     MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
 
-    if (global_data.sync_coll)
-    {
+    if (global_data.sync_coll) {
         ret = MPI_Reduce(
-                (send_buf && (send_buf == recv_buf) && (root == my_proc_idx)) ? MPI_IN_PLACE : send_buf,
-                recv_buf,
-                count,
-                mpi_dtype,
-                mpi_op,
-                root,
-                mpi_ep->mpi_comm);
+            (send_buf && (send_buf == recv_buf) && (root == my_proc_idx)) ? MPI_IN_PLACE : send_buf,
+            recv_buf,
+            count,
+            mpi_dtype,
+            mpi_op,
+            root,
+            mpi_ep->mpi_comm);
         mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
-    else
-    {
+    else {
         ret = MPI_Ireduce(
             (send_buf && (send_buf == recv_buf) && (root == my_proc_idx)) ? MPI_IN_PLACE : send_buf,
             recv_buf,
@@ -1007,20 +991,18 @@ static atl_status_t atl_mpi_ep_reduce_scatter(atl_ep_t* ep,
     MPI_Datatype mpi_dtype = atl2mpi_dtype(dtype);
     MPI_Op mpi_op = atl2mpi_op(op, mpi_dtype);
 
-    if (global_data.sync_coll)
-    {
-        ret = MPI_Reduce_scatter_block(
-            (send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
-            recv_buf,
-            recv_count,
-            mpi_dtype,
-            mpi_op,
-            mpi_ep->mpi_comm);
+    if (global_data.sync_coll) {
+        ret =
+            MPI_Reduce_scatter_block((send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
+                                     recv_buf,
+                                     recv_count,
+                                     mpi_dtype,
+                                     mpi_op,
+                                     mpi_ep->mpi_comm);
         mpi_req->comp_state = ATL_MPI_COMP_COMPLETED;
         mpi_req->native_req = MPI_REQUEST_NULL;
     }
-    else
-    {
+    else {
         ret = MPI_Ireduce_scatter_block(
             (send_buf && (send_buf == recv_buf)) ? MPI_IN_PLACE : send_buf,
             recv_buf,
@@ -1041,7 +1023,7 @@ static atl_status_t atl_mpi_ep_read(atl_ep_t* ep,
                                     atl_mr_t* mr,
                                     uint64_t addr,
                                     uintptr_t r_key,
-                                    size_t dest_proc_idx,
+                                    int dst_proc_idx,
                                     atl_req_t* req) {
     return ATL_STATUS_UNSUPPORTED;
 }
@@ -1052,7 +1034,7 @@ static atl_status_t atl_mpi_ep_write(atl_ep_t* ep,
                                      atl_mr_t* mr,
                                      uint64_t addr,
                                      uintptr_t r_key,
-                                     size_t dest_proc_idx,
+                                     int dst_proc_idx,
                                      atl_req_t* req) {
     return ATL_STATUS_UNSUPPORTED;
 }
@@ -1144,7 +1126,6 @@ static atl_comp_ops_t atl_mpi_ep_comp_ops = { .wait = atl_mpi_ep_wait,
                                               .check = atl_mpi_ep_check };
 
 static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t** ep) {
-
     int ret;
     ssize_t mpi_ep_idx = idx;
 
@@ -1160,8 +1141,9 @@ static atl_status_t atl_mpi_ep_init(atl_mpi_ctx_t* mpi_ctx, size_t idx, atl_ep_t
     MPI_Info_create(&info);
 
     char mpi_ep_idx_str[EP_IDX_MAX_STR_LEN];
-    
-    if (global_data.extra_ep) mpi_ep_idx += global_data.extra_ep;
+
+    if (global_data.extra_ep)
+        mpi_ep_idx += global_data.extra_ep;
     memset(mpi_ep_idx_str, 0, EP_IDX_MAX_STR_LEN);
     snprintf(mpi_ep_idx_str, EP_IDX_MAX_STR_LEN, "%zu", mpi_ep_idx);
 
@@ -1235,7 +1217,8 @@ static atl_status_t atl_mpi_init(int* argc,
         ret = MPI_Init_thread(argc, argv, required_thread_level, &provided_thread_level);
         if (provided_thread_level < required_thread_level) {
             ATL_MPI_PRINT("unexpected MPI thread level: requested %d, provided %d",
-                required_thread_level, provided_thread_level);
+                          required_thread_level,
+                          provided_thread_level);
             goto err_init;
         }
     }
@@ -1245,8 +1228,9 @@ static atl_status_t atl_mpi_init(int* argc,
         MPI_Query_thread(&provided_thread_level);
         if (provided_thread_level < required_thread_level) {
             ATL_MPI_PRINT("MPI was initialized externaly but with unexpected thread level: "
-                "requested %d, provided %d",
-                required_thread_level, provided_thread_level);
+                          "requested %d, provided %d",
+                          required_thread_level,
+                          provided_thread_level);
             goto err_init;
         }
     }
@@ -1333,14 +1317,14 @@ static atl_status_t atl_mpi_init(int* argc,
     return ATL_STATUS_FAILURE;
 }
 
-atl_status_t atl_mpi_main_addr_reserv(char* main_addr) {
+atl_status_t atl_mpi_main_addr_reserve(char* main_addr) {
     return ATL_STATUS_UNSUPPORTED;
 }
 
 ATL_MPI_INI {
     atl_transport->name = "mpi";
     atl_transport->init = atl_mpi_init;
-    atl_transport->main_addr_reserv = atl_mpi_main_addr_reserv;
+    atl_transport->reserve_addr = atl_mpi_main_addr_reserve;
     return ATL_STATUS_SUCCESS;
 }
 #ifdef __cplusplus
diff --git a/src/atl/mpi/atl_mpi.cpp b/src/atl/mpi/atl_mpi.cpp
index 8f23b32e6..29ec6099d 100644
--- a/src/atl/mpi/atl_mpi.cpp
+++ b/src/atl/mpi/atl_mpi.cpp
@@ -16,8 +16,7 @@
 #include "atl_mpi.h"
 #include "atl_mpi.c"
 
-atl_status_t atl_mpi::atl_set_env(const atl_attr_t& attr)
-{
+atl_status_t atl_mpi::atl_set_env(const atl_attr_t& attr) {
     return atl_mpi_set_env(attr);
 }
 
@@ -27,6 +26,7 @@ atl_status_t atl_mpi::atl_init(int* argc,
                                const char* main_addr,
                                std::unique_ptr<ipmi>& pmi) {
     (void)pmi;
+    inited = true;
     return atl_mpi_init(argc, argv, attr, &ctx, main_addr);
 }
 
@@ -63,7 +63,7 @@ atl_status_t atl_mpi::atl_mr_dereg(atl_mr_t* mr) {
 atl_status_t atl_mpi::atl_ep_send(atl_ep_t* ep,
                                   const void* buf,
                                   size_t len,
-                                  size_t dst_proc_idx,
+                                  int dst_proc_idx,
                                   uint64_t tag,
                                   atl_req_t* req) {
     return atl_mpi_ep_send(ep, buf, len, dst_proc_idx, tag, req);
@@ -72,14 +72,14 @@ atl_status_t atl_mpi::atl_ep_send(atl_ep_t* ep,
 atl_status_t atl_mpi::atl_ep_recv(atl_ep_t* ep,
                                   void* buf,
                                   size_t len,
-                                  size_t src_proc_idx,
+                                  int src_proc_idx,
                                   uint64_t tag,
                                   atl_req_t* req) {
     return atl_mpi_ep_recv(ep, buf, len, src_proc_idx, tag, req);
 }
 
 atl_status_t atl_mpi::atl_ep_probe(atl_ep_t* ep,
-                                   size_t src_proc_idx,
+                                   int src_proc_idx,
                                    uint64_t tag,
                                    int* found,
                                    size_t* recv_len) {
@@ -130,11 +130,7 @@ atl_status_t atl_mpi::atl_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
     return atl_mpi_ep_barrier(ep, req);
 }
 
-atl_status_t atl_mpi::atl_ep_bcast(atl_ep_t* ep,
-                                   void* buf,
-                                   size_t len,
-                                   size_t root,
-                                   atl_req_t* req) {
+atl_status_t atl_mpi::atl_ep_bcast(atl_ep_t* ep, void* buf, size_t len, int root, atl_req_t* req) {
     return atl_mpi_ep_bcast(ep, buf, len, root, req);
 }
 
@@ -142,7 +138,7 @@ atl_status_t atl_mpi::atl_ep_reduce(atl_ep_t* ep,
                                     const void* send_buf,
                                     void* recv_buf,
                                     size_t len,
-                                    size_t root,
+                                    int root,
                                     atl_datatype_t dtype,
                                     atl_reduction_t op,
                                     atl_req_t* req) {
@@ -165,7 +161,7 @@ atl_status_t atl_mpi::atl_ep_read(atl_ep_t* ep,
                                   atl_mr_t* mr,
                                   uint64_t addr,
                                   uintptr_t remote_key,
-                                  size_t dst_proc_idx,
+                                  int dst_proc_idx,
                                   atl_req_t* req) {
     return atl_mpi_ep_read(ep, buf, len, mr, addr, remote_key, dst_proc_idx, req);
 }
@@ -176,7 +172,7 @@ atl_status_t atl_mpi::atl_ep_write(atl_ep_t* ep,
                                    atl_mr_t* mr,
                                    uint64_t addr,
                                    uintptr_t remote_key,
-                                   size_t dst_proc_idx,
+                                   int dst_proc_idx,
                                    atl_req_t* req) {
     return atl_mpi_ep_write(ep, buf, len, mr, addr, remote_key, dst_proc_idx, req);
 }
diff --git a/src/atl/mpi/atl_mpi.h b/src/atl/mpi/atl_mpi.h
index a4941f500..a4aab3a47 100644
--- a/src/atl/mpi/atl_mpi.h
+++ b/src/atl/mpi/atl_mpi.h
@@ -43,19 +43,19 @@ class atl_mpi final : public iatl {
     atl_status_t atl_ep_send(atl_ep_t* ep,
                              const void* buf,
                              size_t len,
-                             size_t dst_proc_idx,
+                             int dst_proc_idx,
                              uint64_t tag,
                              atl_req_t* req) override;
 
     atl_status_t atl_ep_recv(atl_ep_t* ep,
                              void* buf,
                              size_t len,
-                             size_t src_proc_idx,
+                             int src_proc_idx,
                              uint64_t tag,
                              atl_req_t* req) override;
 
     atl_status_t atl_ep_probe(atl_ep_t* ep,
-                              size_t src_proc_idx,
+                              int src_proc_idx,
                               uint64_t tag,
                               int* found,
                               size_t* recv_len) override;
@@ -96,14 +96,14 @@ class atl_mpi final : public iatl {
     atl_status_t atl_ep_bcast(atl_ep_t* ep,
                               void* buf,
                               size_t len,
-                              size_t root,
+                              int root,
                               atl_req_t* req) override;
 
     atl_status_t atl_ep_reduce(atl_ep_t* ep,
                                const void* send_buf,
                                void* recv_buf,
                                size_t len,
-                               size_t root,
+                               int root,
                                atl_datatype_t dtype,
                                atl_reduction_t op,
                                atl_req_t* req) override;
@@ -122,7 +122,7 @@ class atl_mpi final : public iatl {
                              atl_mr_t* mr,
                              uint64_t addr,
                              uintptr_t remote_key,
-                             size_t dst_proc_idx,
+                             int dst_proc_idx,
                              atl_req_t* req) override;
 
     atl_status_t atl_ep_write(atl_ep_t* ep,
@@ -131,7 +131,7 @@ class atl_mpi final : public iatl {
                               atl_mr_t* mr,
                               uint64_t addr,
                               uintptr_t remote_key,
-                              size_t dst_proc_idx,
+                              int dst_proc_idx,
                               atl_req_t* req) override;
 
     atl_status_t atl_ep_wait(atl_ep_t* ep, atl_req_t* req) override;
@@ -146,14 +146,18 @@ class atl_mpi final : public iatl {
 
     atl_status_t atl_finalize() override;
 
-    size_t get_rank() {
+    int get_rank() {
         return ctx->coord.global_idx;
     }
-    size_t get_size() {
+    int get_size() {
         return ctx->coord.global_count;
     }
+    bool is_inited() override {
+        return inited;
+    }
 
 private:
     atl_ctx_t* ctx = nullptr;
     bool is_finalized{ false };
+    bool inited{ false };
 };
diff --git a/src/atl/ofi/atl_ofi.c b/src/atl/ofi/atl_ofi.c
index 702fb0817..8ca55b11f 100644
--- a/src/atl/ofi/atl_ofi.c
+++ b/src/atl/ofi/atl_ofi.c
@@ -196,7 +196,7 @@ typedef struct {
     /* table[0..proc_count][0..ep_count] */
     fi_addr_t* addr_table;
     size_t addr_len;
-    size_t first_proc_idx;
+    int first_proc_idx;
 
 } atl_ofi_prov_t;
 
@@ -229,16 +229,14 @@ typedef struct {
 } atl_ofi_req_t;
 
 static void atl_ofi_print_coord(atl_proc_coord_t* coord) {
-    ATL_OFI_DEBUG_PRINT("coord: global [idx %zu, cnt %zu], local [idx %zu, cnt %zu]",
+    ATL_OFI_DEBUG_PRINT("coord: global [idx %d, cnt %d], local [idx %d, cnt %d]",
                         coord->global_idx,
                         coord->global_count,
                         coord->local_idx,
                         coord->local_count);
 }
 
-static inline atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep,
-                                               size_t peer_proc_idx,
-                                               size_t msg_size) {
+static inline atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep, int peer_proc_idx, size_t msg_size) {
     size_t prov_idx;
     atl_ofi_ctx_t* ofi_ctx = container_of(ep->ctx, atl_ofi_ctx_t, ctx);
 
@@ -251,8 +249,8 @@ static inline atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep,
                        ofi_ctx->prov_count);
 
         atl_proc_coord_t* coord = &(ep->ctx->coord);
-        size_t my_node_idx = coord->global_idx / coord->local_count;
-        size_t peer_node_idx = peer_proc_idx / coord->local_count;
+        int my_node_idx = coord->global_idx / coord->local_count;
+        int peer_node_idx = peer_proc_idx / coord->local_count;
 
         if ((my_node_idx == peer_node_idx) &&
             (msg_size <= ofi_ctx->provs[ofi_ctx->shm_prov_idx].max_msg_size))
@@ -273,7 +271,7 @@ static inline atl_ofi_prov_t* atl_ofi_get_prov(atl_ep_t* ep,
 
 static inline fi_addr_t atl_ofi_get_addr(atl_ctx_t* ctx,
                                          atl_ofi_prov_t* prov,
-                                         size_t proc_idx,
+                                         int proc_idx,
                                          size_t ep_idx) {
     return *(prov->addr_table + ((ctx->ep_count * (proc_idx - prov->first_proc_idx)) + ep_idx));
 }
@@ -284,12 +282,12 @@ static atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, ipmi* p
     atl_proc_coord_t* coord = &(ofi_ctx->ctx.coord);
 
     atl_status_t ret = ATL_STATUS_SUCCESS;
-    size_t i;
-    size_t local_idx = 0, local_count = 0;
+    int i;
+    int local_idx = 0, local_count = 0;
     char* all_hostnames = NULL;
     char my_hostname[ATL_OFI_MAX_HOSTNAME_LEN] = { 0 };
     size_t my_hostname_len = 0;
-    size_t my_global_proc_idx = coord->global_idx;
+    int my_global_proc_idx = coord->global_idx;
 
     gethostname(my_hostname, ATL_OFI_MAX_HOSTNAME_LEN - 1);
     my_hostname_len = strlen(my_hostname);
@@ -306,7 +304,7 @@ static atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, ipmi* p
 
     snprintf(my_hostname + my_hostname_len,
              ATL_OFI_MAX_HOSTNAME_LEN - my_hostname_len,
-             "-%zu",
+             "-%d",
              my_global_proc_idx);
 
     ret = pmi->pmrt_kvs_put((char*)ATL_OFI_HOSTNAME_PM_KEY,
@@ -343,9 +341,9 @@ static atl_status_t atl_ofi_get_local_proc_coord(atl_ofi_ctx_t* ofi_ctx, ipmi* p
                      all_hostnames + i * ATL_OFI_MAX_HOSTNAME_LEN,
                      my_hostname_len + 1 /* including "-" at the end */)) {
             local_count++;
-            size_t peer_global_proc_idx;
+            int peer_global_proc_idx;
             sscanf(all_hostnames + i * ATL_OFI_MAX_HOSTNAME_LEN + my_hostname_len + 1,
-                   "%zu",
+                   "%d",
                    &peer_global_proc_idx);
             if (my_global_proc_idx > peer_global_proc_idx)
                 local_idx++;
@@ -372,8 +370,9 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
     atl_ctx_t* ctx = &(ofi_ctx->ctx);
     atl_ofi_prov_t* prov = &(ofi_ctx->provs[prov_idx]);
 
-    atl_status_t ret;
-    size_t i, j;
+    atl_status_t ret = ATL_STATUS_SUCCESS;
+    int i;
+    size_t j;
     int insert_count;
 
     size_t addr_idx = 0;
@@ -382,20 +381,20 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
 
     size_t named_ep_count = (prov->sep ? 1 : ctx->ep_count);
 
-    size_t local_count = ctx->coord.local_count;
-    size_t node_idx = ctx->coord.global_idx / local_count;
-    size_t shm_start_idx = node_idx * local_count;
-    size_t shm_end_idx = (node_idx + 1) * local_count;
+    int local_count = ctx->coord.local_count;
+    int node_idx = ctx->coord.global_idx / local_count;
+    int shm_start_idx = node_idx * local_count;
+    int shm_end_idx = (node_idx + 1) * local_count;
 
-    ATL_OFI_DEBUG_PRINT("shm_start_idx %zu, shm_end_idx %zu", shm_start_idx, shm_end_idx);
+    ATL_OFI_DEBUG_PRINT("shm_start_idx %d, shm_end_idx %d", shm_start_idx, shm_end_idx);
 
-    size_t proc_count = prov->is_shm ? ctx->coord.local_count : ctx->coord.global_count;
+    int proc_count = prov->is_shm ? ctx->coord.local_count : ctx->coord.global_count;
 
     if (proc_count == 0)
         return ATL_STATUS_SUCCESS;
 
     ATL_OFI_DEBUG_PRINT(
-        "name %s, is_shm %d, addr_len %zu, local_count %zu, global_count %zu, proc_count %zu",
+        "name %s, is_shm %d, addr_len %zu, local_count %d, global_count %d, proc_count %d",
         prov->info->fabric_attr->prov_name,
         prov->is_shm,
         prov->addr_len,
@@ -407,7 +406,7 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
     epnames_table_len = prov->addr_len * named_ep_count * proc_count;
 
     if (epnames_table_len == 0) {
-        ATL_OFI_PRINT("epnames_table_len == 0, addr_len %zu, named_ep_count %zu, proc_count %zu",
+        ATL_OFI_PRINT("epnames_table_len == 0, addr_len %zu, named_ep_count %zu, proc_count %d",
                       prov->addr_len,
                       named_ep_count,
                       proc_count);
@@ -438,7 +437,7 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
                 prov->addr_len);
 
             if (ret) {
-                ATL_OFI_PRINT("kvs_get error: ret %d, proc_idx %zu, ep_idx %zu, addr_idx %zu",
+                ATL_OFI_PRINT("kvs_get error: ret %d, proc_idx %d, ep_idx %zu, addr_idx %zu",
                               ret,
                               i,
                               j,
@@ -451,7 +450,7 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
     }
 
     ATL_OFI_DEBUG_PRINT(
-        "kvs_get: ep_count %zu, proc_count %zu, got %zu", named_ep_count, proc_count, addr_idx);
+        "kvs_get: ep_count %zu, proc_count %d, got %zu", named_ep_count, proc_count, addr_idx);
 
     if (addr_idx != named_ep_count * proc_count) {
         ATL_OFI_PRINT("unexpected kvs_get results: expected %zu, got %zu",
@@ -474,7 +473,7 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
     insert_count = fi_av_insert(
         prov->av, epnames_table, named_ep_count * proc_count, prov->addr_table, 0, NULL);
 
-    ATL_OFI_DEBUG_PRINT("av_insert: ep_count %zu, proc_count %zu, inserted %d",
+    ATL_OFI_DEBUG_PRINT("av_insert: ep_count %zu, proc_count %d, inserted %d",
                         named_ep_count,
                         proc_count,
                         insert_count);
@@ -495,6 +494,11 @@ static atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t* ofi_ctx,
 
         fi_addr_t* table;
         table = (fi_addr_t*)calloc(1, proc_count * sizeof(fi_addr_t));
+        if (table == NULL) {
+            ATL_OFI_DEBUG_PRINT("Memory allocation failed");
+            ret = ATL_STATUS_FAILURE;
+            goto err_ep_names;
+        }
         memcpy(table, prov->addr_table, proc_count * sizeof(fi_addr_t));
 
         for (i = 0; i < proc_count; i++) {
@@ -736,7 +740,6 @@ static int atl_ofi_wait_cancel_cq(struct fid_cq* cq) {
 }
 
 static atl_status_t atl_ofi_prov_ep_init(atl_ofi_prov_t* prov, size_t ep_idx) {
-
     ssize_t ret = 0;
 
     struct fi_cq_attr cq_attr;
@@ -903,13 +906,16 @@ static atl_status_t atl_ofi_set_env(const atl_attr_t& attr) {
 }
 
 static atl_status_t atl_ofi_adjust_env(atl_ofi_ctx_t* ofi_ctx, const atl_attr_t& attr) {
-    
     atl_ofi_set_env(attr);
 
     char* prov_env = getenv("FI_PROVIDER");
 
     if (prov_env && strlen(prov_env)) {
         ofi_ctx->prov_env_copy = (char*)calloc(strlen(prov_env) + 1, sizeof(char));
+        if (ofi_ctx->prov_env_copy == NULL) {
+            ATL_OFI_DEBUG_PRINT("Memory allocation failed");
+            return ATL_STATUS_FAILURE;
+        }
         memcpy(ofi_ctx->prov_env_copy, prov_env, strlen(prov_env));
     }
     else
@@ -925,8 +931,11 @@ static atl_status_t atl_ofi_adjust_env(atl_ofi_ctx_t* ofi_ctx, const atl_attr_t&
                                         (single_prov ? 0 : 1) + /* for delimeter */
                                         1; /* for terminating null symbol */
 
-            char* prov_env_copy;
-            prov_env_copy = (char*)calloc(prov_env_copy_size, sizeof(char));
+            char* prov_env_copy = (char*)calloc(prov_env_copy_size, sizeof(char));
+            if (prov_env_copy == NULL) {
+                ATL_OFI_DEBUG_PRINT("Memory allocation failed");
+                return ATL_STATUS_FAILURE;
+            }
 
             if (single_prov)
                 snprintf(prov_env_copy, prov_env_copy_size, "%s", ATL_OFI_SHM_PROV_NAME);
@@ -1019,7 +1028,7 @@ static atl_status_t atl_ofi_ep_wait(atl_ep_t* ep, atl_req_t* req);
 static atl_status_t atl_ofi_ep_send(atl_ep_t* ep,
                                     const void* buf,
                                     size_t len,
-                                    size_t dst_proc_idx,
+                                    int dst_proc_idx,
                                     uint64_t tag,
                                     atl_req_t* req) {
     ssize_t ret;
@@ -1054,7 +1063,7 @@ static atl_status_t atl_ofi_ep_send(atl_ep_t* ep,
 static atl_status_t atl_ofi_ep_recv(atl_ep_t* ep,
                                     void* buf,
                                     size_t len,
-                                    size_t src_proc_idx,
+                                    int src_proc_idx,
                                     uint64_t tag,
                                     atl_req_t* req) {
     ssize_t ret;
@@ -1088,7 +1097,7 @@ static atl_status_t atl_ofi_ep_recv(atl_ep_t* ep,
 }
 
 static atl_status_t atl_ofi_ep_probe(atl_ep_t* ep,
-                                     size_t src_proc_idx,
+                                     int src_proc_idx,
                                      uint64_t tag,
                                      int* found,
                                      size_t* recv_len) {
@@ -1242,7 +1251,7 @@ static atl_status_t atl_ofi_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
 static atl_status_t atl_ofi_ep_bcast(atl_ep_t* ep,
                                      void* buf,
                                      size_t len,
-                                     size_t root,
+                                     int root,
                                      atl_req_t* req) {
     return ATL_STATUS_UNSUPPORTED;
 }
@@ -1251,7 +1260,7 @@ static atl_status_t atl_ofi_ep_reduce(atl_ep_t* ep,
                                       const void* send_buf,
                                       void* recv_buf,
                                       size_t count,
-                                      size_t root,
+                                      int root,
                                       atl_datatype_t dtype,
                                       atl_reduction_t op,
                                       atl_req_t* req) {
@@ -1274,7 +1283,7 @@ static atl_status_t atl_ofi_ep_read(atl_ep_t* ep,
                                     atl_mr_t* mr,
                                     uint64_t addr,
                                     uintptr_t remote_key,
-                                    size_t dst_proc_idx,
+                                    int dst_proc_idx,
                                     atl_req_t* req) {
     ssize_t ret;
 
@@ -1312,7 +1321,7 @@ static atl_status_t atl_ofi_ep_write(atl_ep_t* ep,
                                      atl_mr_t* mr,
                                      uint64_t addr,
                                      uintptr_t remote_key,
-                                     size_t dst_proc_idx,
+                                     int dst_proc_idx,
                                      atl_req_t* req) {
     ssize_t ret;
 
@@ -1570,7 +1579,7 @@ static atl_status_t atl_ofi_init(int* argc,
     if (prov_env && !strcmp(prov_env, ATL_OFI_SHM_PROV_NAME)) {
         ATL_OFI_ASSERT(
             coord->global_count == coord->local_count,
-            "shm provider is requested as primary provider but global_count (%zu) != local_count (%zu)",
+            "shm provider is requested as primary provider but global_count (%d) != local_count (%d)",
             coord->global_count,
             coord->local_count);
 
@@ -1886,7 +1895,7 @@ static atl_status_t atl_ofi_init(int* argc,
 ATL_OFI_INI {
     atl_transport->name = "ofi";
     atl_transport->init = atl_ofi_init;
-    atl_transport->main_addr_reserv = atl_ofi_main_addr_reserv;
+    atl_transport->reserve_addr = atl_ofi_main_addr_reserve;
     return ATL_STATUS_SUCCESS;
 }
 #endif
diff --git a/src/atl/ofi/atl_ofi.cpp b/src/atl/ofi/atl_ofi.cpp
index 1a64d0dc6..29a616550 100644
--- a/src/atl/ofi/atl_ofi.cpp
+++ b/src/atl/ofi/atl_ofi.cpp
@@ -16,8 +16,7 @@
 #include "atl_ofi.h"
 #include "atl_ofi.c"
 
-atl_status_t atl_ofi::atl_set_env(const atl_attr_t& attr)
-{
+atl_status_t atl_ofi::atl_set_env(const atl_attr_t& attr) {
     return atl_ofi_set_env(attr);
 }
 
@@ -26,6 +25,7 @@ atl_status_t atl_ofi::atl_init(int* argc,
                                atl_attr_t* attr,
                                const char* main_addr,
                                std::unique_ptr<ipmi>& pmi) {
+    inited = true;
     return atl_ofi_init(argc, argv, attr, &ctx, main_addr, pmi.get());
 }
 
@@ -62,7 +62,7 @@ atl_status_t atl_ofi::atl_update(std::unique_ptr<ipmi>& pmi) {
 
     if (ofi_ctx->prov_count == 1 && ofi_ctx->provs[0].is_shm) {
         ATL_OFI_ASSERT(coord->global_count == coord->local_count,
-                       "unexpected coord after update: global_count %zu, local_count %zu",
+                       "unexpected coord after update: global_count %d, local_count %d",
                        coord->global_count,
                        coord->local_count);
         /* TODO: recreate providers */
@@ -104,7 +104,7 @@ atl_status_t atl_ofi::atl_mr_dereg(atl_mr_t* mr) {
 atl_status_t atl_ofi::atl_ep_send(atl_ep_t* ep,
                                   const void* buf,
                                   size_t len,
-                                  size_t dst_proc_idx,
+                                  int dst_proc_idx,
                                   uint64_t tag,
                                   atl_req_t* req) {
     return atl_ofi_ep_send(ep, buf, len, dst_proc_idx, tag, req);
@@ -113,14 +113,14 @@ atl_status_t atl_ofi::atl_ep_send(atl_ep_t* ep,
 atl_status_t atl_ofi::atl_ep_recv(atl_ep_t* ep,
                                   void* buf,
                                   size_t len,
-                                  size_t src_proc_idx,
+                                  int src_proc_idx,
                                   uint64_t tag,
                                   atl_req_t* req) {
     return atl_ofi_ep_recv(ep, buf, len, src_proc_idx, tag, req);
 }
 
 atl_status_t atl_ofi::atl_ep_probe(atl_ep_t* ep,
-                                   size_t src_proc_idx,
+                                   int src_proc_idx,
                                    uint64_t tag,
                                    int* found,
                                    size_t* recv_len) {
@@ -171,11 +171,7 @@ atl_status_t atl_ofi::atl_ep_barrier(atl_ep_t* ep, atl_req_t* req) {
     return atl_ofi_ep_barrier(ep, req);
 }
 
-atl_status_t atl_ofi::atl_ep_bcast(atl_ep_t* ep,
-                                   void* buf,
-                                   size_t len,
-                                   size_t root,
-                                   atl_req_t* req) {
+atl_status_t atl_ofi::atl_ep_bcast(atl_ep_t* ep, void* buf, size_t len, int root, atl_req_t* req) {
     return atl_ofi_ep_bcast(ep, buf, len, root, req);
 }
 
@@ -183,7 +179,7 @@ atl_status_t atl_ofi::atl_ep_reduce(atl_ep_t* ep,
                                     const void* send_buf,
                                     void* recv_buf,
                                     size_t len,
-                                    size_t root,
+                                    int root,
                                     atl_datatype_t dtype,
                                     atl_reduction_t op,
                                     atl_req_t* req) {
@@ -206,7 +202,7 @@ atl_status_t atl_ofi::atl_ep_read(atl_ep_t* ep,
                                   atl_mr_t* mr,
                                   uint64_t addr,
                                   uintptr_t remote_key,
-                                  size_t dst_proc_idx,
+                                  int dst_proc_idx,
                                   atl_req_t* req) {
     return atl_ofi_ep_read(ep, buf, len, mr, addr, remote_key, dst_proc_idx, req);
 }
@@ -217,7 +213,7 @@ atl_status_t atl_ofi::atl_ep_write(atl_ep_t* ep,
                                    atl_mr_t* mr,
                                    uint64_t addr,
                                    uintptr_t remote_key,
-                                   size_t dst_proc_idx,
+                                   int dst_proc_idx,
                                    atl_req_t* req) {
     return atl_ofi_ep_write(ep, buf, len, mr, addr, remote_key, dst_proc_idx, req);
 }
diff --git a/src/atl/ofi/atl_ofi.h b/src/atl/ofi/atl_ofi.h
index bba00f10b..fd093ceb7 100644
--- a/src/atl/ofi/atl_ofi.h
+++ b/src/atl/ofi/atl_ofi.h
@@ -46,19 +46,19 @@ class atl_ofi final : public iatl {
     atl_status_t atl_ep_send(atl_ep_t* ep,
                              const void* buf,
                              size_t len,
-                             size_t dst_proc_idx,
+                             int dst_proc_idx,
                              uint64_t tag,
                              atl_req_t* req) override;
 
     atl_status_t atl_ep_recv(atl_ep_t* ep,
                              void* buf,
                              size_t len,
-                             size_t src_proc_idx,
+                             int src_proc_idx,
                              uint64_t tag,
                              atl_req_t* req) override;
 
     atl_status_t atl_ep_probe(atl_ep_t* ep,
-                              size_t src_proc_idx,
+                              int src_proc_idx,
                               uint64_t tag,
                               int* found,
                               size_t* recv_len) override;
@@ -99,14 +99,14 @@ class atl_ofi final : public iatl {
     atl_status_t atl_ep_bcast(atl_ep_t* ep,
                               void* buf,
                               size_t len,
-                              size_t root,
+                              int root,
                               atl_req_t* req) override;
 
     atl_status_t atl_ep_reduce(atl_ep_t* ep,
                                const void* send_buf,
                                void* recv_buf,
                                size_t len,
-                               size_t root,
+                               int root,
                                atl_datatype_t dtype,
                                atl_reduction_t op,
                                atl_req_t* req) override;
@@ -125,7 +125,7 @@ class atl_ofi final : public iatl {
                              atl_mr_t* mr,
                              uint64_t addr,
                              uintptr_t remote_key,
-                             size_t dst_proc_idx,
+                             int dst_proc_idx,
                              atl_req_t* req) override;
 
     atl_status_t atl_ep_write(atl_ep_t* ep,
@@ -134,7 +134,7 @@ class atl_ofi final : public iatl {
                               atl_mr_t* mr,
                               uint64_t addr,
                               uintptr_t remote_key,
-                              size_t dst_proc_idx,
+                              int dst_proc_idx,
                               atl_req_t* req) override;
 
     atl_status_t atl_ep_wait(atl_ep_t* ep, atl_req_t* req) override;
@@ -149,7 +149,12 @@ class atl_ofi final : public iatl {
 
     atl_status_t atl_finalize() override;
 
+    bool is_inited() override {
+        return inited;
+    }
+
 private:
     atl_ctx_t* ctx = nullptr;
     bool is_finalized{ false };
+    bool inited{ false };
 };
diff --git a/src/atl/util/pm/pm_rt.h b/src/atl/util/pm/pm_rt.h
index 6f4f8facc..8057df881 100644
--- a/src/atl/util/pm/pm_rt.h
+++ b/src/atl/util/pm/pm_rt.h
@@ -13,180 +13,180 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#ifndef PM_RT_H
-#define PM_RT_H
-
-#include "atl_def.h"
-
-#define PM_TYPE "CCL_PM_TYPE"
-
-#define PM_RT_VAL_SIMPLE    "simple"
-#define PM_RT_VAL_RESIZABLE "resizable"
-
-typedef struct pm_rt_desc pm_rt_desc_t;
-
-//typedef enum pm_rt_type {
-//    PM_RT_SIMPLE = 0,
-//    PM_RT_RESIZABLE = 1,
-//} pm_rt_type_t;
-//
-//static pm_rt_type_t type = PM_RT_SIMPLE;
-
-typedef struct pm_rt_ops {
-    void (*finalize)(pm_rt_desc_t *pmrt_desc);
-    void (*barrier)(pm_rt_desc_t *pmrt_desc);
-    atl_status_t (*update)(size_t *proc_idx, size_t *proc_count);
-    atl_status_t (*wait_notification)(void);
-} pm_rt_ops_t;
-
-typedef struct pm_rt_kvs_ops {
-    atl_status_t (*put)(pm_rt_desc_t *pmrt_desc,
-                        char *kvs_key,
-                        size_t proc_idx,
-                        const void *kvs_val,
-                        size_t kvs_val_len);
-    atl_status_t (*get)(pm_rt_desc_t *pmrt_desc,
-                        char *kvs_key,
-                        size_t proc_idx,
-                        void *kvs_val,
-                        size_t kvs_val_len);
-} pm_rt_kvs_ops_t;
-
-struct pm_rt_desc {
-    pm_rt_ops_t *ops;
-    pm_rt_kvs_ops_t *kvs_ops;
-};
-
-#if 0
-/* PMI RT */
-atl_status_t pmirt_init(size_t *proc_idx, size_t *procs_num, pm_rt_desc_t **pmrt_desc);
-atl_status_t resizable_pmirt_init(size_t *proc_idx,
-                                  size_t *proc_count,
-                                  pm_rt_desc_t **pmrt_desc,
-                                  const char *main_addr);
-atl_status_t resizable_pmirt_set_resize_function(atl_resize_fn_t resize_fn);
-atl_status_t resizable_pmirt_main_addr_reserv(char *main_addr);
-
-
-static inline int is_pm_resize_enabled() {
-    if (type == PM_RT_RESIZABLE)
-        return 1;
-    return 0;
-}
-
-static inline atl_status_t pmrt_init(size_t *proc_idx,
-                                     size_t *procs_num,
-                                     pm_rt_desc_t **pmrt_desc,
-                                     const char *main_addr) {
-    char *type_str = getenv(PM_TYPE);
-
-    if (type_str) {
-        if (strstr(type_str, PM_RT_VAL_SIMPLE)) {
-            type = PM_RT_SIMPLE;
-        }
-        else if (strstr(type_str, PM_RT_VAL_RESIZABLE)) {
-            type = PM_RT_RESIZABLE;
-        }
-        else {
-            printf("Unknown %s: %s\n", PM_TYPE, type_str);
-            return ATL_STATUS_FAILURE;
-        }
-    }
-
-    switch (type) {
-        case PM_RT_SIMPLE: return pmirt_init(proc_idx, procs_num, pmrt_desc);
-        case PM_RT_RESIZABLE:
-            return resizable_pmirt_init(proc_idx, procs_num, pmrt_desc, main_addr);
-        default: printf("Wrong CCL_PM_TYPE: %s", type_str); return ATL_STATUS_FAILURE;
-    }
-}
-
-static inline atl_status_t pmrt_main_addr_reserv(char *main_addr) {
-    return resizable_pmirt_main_addr_reserv(main_addr);
-}
-
-static inline atl_status_t pmrt_set_resize_function(atl_resize_fn_t user_checker) {
-    switch (type) {
-        case PM_RT_RESIZABLE: return resizable_pmirt_set_resize_function(user_checker);
-        default: return ATL_STATUS_SUCCESS;
-    }
-}
-static inline atl_status_t pmrt_update(size_t *proc_idx,
-                                       size_t *proc_count,
-                                       pm_rt_desc_t *pmrt_desc) {
-    return pmrt_desc->ops->update(proc_idx, proc_count);
-}
-static inline atl_status_t pmrt_wait_notification(pm_rt_desc_t *pmrt_desc) {
-    return pmrt_desc->ops->wait_notification();
-}
-static inline void pmrt_finalize(pm_rt_desc_t *pmrt_desc) {
-    pmrt_desc->ops->finalize(pmrt_desc);
-}
-static inline void pmrt_barrier(pm_rt_desc_t *pmrt_desc) {
-    pmrt_desc->ops->barrier(pmrt_desc);
-}
-
-static inline atl_status_t pmrt_kvs_put(pm_rt_desc_t *pmrt_desc,
-                                        char *kvs_key,
-                                        size_t proc_idx,
-                                        const void *kvs_val,
-                                        size_t kvs_val_len) {
-    return pmrt_desc->kvs_ops->put(pmrt_desc, kvs_key, proc_idx, kvs_val, kvs_val_len);
-}
-
-static inline atl_status_t pmrt_kvs_get(pm_rt_desc_t *pmrt_desc,
-                                        char *kvs_key,
-                                        size_t proc_idx,
-                                        void *kvs_val,
-                                        size_t kvs_val_len) {
-    return pmrt_desc->kvs_ops->get(pmrt_desc, kvs_key, proc_idx, kvs_val, kvs_val_len);
-}
-
-}
-#endif
-
-#ifdef __cplusplus
-class ipmi {
-public:
-    virtual ~ipmi() = default;
-
-    virtual int is_pm_resize_enabled() = 0;
-
-    virtual atl_status_t pmrt_main_addr_reserv(char *main_addr) = 0;
-
-    virtual atl_status_t pmrt_set_resize_function(atl_resize_fn_t resize_fn) = 0;
-
-    virtual atl_status_t pmrt_update() = 0;
-
-    virtual atl_status_t pmrt_wait_notification() = 0;
-
-    virtual void pmrt_finalize() = 0;
-
-    virtual void pmrt_barrier() = 0;
-
-    virtual atl_status_t pmrt_kvs_put(char *kvs_key,
-                                      size_t proc_idx,
-                                      const void *kvs_val,
-                                      size_t kvs_val_len) = 0;
-
-    virtual atl_status_t pmrt_kvs_get(char *kvs_key,
-                                      size_t proc_idx,
-                                      void *kvs_val,
-                                      size_t kvs_val_len) = 0;
-
-    virtual size_t get_rank() = 0;
-
-    virtual size_t get_size() = 0;
-
-    virtual size_t get_thread() = 0;
-
-    virtual size_t get_local_kvs_id() = 0;
-
-    virtual void set_local_kvs_id(size_t local_kvs_id) = 0;
-
-    virtual size_t get_threads_count() = 0;
-
-    virtual size_t get_devices_per_rank_count() = 0;
-};
-#endif
-#endif /* PM_RT_H */
+#ifndef PM_RT_H
+#define PM_RT_H
+
+#include "atl_def.h"
+
+#define PM_TYPE "CCL_PM_TYPE"
+
+#define PM_RT_VAL_SIMPLE    "simple"
+#define PM_RT_VAL_RESIZABLE "resizable"
+
+typedef struct pm_rt_desc pm_rt_desc_t;
+
+//typedef enum pm_rt_type {
+//    PM_RT_SIMPLE = 0,
+//    PM_RT_RESIZABLE = 1,
+//} pm_rt_type_t;
+//
+//static pm_rt_type_t type = PM_RT_SIMPLE;
+
+typedef struct pm_rt_ops {
+    void (*finalize)(pm_rt_desc_t *pmrt_desc);
+    void (*barrier)(pm_rt_desc_t *pmrt_desc);
+    atl_status_t (*update)(int *proc_idx, int *proc_count);
+    atl_status_t (*wait_notification)(void);
+} pm_rt_ops_t;
+
+typedef struct pm_rt_kvs_ops {
+    atl_status_t (*put)(pm_rt_desc_t *pmrt_desc,
+                        char *kvs_key,
+                        int proc_idx,
+                        const void *kvs_val,
+                        size_t kvs_val_len);
+    atl_status_t (*get)(pm_rt_desc_t *pmrt_desc,
+                        char *kvs_key,
+                        int proc_idx,
+                        void *kvs_val,
+                        size_t kvs_val_len);
+} pm_rt_kvs_ops_t;
+
+struct pm_rt_desc {
+    pm_rt_ops_t *ops;
+    pm_rt_kvs_ops_t *kvs_ops;
+};
+
+#if 0
+/* PMI RT */
+atl_status_t pmirt_init(int *proc_idx, int *procs_num, pm_rt_desc_t **pmrt_desc);
+atl_status_t resizable_pmirt_init(int *proc_idx,
+                                  int *proc_count,
+                                  pm_rt_desc_t **pmrt_desc,
+                                  const char *main_addr);
+atl_status_t resizable_pmirt_set_resize_function(atl_resize_fn_t resize_fn);
+atl_status_t resizable_pmirt_main_addr_reserve(char *main_addr);
+
+
+static inline int is_pm_resize_enabled() {
+    if (type == PM_RT_RESIZABLE)
+        return 1;
+    return 0;
+}
+
+static inline atl_status_t pmrt_init(int *proc_idx,
+                                     int *procs_num,
+                                     pm_rt_desc_t **pmrt_desc,
+                                     const char *main_addr) {
+    char *type_str = getenv(PM_TYPE);
+
+    if (type_str) {
+        if (strstr(type_str, PM_RT_VAL_SIMPLE)) {
+            type = PM_RT_SIMPLE;
+        }
+        else if (strstr(type_str, PM_RT_VAL_RESIZABLE)) {
+            type = PM_RT_RESIZABLE;
+        }
+        else {
+            printf("Unknown %s: %s\n", PM_TYPE, type_str);
+            return ATL_STATUS_FAILURE;
+        }
+    }
+
+    switch (type) {
+        case PM_RT_SIMPLE: return pmirt_init(proc_idx, procs_num, pmrt_desc);
+        case PM_RT_RESIZABLE:
+            return resizable_pmirt_init(proc_idx, procs_num, pmrt_desc, main_addr);
+        default: printf("Wrong CCL_PM_TYPE: %s", type_str); return ATL_STATUS_FAILURE;
+    }
+}
+
+static inline atl_status_t pmrt_main_addr_reserve(char *main_addr) {
+    return resizable_pmirt_main_addr_reserve(main_addr);
+}
+
+static inline atl_status_t pmrt_set_resize_function(atl_resize_fn_t user_checker) {
+    switch (type) {
+        case PM_RT_RESIZABLE: return resizable_pmirt_set_resize_function(user_checker);
+        default: return ATL_STATUS_SUCCESS;
+    }
+}
+static inline atl_status_t pmrt_update(int *proc_idx,
+                                       int *proc_count,
+                                       pm_rt_desc_t *pmrt_desc) {
+    return pmrt_desc->ops->update(proc_idx, proc_count);
+}
+static inline atl_status_t pmrt_wait_notification(pm_rt_desc_t *pmrt_desc) {
+    return pmrt_desc->ops->wait_notification();
+}
+static inline void pmrt_finalize(pm_rt_desc_t *pmrt_desc) {
+    pmrt_desc->ops->finalize(pmrt_desc);
+}
+static inline void pmrt_barrier(pm_rt_desc_t *pmrt_desc) {
+    pmrt_desc->ops->barrier(pmrt_desc);
+}
+
+static inline atl_status_t pmrt_kvs_put(pm_rt_desc_t *pmrt_desc,
+                                        char *kvs_key,
+                                        int proc_idx,
+                                        const void *kvs_val,
+                                        size_t kvs_val_len) {
+    return pmrt_desc->kvs_ops->put(pmrt_desc, kvs_key, proc_idx, kvs_val, kvs_val_len);
+}
+
+static inline atl_status_t pmrt_kvs_get(pm_rt_desc_t *pmrt_desc,
+                                        char *kvs_key,
+                                        int proc_idx,
+                                        void *kvs_val,
+                                        size_t kvs_val_len) {
+    return pmrt_desc->kvs_ops->get(pmrt_desc, kvs_key, proc_idx, kvs_val, kvs_val_len);
+}
+
+}
+#endif
+
+#ifdef __cplusplus
+class ipmi {
+public:
+    virtual ~ipmi() = default;
+
+    virtual int is_pm_resize_enabled() = 0;
+
+    virtual atl_status_t pmrt_main_addr_reserve(char *main_addr) = 0;
+
+    virtual atl_status_t pmrt_set_resize_function(atl_resize_fn_t resize_fn) = 0;
+
+    virtual atl_status_t pmrt_update() = 0;
+
+    virtual atl_status_t pmrt_wait_notification() = 0;
+
+    virtual void pmrt_finalize() = 0;
+
+    virtual void pmrt_barrier() = 0;
+
+    virtual atl_status_t pmrt_kvs_put(char *kvs_key,
+                                      int proc_idx,
+                                      const void *kvs_val,
+                                      size_t kvs_val_len) = 0;
+
+    virtual atl_status_t pmrt_kvs_get(char *kvs_key,
+                                      int proc_idx,
+                                      void *kvs_val,
+                                      size_t kvs_val_len) = 0;
+
+    virtual int get_rank() = 0;
+
+    virtual int get_size() = 0;
+
+    virtual size_t get_local_thread_idx() = 0;
+
+    virtual size_t get_local_kvs_id() = 0;
+
+    virtual void set_local_kvs_id(size_t local_kvs_id) = 0;
+
+    virtual size_t get_threads_per_process() = 0;
+
+    virtual size_t get_ranks_per_process() = 0;
+};
+#endif
+#endif /* PM_RT_H */
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp
index 3083d1bf8..ec1f9ba30 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.cpp
@@ -20,7 +20,7 @@
 #include "util/pm/codec/pm_rt_codec.h"
 #include "pmi_resizable.h"
 
-#define RESIZABLE_PMI_RT_KEY_FORMAT "%s-%zu"
+#define RESIZABLE_PMI_RT_KEY_FORMAT "%s-%d"
 
 int pmi_resizable::is_pm_resize_enabled() {
     return true;
@@ -85,8 +85,8 @@ atl_status_t pmi_resizable::pmrt_init(const char *main_addr) {
     return ATL_STATUS_FAILURE;
 }
 
-atl_status_t pmi_resizable::pmrt_main_addr_reserv(char *main_addr) {
-    int ret = PMIR_Main_Addr_Reserv(main_addr);
+atl_status_t pmi_resizable::pmrt_main_addr_reserve(char *main_addr) {
+    int ret = PMIR_Main_Addr_Reserve(main_addr);
 
     if (ret)
         return ATL_STATUS_FAILURE;
@@ -155,7 +155,7 @@ void pmi_resizable::pmrt_barrier() {
 }
 
 atl_status_t pmi_resizable::pmrt_kvs_put(char *kvs_key,
-                                         size_t proc_idx,
+                                         int proc_idx,
                                          const void *kvs_val,
                                          size_t kvs_val_len) {
     int ret;
@@ -186,7 +186,7 @@ atl_status_t pmi_resizable::pmrt_kvs_put(char *kvs_key,
 }
 
 atl_status_t pmi_resizable::pmrt_kvs_get(char *kvs_key,
-                                         size_t proc_idx,
+                                         int proc_idx,
                                          void *kvs_val,
                                          size_t kvs_val_len) {
     int ret;
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h
index 9a518f31d..8ec4023d0 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h
@@ -18,7 +18,7 @@
 #include "atl/atl_def.h"
 #include "atl/util/pm/pm_rt.h"
 #include "atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h"
-#include "atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.h"
+#include "atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp"
 #include "atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp"
 
 #define PMIR_SUCCESS                0
@@ -42,7 +42,7 @@ typedef enum {
     KVS_RA_RUN = 1,
     KVS_RA_FINALIZE = 2,
 } kvs_resize_action_t;
-typedef kvs_resize_action_t (*pmir_resize_fn_t)(size_t comm_size);
+typedef kvs_resize_action_t (*pmir_resize_fn_t)(int comm_size);
 
 class helper;
 class pmi_resizable final : public ipmi {
@@ -58,7 +58,7 @@ class pmi_resizable final : public ipmi {
 
     int is_pm_resize_enabled() override;
 
-    atl_status_t pmrt_main_addr_reserv(char* main_addr) override;
+    atl_status_t pmrt_main_addr_reserve(char* main_addr) override;
 
     atl_status_t pmrt_set_resize_function(atl_resize_fn_t resize_fn) override;
 
@@ -69,32 +69,32 @@ class pmi_resizable final : public ipmi {
     void pmrt_barrier() override;
 
     atl_status_t pmrt_kvs_put(char* kvs_key,
-                              size_t proc_idx,
+                              int proc_idx,
                               const void* kvs_val,
                               size_t kvs_val_len) override;
 
     atl_status_t pmrt_kvs_get(char* kvs_key,
-                              size_t proc_idx,
+                              int proc_idx,
                               void* kvs_val,
                               size_t kvs_val_len) override;
 
     void Hard_finilize(int sig);
 
-    size_t get_rank() override;
+    int get_rank() override;
 
-    size_t get_size() override;
+    int get_size() override;
 
-    size_t get_thread() override;
+    size_t get_local_thread_idx() override;
 
     size_t get_local_kvs_id() override;
 
     void set_local_kvs_id(size_t local_kvs_id) override;
 
-    size_t get_threads_count() override {
+    size_t get_threads_per_process() override {
         return 1;
     }
 
-    size_t get_devices_per_rank_count() override {
+    size_t get_ranks_per_process() override {
         return 1;
     }
     void pmrt_finalize() override;
@@ -103,15 +103,15 @@ class pmi_resizable final : public ipmi {
     bool is_finalized{ false };
     atl_status_t pmrt_init(const char* main_addr = nullptr);
     /*Was in API ->*/
-    int PMIR_Main_Addr_Reserv(char* main_addr);
+    int PMIR_Main_Addr_Reserve(char* main_addr);
 
     int PMIR_Init(const char* main_addr);
 
     int PMIR_Finalize(void);
 
-    int PMIR_Get_size(size_t* size);
+    int PMIR_Get_size(int* size);
 
-    int PMIR_Get_rank(size_t* rank);
+    int PMIR_Get_rank(int* rank);
 
     int PMIR_KVS_Get_my_name(char* kvs_name, size_t length);
 
@@ -135,10 +135,11 @@ class pmi_resizable final : public ipmi {
 
     int PMIR_Wait_notification(void);
     /* <- Was in API*/
-    kvs_resize_action_t default_checker(size_t comm_size);
-    kvs_resize_action_t call_resize_fn(size_t comm_size);
-    size_t rank;
-    size_t size;
+    kvs_resize_action_t default_checker(int comm_size);
+    kvs_resize_action_t call_resize_fn(int comm_size);
+
+    int rank;
+    int size;
 
     pmir_resize_fn_t resize_function = nullptr;
     std::shared_ptr<helper> h;
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h
index 5bfde612c..564a309f7 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h
@@ -13,16 +13,13 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#ifndef DEF_INCLUDED
-#define DEF_INCLUDED
-
-#include <string.h>
+#pragma once
 
 //TODO: change exit to something more useful
 #define SET_STR(dst, size, ...) \
     do { \
         if (snprintf(dst, size, __VA_ARGS__) > size) { \
-            printf("Line so big (must be low %d)\n", size); \
+            printf("line too long (must be shorter %d)\n", size); \
             printf(__VA_ARGS__); \
             exit(1); \
         } \
@@ -37,41 +34,65 @@
         } \
     } while (0)
 
-#define DO_RW_OP(op, fd, buf, size) \
+#define DO_RW_OP(op, fd, buf, size, memory_mutex, msg) \
     do { \
-        ssize_t res = 0; \
-        size_t shift = 0; \
-        while (shift != size) { \
-            res = op(fd, (char*)buf + shift, size - shift); \
-            if (res == -1) { \
-                if (errno != EINTR) { \
-                    printf("read/write error: %s\n", strerror(errno)); \
+        { \
+            if (!fd) { \
+                printf("" #msg ": " #op ": fd is closed, size %zu\n", size); \
+                break; \
+            } \
+            std::lock_guard<std::mutex> lock(memory_mutex); \
+            ssize_t res = 0; \
+            size_t shift = 0; \
+            while (shift != size) { \
+                res = op(fd, (char*)buf + shift, size - shift); \
+                if (res == -1) { \
+                    if (errno != EINTR) { \
+                        printf("" #msg ": " #op ": error: buf %p, size %zu, shift %zu\n", \
+                               buf, \
+                               size, \
+                               shift); \
+                        perror("read/write error"); \
+                        exit(EXIT_FAILURE); \
+                    } \
+                } \
+                else if (res == 0) { \
+                    printf("" #msg ": " #op ": can not process all data, size %zu, shift %zu\n", \
+                           size, \
+                           shift); \
                     exit(EXIT_FAILURE); \
                 } \
-            } \
-            else { \
-                shift += res; \
+                else { \
+                    shift += res; \
+                } \
             } \
         } \
     } while (0)
 
-
-#define DO_RW_OP_1(op, fd, buf, size, res) \
+#define DO_RW_OP_1(op, fd, buf, size, res, msg) \
     do { \
+        if (!fd) { \
+            printf("" #msg ": " #op ": fd is closed, size %zu\n", size); \
+            break; \
+        } \
         size_t shift = 0; \
         res = 0; \
         do { \
             res = op(fd, (char*)buf + shift, size - shift); \
             if (res == -1) { \
                 if (errno != EINTR) { \
-                    printf("read/write error: %s\n", strerror(errno)); \
+                    printf("" #msg ": " #op ": error: buf %p, size %zu, shift %zu\n", \
+                           buf, \
+                           size, \
+                           shift); \
+                    perror("read/write error"); \
                     exit(EXIT_FAILURE); \
                 } \
-            }\
+            } \
             else { \
                 shift += res; \
             } \
-        } while (shift != size && res != 0); \
+        } while ((shift != size) && (res != 0)); \
     } while (0)
 
 #define BARRIER_NUM_MAX         1024
@@ -92,10 +113,12 @@
 #define GREP_TEMPLATE               "| grep \"%s\""
 #define GREP_COUNT_TEMPLATE         "| grep -c \"%s\""
 #define CONCAT_TWO_COMMAND_TEMPLATE "%s %s"
+#define RANK_TEMPLATE               "%d"
 #define SIZE_T_TEMPLATE             "%zu"
 
-#define KVS_NAME    "CCL_POD_ADDR"
-#define KVS_BARRIER "CCL_BARRIER"
+#define KVS_NAME         "CCL_POD_ADDR"
+#define KVS_BARRIER      "CCL_BARRIER"
+#define KVS_BARRIER_FULL "CCL_BARRIER_FULL"
 
 #define KVS_IDX               "IDX"
 #define KVS_UP                "CCL_UP"
@@ -109,7 +132,7 @@
 
 #define CCL_IP_LEN 128
 
-#define CHECKER_IP         "hostname -I"
+#define GET_IP_CMD         "hostname -I"
 #define READ_ONLY          "r"
 #define NULL_CHAR          '\0'
 #define MAX_UP_IDX         2048
@@ -117,8 +140,4 @@
 #define INITIAL_RANK_NUM   "0"
 #define MAX_CLEAN_CHECKS   3
 
-#define STR_COPY(dst, src, len) { memcpy((dst), (src), (len-1)); dst[len - 1] = '\0'; }
-
 extern char my_hostname[MAX_KVS_VAL_LENGTH];
-
-#endif
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp
index e7ad2365b..c2a0c67fd 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp
@@ -13,46 +13,55 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "helper.h"
+#include <string.h>
+
+#include "util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp"
 #include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h"
 
-size_t my_rank, count_pods;
+int my_rank, count_pods;
 size_t barrier_num = 0;
 size_t up_idx;
 size_t applied = 0;
 
 rank_list_t* killed_ranks = NULL;
-size_t killed_ranks_count = 0;
+int killed_ranks_count = 0;
 
 rank_list_t* new_ranks = NULL;
-size_t new_ranks_count = 0;
+int new_ranks_count = 0;
+
+void kvs_str_copy(char* dst, const char* src, size_t bytes) {
+    strncpy(dst, src, bytes - 1);
+    dst[bytes - 1] = '\0';
+}
+
+size_t helper::replace_str(char* str, int old_rank, int new_rank) {
+    throw std::runtime_error("unexpected path");
 
-size_t helper::replace_str(char* str, size_t old_rank, size_t new_rank) {
     char old_str[INT_STR_SIZE];
     char new_str[INT_STR_SIZE];
     char* point_to_replace;
-    size_t old_str_size;
-    size_t new_str_size;
-
-    SET_STR(old_str, INT_STR_SIZE, SIZE_T_TEMPLATE, old_rank);
+    int old_str_size;
+    int new_str_size;
 
-    SET_STR(new_str, INT_STR_SIZE, SIZE_T_TEMPLATE, new_rank);
+    SET_STR(old_str, INT_STR_SIZE, RANK_TEMPLATE, old_rank);
+    SET_STR(new_str, INT_STR_SIZE, RANK_TEMPLATE, new_rank);
 
     point_to_replace = strstr(str, old_str);
     if (point_to_replace == NULL)
         return 1;
+
     old_str_size = strlen(old_str);
     new_str_size = strlen(new_str);
 
     if (old_str_size != new_str_size) {
-        size_t rest_len = strlen(point_to_replace);
+        size_t rest_len = strlen(point_to_replace) - old_str_size;
         memmove(point_to_replace + new_str_size, point_to_replace + old_str_size, rest_len);
     }
-    STR_COPY(point_to_replace, new_str, new_str_size);
+    memcpy(point_to_replace, new_str, new_str_size);
     return 0;
 }
 
-void helper::update_ranks(size_t* old_count, rank_list_t** origin_list, const char* kvs_name) {
+void helper::update_ranks(int* old_count, rank_list_t** origin_list, const char* kvs_name) {
     char** rank_nums = NULL;
     size_t rank_count = get_keys_values_by_name(kvs_name, NULL, &rank_nums);
     size_t i;
@@ -79,7 +88,7 @@ void helper::update_ranks(size_t* old_count, rank_list_t** origin_list, const ch
     *old_count += cur_count;
 }
 
-void helper::keep_first_n_up(size_t prev_new_ranks_count, size_t prev_killed_ranks_count) {
+void helper::keep_first_n_up(int prev_new_ranks_count, int prev_killed_ranks_count) {
     rank_list_keep_first_n(&killed_ranks, prev_killed_ranks_count);
     rank_list_keep_first_n(&new_ranks, prev_new_ranks_count);
 }
@@ -90,8 +99,8 @@ void helper::get_update_ranks(void) {
 }
 
 void helper::get_shift(shift_list_t** list) {
-    size_t shift_pods_count = 0;
-    size_t max_rank_survivor_pod = count_pods;
+    int shift_pods_count = 0;
+    int max_rank_survivor_pod = count_pods;
     rank_list_t* cur_new = new_ranks;
     rank_list_t* cur_killed = killed_ranks;
 
@@ -184,8 +193,8 @@ void helper::accept_new_ranks(shift_list_t* cur_list) {
 
     while (cur_list != NULL) {
         if (cur_list->shift.type == CH_T_UPDATE || cur_list->shift.type == CH_T_NEW) {
-            SET_STR(old_rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, cur_list->shift.old_rank);
-            SET_STR(new_rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, cur_list->shift.new_rank);
+            SET_STR(old_rank_str, INT_STR_SIZE, RANK_TEMPLATE, cur_list->shift.old_rank);
+            SET_STR(new_rank_str, INT_STR_SIZE, RANK_TEMPLATE, cur_list->shift.new_rank);
 
             count_values = get_keys_values_by_name(KVS_APPROVED_NEW_POD, &kvs_keys, &kvs_values);
 
@@ -215,7 +224,7 @@ void helper::accept_new_ranks(shift_list_t* cur_list) {
         free(kvs_values);
 }
 
-void helper::update_kvs_info(size_t new_rank) {
+void helper::update_kvs_info(int new_rank) {
     char kvs_name[MAX_KVS_NAME_LENGTH];
     char kvs_key[MAX_KVS_KEY_LENGTH];
     char kvs_val[MAX_KVS_VAL_LENGTH];
@@ -235,13 +244,13 @@ void helper::update_kvs_info(size_t new_rank) {
     }
 }
 
-void helper::move_to_new_rank(size_t new_rank) {
+void helper::move_to_new_rank(int new_rank) {
     char rank_str[INT_STR_SIZE];
 
     update_kvs_info(new_rank);
     my_rank = new_rank;
 
-    SET_STR(rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, my_rank);
+    SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
     //    request_set_val(KVS_POD_REQUEST, my_hostname, rank_str);
 
@@ -253,10 +262,10 @@ void helper::update_my_info(shift_list_t* list) {
 
     while (list != NULL) {
         if (list->shift.old_rank == my_rank) {
-            size_t old_rank = my_rank;
+            int old_rank = my_rank;
             move_to_new_rank(list->shift.new_rank);
 
-            SET_STR(rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, old_rank);
+            SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, old_rank);
 
             remove_name_key(KVS_POD_NUM, rank_str);
 
@@ -296,7 +305,7 @@ void helper::post_my_info(void) {
 
     applied = 1;
 
-    SET_STR(my_rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, my_rank);
+    SET_STR(my_rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
     set_value(KVS_POD_NUM, my_rank_str, my_hostname);
 
@@ -315,7 +324,7 @@ void helper::post_my_info(void) {
         barrier_num = 0;
 }
 
-size_t helper::update(shift_list_t** list, rank_list_t** dead_up_idx, size_t root_rank) {
+size_t helper::update(shift_list_t** list, rank_list_t** dead_up_idx, int root_rank) {
     if (applied == 1) {
         if ((*list) != NULL) {
             if (my_rank == root_rank) {
@@ -409,7 +418,7 @@ void helper::reg_rank(void) {
     my_rank = 0;
     set_value(KVS_POD_REQUEST, my_hostname, INITIAL_RANK_NUM);
 
-    SET_STR(rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, my_rank);
+    SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
     while (1) {
         wait_shift = 0;
@@ -444,7 +453,7 @@ void helper::reg_rank(void) {
 
         if (!wait_shift) {
             my_rank++;
-            SET_STR(rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, my_rank);
+            SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
             set_value(KVS_POD_REQUEST, my_hostname, rank_str);
         }
     }
@@ -487,13 +496,13 @@ void helper::up_kvs_new_and_dead(void) {
     up_kvs(KVS_APPROVED_DEAD_POD, KVS_DEAD_POD);
 }
 
-void helper::get_new_root(size_t* old_root) {
+void helper::get_new_root(int* old_root) {
     size_t i;
     char** rank_nums = NULL;
     size_t rank_count = get_keys_values_by_name(KVS_DEAD_POD, NULL, &rank_nums);
 
     for (i = 0; i < rank_count; i++) {
-        if (*old_root == (size_t)strtol(rank_nums[i], NULL, 10))
+        if (*old_root == (int)strtol(rank_nums[i], NULL, 10))
             (*old_root)++;
         free(rank_nums[i]);
     }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp
similarity index 81%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.h
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp
index db61f4c8b..1dd5e4be9 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp
@@ -25,22 +25,25 @@
 #include <iostream>
 #include <memory>
 #include <utility>
+
 #include "def.h"
-#include "rank_list.h"
-#include "shift_list.h"
-#include "kvs_keeper.h"
+#include "rank_list.hpp"
+#include "shift_list.hpp"
+#include "kvs_keeper.hpp"
 #include "kvs/ikvs_wrapper.h"
 
-extern size_t my_rank, count_pods;
+extern int my_rank, count_pods;
 extern size_t barrier_num;
 extern size_t up_idx;
 extern size_t applied;
 
 extern rank_list_t* killed_ranks;
-extern size_t killed_ranks_count;
+extern int killed_ranks_count;
 
 extern rank_list_t* new_ranks;
-extern size_t new_ranks_count;
+extern int new_ranks_count;
+
+void kvs_str_copy(char* dst, const char* src, size_t bytes);
 
 class helper {
 public:
@@ -54,7 +57,7 @@ class helper {
 
     void wait_accept(void);
 
-    size_t update(shift_list_t** list, rank_list_t** dead_up_idx, size_t root_rank);
+    size_t update(shift_list_t** list, rank_list_t** dead_up_idx, int root_rank);
 
     void up_pods_count(void);
 
@@ -66,9 +69,9 @@ class helper {
 
     void up_kvs_new_and_dead(void);
 
-    void keep_first_n_up(size_t prev_new_ranks_count, size_t prev_killed_ranks_count);
+    void keep_first_n_up(int prev_new_ranks_count, int prev_killed_ranks_count);
 
-    void get_new_root(size_t* old_root);
+    void get_new_root(int* old_root);
 
     /*Work with KVS, new*/
     size_t set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val);
@@ -90,17 +93,17 @@ class helper {
     /*Work with KVS, new*/
 
 private:
-    size_t replace_str(char* str, size_t old_rank, size_t new_rank);
+    size_t replace_str(char* str, int old_rank, int new_rank);
 
-    void update_ranks(size_t* old_count, rank_list_t** origin_list, const char* kvs_name);
+    void update_ranks(int* old_count, rank_list_t** origin_list, const char* kvs_name);
 
     void clean_dead_pods_info(rank_list_t* dead_up_idx);
 
     void accept_new_ranks(shift_list_t* cur_list);
 
-    void update_kvs_info(size_t new_rank);
+    void update_kvs_info(int new_rank);
 
-    void move_to_new_rank(size_t new_rank);
+    void move_to_new_rank(int new_rank);
 
     void update_my_info(shift_list_t* list);
 
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h
index 26770913c..6f68a78a6 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/ikvs_wrapper.h
@@ -14,6 +14,7 @@
  limitations under the License.
 */
 #pragma once
+
 #include <unistd.h>
 
 class ikvs_wrapper {
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp
index cceca2e6d..e0cb5e518 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp
@@ -16,6 +16,7 @@
 #include <arpa/inet.h>
 #include <errno.h>
 #include <netinet/in.h>
+#include <mutex>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -24,10 +25,11 @@
 #include <time.h>
 #include <unistd.h>
 
+#include "util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp"
 #include "util/pm/pmi_resizable_rt/pmi_resizable/def.h"
 #include "internal_kvs.h"
-#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.h"
-#include "util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.h"
+#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.hpp"
+#include "util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.hpp"
 
 #define CCL_KVS_IP_PORT_ENV         "CCL_KVS_IP_PORT"
 #define CCL_KVS_IP_EXCHANGE_ENV     "CCL_KVS_IP_EXCHANGE"
@@ -37,13 +39,27 @@
 #define MAX_CLIENT_COUNT   300
 #define CONNECTION_TIMEOUT 120
 
-static pthread_t thread = 0;
+static pthread_t kvs_thread = 0;
+
 static char main_host_ip[CCL_IP_LEN];
 char local_host_ip[CCL_IP_LEN];
-static int sock_listener = 0;
+
 static size_t main_port;
 static size_t local_port;
 static size_t is_master = 0;
+static std::mutex client_memory_mutex;
+static std::mutex server_memory_mutex;
+
+static struct sockaddr_in main_server_address;
+static struct sockaddr_in local_server_address;
+
+static int
+    client_op_sock; /* used on client side to send commands and to recv result to/from server */
+static int
+    server_listen_sock; /* used on server side to handle new incoming connect requests from clients */
+
+static int client_control_sock; /* used on client side to control local kvs server */
+static int server_control_sock; /* used on server side to be controlled by local client */
 
 typedef enum ip_getting_type {
     IGT_K8S = 0,
@@ -54,7 +70,7 @@ static ip_getting_type_t ip_getting_mode = IGT_K8S;
 
 typedef enum kvs_access_mode {
     AM_CONNECT = -1,
-//    AM_DISCONNECT = 1,
+    //    AM_DISCONNECT = 1,
     AM_PUT = 2,
     AM_REMOVE = 3,
     AM_GET_COUNT = 4,
@@ -71,19 +87,20 @@ typedef struct kvs_request {
     char val[MAX_KVS_VAL_LENGTH];
 } kvs_request_t;
 
-static struct sockaddr_in main_server_address;
-static struct sockaddr_in local_server_address;
-static int sock_sender, local_sock, accepted_local_sock;
-
 size_t internal_kvs::kvs_set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
     kvs_request_t request;
     memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_PUT;
-    STR_COPY(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
-    STR_COPY(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
-    STR_COPY(request.val, kvs_val, MAX_KVS_VAL_LENGTH);
+    kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
+    kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
+    kvs_str_copy(request.val, kvs_val, MAX_KVS_VAL_LENGTH);
 
-    DO_RW_OP(write, sock_sender, &request, sizeof(kvs_request_t));
+    DO_RW_OP(write,
+             client_op_sock,
+             &request,
+             sizeof(kvs_request_t),
+             client_memory_mutex,
+             "client: put_key_value");
 
     return 0;
 }
@@ -92,10 +109,15 @@ size_t internal_kvs::kvs_remove_name_key(const char* kvs_name, const char* kvs_k
     kvs_request_t request;
     memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_REMOVE;
-    STR_COPY(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
-    STR_COPY(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
+    kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
+    kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
 
-    DO_RW_OP(write, sock_sender, &request, sizeof(kvs_request_t));
+    DO_RW_OP(write,
+             client_op_sock,
+             &request,
+             sizeof(kvs_request_t),
+             client_memory_mutex,
+             "client: remove_key");
 
     return 0;
 }
@@ -107,16 +129,28 @@ size_t internal_kvs::kvs_get_value_by_name_key(const char* kvs_name,
     memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_GET_VAL;
     size_t is_exist = 0;
-    STR_COPY(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
-    STR_COPY(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
+    kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
+    kvs_str_copy(request.key, kvs_key, MAX_KVS_KEY_LENGTH);
     memset(kvs_val, 0, MAX_KVS_VAL_LENGTH);
 
-    DO_RW_OP(write, sock_sender, &request, sizeof(kvs_request_t));
+    DO_RW_OP(
+        write, client_op_sock, &request, sizeof(request), client_memory_mutex, "client: get_value");
+
+    DO_RW_OP(read,
+             client_op_sock,
+             &is_exist,
+             sizeof(is_exist),
+             client_memory_mutex,
+             "client: get_value is_exist");
 
-    DO_RW_OP(read, sock_sender, &is_exist, sizeof(size_t));
     if (is_exist) {
-        DO_RW_OP(read, sock_sender, &request, sizeof(kvs_request_t));
-        STR_COPY(kvs_val, request.val, MAX_KVS_VAL_LENGTH);
+        DO_RW_OP(read,
+                 client_op_sock,
+                 &request,
+                 sizeof(request),
+                 client_memory_mutex,
+                 "client: get_value read data");
+        kvs_str_copy(kvs_val, request.val, MAX_KVS_VAL_LENGTH);
     }
 
     return strlen(kvs_val);
@@ -127,11 +161,21 @@ size_t internal_kvs::kvs_get_count_names(const char* kvs_name) {
     kvs_request_t request;
     memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_GET_COUNT;
-    STR_COPY(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
-
-    DO_RW_OP(write, sock_sender, &request, sizeof(kvs_request_t));
-
-    DO_RW_OP(read, sock_sender, &count_names, sizeof(size_t));
+    kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
+
+    DO_RW_OP(write,
+             client_op_sock,
+             &request,
+             sizeof(kvs_request_t),
+             client_memory_mutex,
+             "client: get_count");
+
+    DO_RW_OP(read,
+             client_op_sock,
+             &count_names,
+             sizeof(size_t),
+             client_memory_mutex,
+             "client: get_count read data");
 
     return count_names;
 }
@@ -146,26 +190,44 @@ size_t internal_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
 
     memset(&request, 0, sizeof(kvs_request_t));
     request.mode = AM_GET_KEYS_VALUES;
-    STR_COPY(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
-
-    DO_RW_OP(write, sock_sender, &request, sizeof(kvs_request_t));
-
-    DO_RW_OP(read, sock_sender, &count, sizeof(size_t));
+    kvs_str_copy(request.name, kvs_name, MAX_KVS_NAME_LENGTH);
+
+    DO_RW_OP(write,
+             client_op_sock,
+             &request,
+             sizeof(kvs_request_t),
+             client_memory_mutex,
+             "client: get_keys_values");
+
+    DO_RW_OP(read,
+             client_op_sock,
+             &count,
+             sizeof(size_t),
+             client_memory_mutex,
+             "client: get_keys_values read size");
 
     if (count == 0)
         return count;
 
     answers = (kvs_request_t*)calloc(count, sizeof(kvs_request_t));
-
-    DO_RW_OP(read, sock_sender, answers, sizeof(kvs_request_t) * count);
+    DO_RW_OP(read,
+             client_op_sock,
+             answers,
+             sizeof(kvs_request_t) * count,
+             client_memory_mutex,
+             "client: get_keys_values read data");
     if (kvs_keys != NULL) {
         if (*kvs_keys != NULL)
             free(*kvs_keys);
 
         *kvs_keys = (char**)calloc(count, sizeof(char*));
+        if ((*kvs_keys) == NULL) {
+            printf("Memory allocation failed\n");
+            exit(1);
+        }
         for (i = 0; i < count; i++) {
             (*kvs_keys)[i] = (char*)calloc(MAX_KVS_KEY_LENGTH, sizeof(char));
-            STR_COPY((*kvs_keys)[i], answers[i].key, MAX_KVS_KEY_LENGTH);
+            kvs_str_copy((*kvs_keys)[i], answers[i].key, MAX_KVS_KEY_LENGTH);
         }
     }
     if (kvs_values != NULL) {
@@ -173,9 +235,13 @@ size_t internal_kvs::kvs_get_keys_values_by_name(const char* kvs_name,
             free(*kvs_values);
 
         *kvs_values = (char**)calloc(count, sizeof(char*));
+        if ((*kvs_values) == NULL) {
+            printf("Memory allocation failed\n");
+            exit(1);
+        }
         for (i = 0; i < count; i++) {
             (*kvs_values)[i] = (char*)calloc(MAX_KVS_VAL_LENGTH, sizeof(char));
-            STR_COPY((*kvs_values)[i], answers[i].val, MAX_KVS_VAL_LENGTH);
+            kvs_str_copy((*kvs_values)[i], answers[i].val, MAX_KVS_VAL_LENGTH);
         }
     }
 
@@ -194,40 +260,51 @@ size_t internal_kvs::kvs_get_replica_size(void) {
         memset(&request, 0, sizeof(kvs_request_t));
         request.mode = AM_GET_REPLICA;
 
-        DO_RW_OP(write, sock_sender, &request, sizeof(kvs_request_t));
+        DO_RW_OP(write,
+                 client_op_sock,
+                 &request,
+                 sizeof(kvs_request_t),
+                 client_memory_mutex,
+                 "client: get_replica");
 
-        DO_RW_OP(read, sock_sender, &replica_size, sizeof(size_t));
+        DO_RW_OP(read,
+                 client_op_sock,
+                 &replica_size,
+                 sizeof(size_t),
+                 client_memory_mutex,
+                 "client: get_replica read size");
     }
     return replica_size;
 }
 
 void* kvs_server_init(void* args) {
     struct sockaddr_in addr;
-    int local_sock;
+    int server_control_sock;
     kvs_request_t request;
     size_t count;
-    size_t clients_count = 0;
-    int is_stop = 0;
+    size_t client_count = 0;
+    int should_stop = 0;
     fd_set read_fds;
-    int i, client_socket[MAX_CLIENT_COUNT], max_sd, sd;
+    int i, client_op_sockets[MAX_CLIENT_COUNT], max_sd, sd;
     int so_reuse = 1;
     int ret = 0;
+
 #ifdef SO_REUSEPORT
-    setsockopt(sock_listener, SOL_SOCKET, SO_REUSEPORT, &so_reuse, sizeof(so_reuse));
+    setsockopt(server_listen_sock, SOL_SOCKET, SO_REUSEPORT, &so_reuse, sizeof(so_reuse));
 #else
-    setsockopt(sock_listener, SOL_SOCKET, SO_REUSEADDR, &so_reuse, sizeof(so_reuse));
+    setsockopt(server_listen_sock, SOL_SOCKET, SO_REUSEADDR, &so_reuse, sizeof(so_reuse));
 #endif
 
     for (i = 0; i < MAX_CLIENT_COUNT; i++) {
-        client_socket[i] = 0;
+        client_op_sockets[i] = 0;
     }
 
-    if ((local_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-        printf("Server: socket init failed - %s\n", strerror(errno));
-        exit(1);
+    if ((server_control_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+        perror("server: server_control_sock init");
+        exit(EXIT_FAILURE);
     }
 
-    while (connect(local_sock, (struct sockaddr*)args, sizeof(addr)) < 0) {
+    while (connect(server_control_sock, (struct sockaddr*)args, sizeof(addr)) < 0) {
     }
 
     memset(&addr, 0, sizeof(addr));
@@ -236,18 +313,19 @@ void* kvs_server_init(void* args) {
     addr.sin_addr.s_addr = INADDR_ANY;
     addr.sin_port = 0;
 
-    if (listen(sock_listener, MAX_CLIENT_COUNT) < 0) {
-        perror("listen");
+    if (listen(server_listen_sock, MAX_CLIENT_COUNT) < 0) {
+        perror("server: server_listen_sock listen");
         exit(EXIT_FAILURE);
     }
 
-    while (!is_stop || clients_count > 1) {
+    while (!should_stop || client_count > 1) {
         FD_ZERO(&read_fds);
-        FD_SET(sock_listener, &read_fds);
-        FD_SET(local_sock, &read_fds);
-        max_sd = sock_listener;
+        FD_SET(server_listen_sock, &read_fds);
+        FD_SET(server_control_sock, &read_fds);
+        max_sd = server_listen_sock;
+
         for (i = 0; i < MAX_CLIENT_COUNT; i++) {
-            sd = client_socket[i];
+            sd = client_op_sockets[i];
 
             if (sd > 0)
                 FD_SET(sd, &read_fds);
@@ -255,42 +333,61 @@ void* kvs_server_init(void* args) {
             if (sd > max_sd)
                 max_sd = sd;
         }
-        if (local_sock > max_sd)
-            max_sd = local_sock;
-        if ((select(max_sd + 1, &read_fds, NULL, NULL, NULL) < 0) && (errno != EINTR)) {
-            perror("select");
-            exit(EXIT_FAILURE);
+
+        if (server_control_sock > max_sd)
+            max_sd = server_control_sock;
+
+        if (select(max_sd + 1, &read_fds, NULL, NULL, NULL) < 0) {
+            if (errno != EINTR) {
+                perror("server: select");
+                exit(EXIT_FAILURE);
+            }
+            else {
+                /* restart select */
+                continue;
+            }
         }
 
-        if (FD_ISSET(local_sock, &read_fds)) {
-            DO_RW_OP_1(read, local_sock, &request,sizeof(kvs_request_t), ret);
+        if (FD_ISSET(server_control_sock, &read_fds)) {
+            DO_RW_OP_1(read,
+                       server_control_sock,
+                       &request,
+                       sizeof(kvs_request_t),
+                       ret,
+                       "server: get control msg from client");
             if (ret == 0) {
-                close(local_sock);
-                local_sock = 0;
+                close(server_control_sock);
+                server_control_sock = 0;
             }
             if (request.mode != AM_FINALIZE) {
-                printf("server: Wrong access mode for local socket.\n");
-                exit(1);
+                printf("server: invalid access mode for local socket\n");
+                exit(EXIT_FAILURE);
             }
-            is_stop = 1;
+            should_stop = 1;
         }
+
         for (i = 0; i < MAX_CLIENT_COUNT; i++) {
-            sd = client_socket[i];
+            sd = client_op_sockets[i];
             if (sd == 0)
                 continue;
 
             if (FD_ISSET(sd, &read_fds)) {
-                DO_RW_OP_1(read, sd, &request,sizeof(kvs_request_t), ret);
+                DO_RW_OP_1(read,
+                           sd,
+                           &request,
+                           sizeof(kvs_request_t),
+                           ret,
+                           "server: get command from client");
                 if (ret == 0) {
                     close(sd);
-                    client_socket[i] = 0;
-                    clients_count--;
+                    client_op_sockets[i] = 0;
+                    client_count--;
                     continue;
                 }
 
                 switch (request.mode) {
                     case AM_CONNECT: {
-                        clients_count++;
+                        client_count++;
                         break;
                     }
                     case AM_PUT: {
@@ -303,21 +400,41 @@ void* kvs_server_init(void* args) {
                     }
                     case AM_GET_VAL: {
                         count = get_val(request.name, request.key, request.val, ST_SERVER);
-                        DO_RW_OP(write, client_socket[i], &count, sizeof(size_t));
+                        DO_RW_OP(write,
+                                 client_op_sockets[i],
+                                 &count,
+                                 sizeof(size_t),
+                                 server_memory_mutex,
+                                 "server: get_value write size");
                         if (count != 0)
-                            DO_RW_OP(write, client_socket[i], &request, sizeof(kvs_request_t));
+                            DO_RW_OP(write,
+                                     client_op_sockets[i],
+                                     &request,
+                                     sizeof(kvs_request_t),
+                                     server_memory_mutex,
+                                     "server: get_value write data");
                         break;
                     }
                     case AM_GET_COUNT: {
                         count = get_count(request.name, ST_SERVER);
-                        DO_RW_OP(write, client_socket[i], &count, sizeof(size_t));
+                        DO_RW_OP(write,
+                                 client_op_sockets[i],
+                                 &count,
+                                 sizeof(size_t),
+                                 server_memory_mutex,
+                                 "server: get_count");
                         break;
                     }
                     case AM_GET_REPLICA: {
                         char* replica_size_str = getenv(CCL_WORLD_SIZE_ENV);
                         count = (replica_size_str != NULL) ? strtol(replica_size_str, NULL, 10)
-                                                           : clients_count;
-                        DO_RW_OP(write, client_socket[i], &count, sizeof(size_t));
+                                                           : client_count;
+                        DO_RW_OP(write,
+                                 client_op_sockets[i],
+                                 &count,
+                                 sizeof(size_t),
+                                 server_memory_mutex,
+                                 "server: get_replica");
                         break;
                     }
                     case AM_GET_KEYS_VALUES: {
@@ -326,22 +443,34 @@ void* kvs_server_init(void* args) {
                         size_t j;
                         kvs_request_t* answers = NULL;
 
-                        count =
-                            get_keys_values(request.name, &kvs_keys, &kvs_values, ST_SERVER);
+                        count = get_keys_values(request.name, &kvs_keys, &kvs_values, ST_SERVER);
 
-                        DO_RW_OP(write, client_socket[i], &count, sizeof(size_t));
+                        DO_RW_OP(write,
+                                 client_op_sockets[i],
+                                 &count,
+                                 sizeof(size_t),
+                                 server_memory_mutex,
+                                 "server: get_keys_values write size");
                         if (count == 0)
                             break;
 
                         answers = (kvs_request_t*)calloc(count, sizeof(kvs_request_t));
+                        if (answers == NULL) {
+                            printf("Memory allocation failed\n");
+                            break;
+                        }
                         for (j = 0; j < count; j++) {
-                            STR_COPY(answers[j].name, request.name, MAX_KVS_NAME_LENGTH);
-                            STR_COPY(answers[j].key, kvs_keys[j], MAX_KVS_KEY_LENGTH);
-                            STR_COPY(answers[j].val, kvs_values[j], MAX_KVS_VAL_LENGTH);
+                            kvs_str_copy(answers[j].name, request.name, MAX_KVS_NAME_LENGTH);
+                            kvs_str_copy(answers[j].key, kvs_keys[j], MAX_KVS_KEY_LENGTH);
+                            kvs_str_copy(answers[j].val, kvs_values[j], MAX_KVS_VAL_LENGTH);
                         }
 
-                        DO_RW_OP(
-                            write, client_socket[i], answers, sizeof(kvs_request_t) * count);
+                        DO_RW_OP(write,
+                                 client_op_sockets[i],
+                                 answers,
+                                 sizeof(kvs_request_t) * count,
+                                 server_memory_mutex,
+                                 "server: get_keys_values write data");
 
                         free(answers);
                         for (j = 0; j < count; j++) {
@@ -355,41 +484,59 @@ void* kvs_server_init(void* args) {
                     default: {
                         if (request.name[0] == '\0')
                             continue;
-                        printf("server: Unknown request mode - %d.\n", request.mode);
-                        exit(1);
+                        printf("server: unknown request mode - %d.\n", request.mode);
+                        exit(EXIT_FAILURE);
                     }
                 }
             }
         }
-        if (FD_ISSET(sock_listener, &read_fds)) {
+
+        if (FD_ISSET(server_listen_sock, &read_fds)) {
             int new_socket;
-      	    socklen_t peer_addr_size = sizeof(addr);
-            if ((new_socket = accept(sock_listener, (struct sockaddr*)&addr, (socklen_t*)&peer_addr_size)) <
+            socklen_t peer_addr_size = sizeof(addr);
+            if ((new_socket = accept(
+                     server_listen_sock, (struct sockaddr*)&addr, (socklen_t*)&peer_addr_size)) <
                 0) {
-                perror("accept");
+                perror("server: server_listen_sock accept");
                 exit(EXIT_FAILURE);
             }
             for (i = 0; i < MAX_CLIENT_COUNT; i++) {
-                if (client_socket[i] == 0) {
-                    client_socket[i] = new_socket;
+                if (client_op_sockets[i] == 0) {
+                    client_op_sockets[i] = new_socket;
                     break;
                 }
             }
             if (i >= MAX_CLIENT_COUNT) {
-                printf("server: Not enough free sockets\n");
-                exit(1);
+                printf("server: no free sockets\n");
+                exit(EXIT_FAILURE);
             }
         }
     }
 
     kvs_keeper_clear(ST_SERVER);
-    DO_RW_OP(write, local_sock, &is_stop, sizeof(int));
-    close(local_sock);
+
+    if (server_control_sock) {
+        DO_RW_OP_1(write,
+                   server_control_sock,
+                   &should_stop,
+                   sizeof(should_stop),
+                   ret,
+                   "server: send control msg to client");
+        /* 0 marks an already closed socket: never pass it to close() (fd 0 is stdin) */
+        close(server_control_sock);
+        server_control_sock = 0;
+    }
+
     for (i = 0; i < MAX_CLIENT_COUNT; i++) {
-        if (client_socket[i] != 0)
-            close(client_socket[i]);
+        if (client_op_sockets[i] != 0) {
+            close(client_op_sockets[i]);
+            client_op_sockets[i] = 0;
+        }
     }
-    close(sock_listener);
+
+    close(server_listen_sock);
+    server_listen_sock = 0;
+
     return NULL;
 }
 
@@ -404,7 +551,7 @@ size_t init_main_server_by_k8s(void) {
     main_port = strtol(port_str, NULL, 10);
     main_server_address.sin_port = main_port;
     if (inet_pton(AF_INET, main_host_ip, &(main_server_address.sin_addr)) <= 0) {
-        printf("\nInvalid address/ Address not supported: %s\n", main_host_ip);
+        printf("invalid address/ address not supported: %s\n", main_host_ip);
         return 1;
     }
     return 0;
@@ -417,14 +564,14 @@ size_t init_main_server_by_env(void) {
     tmp_host_ip = getenv(CCL_KVS_IP_PORT_ENV);
 
     if (tmp_host_ip == NULL) {
-        printf("You must set %s\n", CCL_KVS_IP_PORT_ENV);
+        printf("specify %s\n", CCL_KVS_IP_PORT_ENV);
         return 1;
     }
 
     memset(main_host_ip, 0, CCL_IP_LEN);
-    STR_COPY(main_host_ip, tmp_host_ip, CCL_IP_LEN);
+    kvs_str_copy(main_host_ip, tmp_host_ip, CCL_IP_LEN);
     if ((port = strstr(main_host_ip, "_")) == NULL) {
-        printf("You must set %s like IP_PORT\n", CCL_KVS_IP_PORT_ENV);
+        printf("set %s in format <ip>_<port>\n", CCL_KVS_IP_PORT_ENV);
         return 1;
     }
     port[0] = '\0';
@@ -434,7 +581,7 @@ size_t init_main_server_by_env(void) {
     main_server_address.sin_port = main_port;
 
     if (inet_pton(AF_INET, main_host_ip, &(main_server_address.sin_addr)) <= 0) {
-        printf("\nInvalid address/ Address not supported: %s\n", main_host_ip);
+        printf("invalid address / address not supported: %s\n", main_host_ip);
         return 1;
     }
     return 0;
@@ -448,22 +595,22 @@ size_t init_main_server_by_string(const char* main_addr) {
 
     main_server_address.sin_family = AF_INET;
 
-    if ((sock_listener = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-        printf("Server: socket init failed - %s\n", strerror(errno));
-        exit(1);
+    if ((server_listen_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+        perror("init_main_server_by_string: server_listen_sock init");
+        exit(EXIT_FAILURE);
     }
 
-    while (bind(sock_listener,
+    while (bind(server_listen_sock,
                 (const struct sockaddr*)&local_server_address,
                 sizeof(local_server_address)) < 0) {
         local_server_address.sin_port++;
     }
 
     memset(main_host_ip, 0, CCL_IP_LEN);
-    STR_COPY(main_host_ip, main_addr, CCL_IP_LEN);
+    kvs_str_copy(main_host_ip, main_addr, CCL_IP_LEN);
 
     if ((port = strstr(main_host_ip, "_")) == NULL) {
-        printf("You must set %s like IP_PORT\n", CCL_KVS_IP_PORT_ENV);
+        printf("init_main_server_by_string: set %s in format <ip>_<port>\n", CCL_KVS_IP_PORT_ENV);
         return 1;
     }
     port[0] = '\0';
@@ -473,7 +620,9 @@ size_t init_main_server_by_string(const char* main_addr) {
     main_server_address.sin_port = main_port;
 
     if (inet_pton(AF_INET, main_host_ip, &(main_server_address.sin_addr)) <= 0) {
-        printf("\nInvalid address/ Address not supported: %s(%s)\n", main_host_ip, strerror(errno));
+        printf("init_main_server_by_string: invalid address / address not supported: %s\n",
+               main_host_ip);
+        perror("init_main_server_by_string: inet_pton");
         return 1;
     }
     return 0;
@@ -482,12 +631,13 @@ size_t init_main_server_by_string(const char* main_addr) {
 size_t internal_kvs::kvs_main_server_address_reserve(char* main_address) {
     FILE* fp;
     char* additional_local_host_ips;
-    if ((fp = popen(CHECKER_IP, READ_ONLY)) == NULL) {
-        printf("Can't get host IP - %s\n", strerror(errno));
-        exit(1);
+    if ((fp = popen(GET_IP_CMD, READ_ONLY)) == NULL) {
+        perror("reserve_main_address: can not get host IP");
+        exit(EXIT_FAILURE);
     }
     CHECK_FGETS(fgets(local_host_ip, CCL_IP_LEN, fp), local_host_ip);
     pclose(fp);
+
     while (local_host_ip[strlen(local_host_ip) - 1] == '\n' ||
            local_host_ip[strlen(local_host_ip) - 1] == ' ')
         local_host_ip[strlen(local_host_ip) - 1] = NULL_CHAR;
@@ -495,22 +645,24 @@ size_t internal_kvs::kvs_main_server_address_reserve(char* main_address) {
         additional_local_host_ips[0] = NULL_CHAR;
 
     if (strlen(local_host_ip) >= CCL_IP_LEN - INT_STR_SIZE - 1) {
-        printf("Error: Local host IP is too bigger: %zu, expected: %d\n",
+        printf("reserve_main_address: local host IP is too long: %zu, expected: %d\n",
                strlen(local_host_ip),
                CCL_IP_LEN - INT_STR_SIZE - 1);
-        exit(1);
+        exit(EXIT_FAILURE);
     }
-    if ((sock_listener = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-        printf("Server: socket init failed - %s\n", strerror(errno));
-        exit(1);
+
+    if ((server_listen_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+        perror("reserve_main_address: server_listen_sock init");
+        exit(EXIT_FAILURE);
     }
+
     main_server_address.sin_family = AF_INET;
     main_server_address.sin_addr.s_addr = inet_addr(local_host_ip);
     main_server_address.sin_port = 1;
     local_server_address.sin_family = AF_INET;
     local_server_address.sin_addr.s_addr = inet_addr(local_host_ip);
 
-    while (bind(sock_listener,
+    while (bind(server_listen_sock,
                 (const struct sockaddr*)&main_server_address,
                 sizeof(main_server_address)) < 0) {
         main_server_address.sin_port++;
@@ -532,10 +684,11 @@ size_t init_main_server_address(const char* main_addr) {
     FILE* fp;
     char* additional_local_host_ips;
 
-    if ((fp = popen(CHECKER_IP, READ_ONLY)) == NULL) {
-        printf("Can't get host IP\n");
-        exit(1);
+    if ((fp = popen(GET_IP_CMD, READ_ONLY)) == NULL) {
+        perror("init_main_server_address: can not get host IP");
+        exit(EXIT_FAILURE);
     }
+
     memset(local_host_ip, 0, CCL_IP_LEN);
     CHECK_FGETS(fgets(local_host_ip, CCL_IP_LEN, fp), local_host_ip);
     pclose(fp);
@@ -557,14 +710,14 @@ size_t init_main_server_address(const char* main_addr) {
             ip_getting_mode = IGT_K8S;
         }
         else {
-            printf("Unknown %s: %s\n", CCL_KVS_IP_EXCHANGE_ENV, ip_getting_type);
+            printf("unknown %s: %s\n", CCL_KVS_IP_EXCHANGE_ENV, ip_getting_type);
             return 1;
         }
     }
 
     if (main_addr != NULL) {
         ip_getting_mode = IGT_ENV;
-        if (sock_listener == 0)
+        if (server_listen_sock == 0)
             init_main_server_by_string(main_addr);
         return 0;
     }
@@ -575,14 +728,15 @@ size_t init_main_server_address(const char* main_addr) {
 
     main_server_address.sin_family = AF_INET;
 
-    if ((sock_listener = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-        printf("Server: socket init failed - %s\n", strerror(errno));
-        exit(1);
+    if ((server_listen_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+        /* no usable listen socket: report errno via perror and abort */
+        perror("init_main_server_address: server_listen_sock init");
+        exit(EXIT_FAILURE);
     }
 
     switch (ip_getting_mode) {
         case IGT_K8S: {
-            while (bind(sock_listener,
+            while (bind(server_listen_sock,
                         (const struct sockaddr*)&local_server_address,
                         sizeof(local_server_address)) < 0) {
                 local_server_address.sin_port++;
@@ -610,11 +764,11 @@ size_t init_main_server_address(const char* main_addr) {
                 }
             }
             if (is_master_node) {
-                if (bind(sock_listener,
+                if (bind(server_listen_sock,
                          (const struct sockaddr*)&main_server_address,
                          sizeof(main_server_address)) < 0) {
-                    printf("PORT %d busy\n", main_server_address.sin_port);
-                    while (bind(sock_listener,
+                    printf("port [%d] is busy\n", main_server_address.sin_port);
+                    while (bind(server_listen_sock,
                                 (const struct sockaddr*)&local_server_address,
                                 sizeof(local_server_address)) < 0) {
                         local_server_address.sin_port++;
@@ -626,7 +780,7 @@ size_t init_main_server_address(const char* main_addr) {
                 }
             }
             else {
-                while (bind(sock_listener,
+                while (bind(server_listen_sock,
                             (const struct sockaddr*)&local_server_address,
                             sizeof(local_server_address)) < 0) {
                     local_server_address.sin_port++;
@@ -637,7 +791,7 @@ size_t init_main_server_address(const char* main_addr) {
             return res;
         }
         default: {
-            printf("Unknown %s\n", CCL_KVS_IP_EXCHANGE_ENV);
+            printf("unknown %s\n", CCL_KVS_IP_EXCHANGE_ENV);
             return 1;
         }
     }
@@ -657,55 +811,69 @@ size_t internal_kvs::kvs_init(const char* main_addr) {
     addr.sin_addr.s_addr = inet_addr("127.0.0.1");
     addr.sin_port = 1;
 
-    if ((sock_sender = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-        printf("\n Socket creation error \n");
+    if ((client_op_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+        perror("kvs_init: client_op_sock init");
         return 1;
     }
-    if ((local_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-        printf("\n Socket creation error: %s\n", strerror(errno));
+
+    if ((server_control_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+        perror("kvs_init: server_control_sock init");
         return 1;
     }
+
     if (init_main_server_address(main_addr)) {
-        printf("Init main server address error\n");
-        close(sock_sender);
-        close(local_sock);
+        printf("kvs_init: init main server address error\n");
+        close(client_op_sock);
+        close(server_control_sock);
+        client_op_sock = 0;
+        server_control_sock = 0;
         return 1;
     }
-    while (bind(local_sock, (const struct sockaddr*)&addr, sizeof(addr)) < 0) {
+
+    while (bind(server_control_sock, (const struct sockaddr*)&addr, sizeof(addr)) < 0) {
         addr.sin_port++;
     }
 
-    if (listen(local_sock, 1) < 0) {
-        printf("listener error: %s\n", strerror(errno));
+    if (listen(server_control_sock, 1) < 0) {
+        perror("kvs_init: server_control_sock listen");
         exit(EXIT_FAILURE);
     }
-    getsockname(local_sock, (struct sockaddr*)&addr, &len);
-    err = pthread_create(&thread, NULL, kvs_server_init, &addr);
+
+    getsockname(server_control_sock, (struct sockaddr*)&addr, &len);
+    err = pthread_create(&kvs_thread, NULL, kvs_server_init, &addr);
     if (err) {
-        printf("error while creating listener thread, pthread_create returns %d\n", err);
+        printf("kvs_init: failed to create kvs server thread, pthread_create returns %d\n", err);
         return 1;
     }
 
-    if ((accepted_local_sock = accept(local_sock, NULL, NULL)) < 0) {
-        printf("Client: accept error: %s\n", strerror(errno));
+    if ((client_control_sock = accept(server_control_sock, NULL, NULL)) < 0) {
+        perror("kvs_init: server_control_sock accept");
         exit(EXIT_FAILURE);
     }
+
     /* Wait connection to master */
     start_time = time(NULL);
     do {
         err = connect(
-            sock_sender, (struct sockaddr*)&main_server_address, sizeof(main_server_address));
+            client_op_sock, (struct sockaddr*)&main_server_address, sizeof(main_server_address));
         connection_time = time(NULL) - start_time;
     } while ((err < 0) && (connection_time < CONNECTION_TIMEOUT));
 
     if (connection_time >= CONNECTION_TIMEOUT) {
-        printf("Connection error: timeout limit (%ld > %d)\n", connection_time, CONNECTION_TIMEOUT);
-        exit(1);
+        printf("kvs_init: connection error: timeout limit (%ld > %d)\n",
+               connection_time,
+               CONNECTION_TIMEOUT);
+        exit(EXIT_FAILURE);
     }
 
     request.mode = AM_CONNECT;
 
-    DO_RW_OP(write, sock_sender, &request, sizeof(kvs_request_t));
+    DO_RW_OP(write,
+             client_op_sock,
+             &request,
+             sizeof(kvs_request_t),
+             client_memory_mutex,
+             "client: connect");
 
     if (strstr(main_host_ip, local_host_ip) && local_port == main_port) {
         is_master = 1;
@@ -719,28 +887,46 @@ size_t internal_kvs::kvs_finalize(void) {
     kvs_request_t request;
     memset(&request, 0, sizeof(kvs_request_t));
 
-    if (thread != 0) {
+    if (kvs_thread != 0) {
         void* exit_code;
         int err;
         request.mode = AM_FINALIZE;
 
-        DO_RW_OP(write, accepted_local_sock, &request, sizeof(kvs_request_t));
-
-        DO_RW_OP(read, accepted_local_sock, &err, sizeof(int));
-
-        err = pthread_join(thread, &exit_code);
+        DO_RW_OP(write,
+                 client_control_sock,
+                 &request,
+                 sizeof(kvs_request_t),
+                 client_memory_mutex,
+                 "client: finalize start");
+
+        DO_RW_OP(read,
+                 client_control_sock,
+                 &err,
+                 sizeof(int),
+                 client_memory_mutex,
+                 "client: finalize complete");
+
+        err = pthread_join(kvs_thread, &exit_code);
         if (err) {
-            printf("error while joining progress listener, pthread_join returns %d\n", err);
+            printf("kvs_finalize: failed to stop kvs server thread, pthread_join returns %d\n",
+                   err);
         }
-        thread = 0;
-        close(accepted_local_sock);
-        close(local_sock);
+
+        kvs_thread = 0;
+
+        close(client_control_sock);
+        close(server_control_sock);
+
+        client_control_sock = 0;
+        server_control_sock = 0;
     }
-    close(sock_sender);
+    close(client_op_sock);
+    client_op_sock = 0;
 
     if (ip_getting_mode == IGT_K8S)
         request_k8s_kvs_finalize(is_master);
     is_inited = false;
+
     return 0;
 }
 internal_kvs::~internal_kvs() {
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h
index 41ddf1f9e..8c8f5b0ff 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h
@@ -13,11 +13,11 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#ifndef KVS
-#define KVS
+#pragma once
 
 #include <stddef.h>
 #include "ikvs_wrapper.h"
+
 class internal_kvs final : public ikvs_wrapper {
 public:
     size_t kvs_set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) override;
@@ -47,4 +47,3 @@ class internal_kvs final : public ikvs_wrapper {
 private:
     bool is_inited{ false };
 };
-#endif
\ No newline at end of file
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/kvs_common_attr.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/kvs_common_attr.hpp
new file mode 100644
index 000000000..8f5eee319
--- /dev/null
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/kvs_common_attr.hpp
@@ -0,0 +1,53 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/kvs_attr_ids_traits.hpp"
+
+namespace ccl {
+
+class ccl_kvs_attr_impl {
+public:
+    /**
+     * `version` attribute: stored by the constructor; set_attribute_value() always throws
+     */
+    using version_traits_t = detail::ccl_api_type_attr_traits<kvs_attr_id, kvs_attr_id::version>;
+
+    const typename version_traits_t::return_type& get_attribute_value(
+        const version_traits_t& id) const { // `id` is not read by the body
+        return version;
+    }
+
+    typename version_traits_t::return_type set_attribute_value(typename version_traits_t::type val,
+                                                               const version_traits_t& t) {
+        (void)t; // `t` is intentionally unused
+        throw ccl::exception("Set value for 'ccl::kvs_attr_id::version' is not allowed");
+        return version; // unreachable: the throw above always fires
+    }
+
+    ccl_kvs_attr_impl(const typename version_traits_t::return_type& version) : version(version) {}
+
+    template <kvs_attr_id attr_id>
+    bool is_valid() const noexcept {
+        return (attr_id == kvs_attr_id::version); // true only for the `version` attribute
+    }
+
+protected:
+    typename version_traits_t::return_type version; // value returned by get_attribute_value()
+};
+
+} // namespace ccl
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp
index 0446fce0a..e648f1fbc 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.cpp
@@ -21,28 +21,27 @@
 users_kvs::users_kvs(std::shared_ptr<ccl::kvs_interface> kvs) : kvs(kvs) {}
 
 size_t users_kvs::kvs_set_value(const char* kvs_name, const char* kvs_key, const char* kvs_val) {
-    std::string name(kvs_name), key(kvs_key);
+    ccl::string_class name(kvs_name), key(kvs_key);
     ccl::vector_class<char> vec_val(kvs_val, kvs_val + strlen(kvs_val) + 1);
     vec_val[strlen(kvs_val)] = '\0';
-    kvs->set((name + key).c_str(), vec_val);
+    kvs->set(name + key, vec_val); // entry key is the concatenation of kvs_name and kvs_key
 
     return 0;
 }
 
 size_t users_kvs::kvs_remove_name_key(const char* kvs_name, const char* kvs_key) {
     ccl::vector_class<char> kvs_val = { '\0' };
-    std::string name(kvs_name), key(kvs_key);
-    kvs->set((name + key).c_str(), kvs_val);
-
+    ccl::string_class name(kvs_name), key(kvs_key);
+    kvs->set(name + key, kvs_val); // no erase call here: the entry is overwritten with a lone '\0'
     return 0;
 }
 
 size_t users_kvs::kvs_get_value_by_name_key(const char* kvs_name,
                                             const char* kvs_key,
                                             char* kvs_val) {
-    std::string name(kvs_name), key(kvs_key);
+    ccl::string_class name(kvs_name), key(kvs_key);
+    ccl::vector_class<char> res = kvs->get(name + key);
 
-    ccl::vector_class<char> res = kvs->get((name + key).c_str());
     if (res.data())
         SET_STR(kvs_val, MAX_KVS_VAL_LENGTH, "%s", res.data());
     else
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h
index 4ad3abbac..1d220ebff 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/users_kvs.h
@@ -14,6 +14,7 @@
  limitations under the License.
 */
 #pragma once
+
 #include <unistd.h>
 #include <memory>
 #include "oneapi/ccl.hpp"
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.c b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.cpp
similarity index 87%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.c
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.cpp
index 64701f1d0..4f554a752 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.c
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.cpp
@@ -15,7 +15,9 @@
 */
 #include <stdlib.h>
 #include <string.h>
-#include "kvs_keeper.h"
+
+#include "util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp"
+#include "kvs_keeper.hpp"
 #include "def.h"
 
 #define COMPARE_STR(str1, str2, str2_len) (strstr((str1), (str2)) && (strlen(str1) == (str2_len)))
@@ -56,7 +58,7 @@ size_t get_val(const char kvs_name[], const char kvs_key[], char* kvs_val, stora
     for (i = 0; i < kvs_list_size[st_type]; i++) {
         if (COMPARE_STR(new_key_ptr->kvs.name, kvs_name, kvs_name_len) &&
             COMPARE_STR(new_key_ptr->kvs.key, kvs_key, kvs_key_len)) {
-            STR_COPY(kvs_val, new_key_ptr->kvs.val, MAX_KVS_VAL_LENGTH);
+            kvs_str_copy(kvs_val, new_key_ptr->kvs.val, MAX_KVS_VAL_LENGTH);
             return 1;
         }
         new_key_ptr = new_key_ptr->next;
@@ -91,7 +93,15 @@ size_t get_keys_values(const char* kvs_name,
     }
 
     *kvs_values = (char**)malloc(sizeof(char*) * count);
+    if (*kvs_values == NULL) {
+        printf("Memory allocation failed\n");
+        exit(1);
+    }
     *kvs_keys = (char**)malloc(sizeof(char*) * count);
+    if (*kvs_keys == NULL) {
+        printf("Memory allocation failed\n");
+        exit(1);
+    }
 
     for (i = 0; i < count; i++) {
         (*kvs_keys)[i] = (char*)malloc(sizeof(char) * MAX_KVS_KEY_LENGTH);
@@ -101,8 +111,8 @@ size_t get_keys_values(const char* kvs_name,
     new_key_ptr = head[st_type];
     for (i = 0; ((new_key_ptr != NULL) && (i < count));) {
         if (COMPARE_STR(new_key_ptr->kvs.name, kvs_name, kvs_name_len)) {
-            STR_COPY((*kvs_keys)[i], new_key_ptr->kvs.key, MAX_KVS_KEY_LENGTH);
-            STR_COPY((*kvs_values)[i], new_key_ptr->kvs.val, MAX_KVS_VAL_LENGTH);
+            kvs_str_copy((*kvs_keys)[i], new_key_ptr->kvs.key, MAX_KVS_KEY_LENGTH);
+            kvs_str_copy((*kvs_values)[i], new_key_ptr->kvs.val, MAX_KVS_VAL_LENGTH);
             i++;
         }
         new_key_ptr = new_key_ptr->next;
@@ -167,9 +177,9 @@ void put_key(const char kvs_name[],
     }
     kvs_list_size[st_type]++;
 copy:
-    STR_COPY(tmp_key_ptr->kvs.name, kvs_name, MAX_KVS_NAME_LENGTH);
-    STR_COPY(tmp_key_ptr->kvs.key, kvs_key, MAX_KVS_KEY_LENGTH);
-    STR_COPY(tmp_key_ptr->kvs.val, kvs_val, MAX_KVS_VAL_LENGTH);
+    kvs_str_copy(tmp_key_ptr->kvs.name, kvs_name, MAX_KVS_NAME_LENGTH);
+    kvs_str_copy(tmp_key_ptr->kvs.key, kvs_key, MAX_KVS_KEY_LENGTH);
+    kvs_str_copy(tmp_key_ptr->kvs.val, kvs_val, MAX_KVS_VAL_LENGTH);
 
     if (strlen(kvs_name) > MAX_KVS_NAME_LENGTH) {
         tmp_key_ptr->kvs.name[MAX_KVS_NAME_LENGTH - 1] = NULL_CHAR;
@@ -203,9 +213,9 @@ size_t cut_head(char* kvs_name, char* kvs_key, char* kvs_val, storage_type_t st_
         memset(kvs_name, 0, MAX_KVS_NAME_LENGTH);
         memset(kvs_key, 0, MAX_KVS_KEY_LENGTH);
         memset(kvs_val, 0, MAX_KVS_VAL_LENGTH);
-        STR_COPY(kvs_name, key_ptr->kvs.name, MAX_KVS_NAME_LENGTH);
-        STR_COPY(kvs_key, key_ptr->kvs.key, MAX_KVS_KEY_LENGTH);
-        STR_COPY(kvs_val, key_ptr->kvs.val, MAX_KVS_VAL_LENGTH);
+        kvs_str_copy(kvs_name, key_ptr->kvs.name, MAX_KVS_NAME_LENGTH);
+        kvs_str_copy(kvs_key, key_ptr->kvs.key, MAX_KVS_KEY_LENGTH);
+        kvs_str_copy(kvs_val, key_ptr->kvs.val, MAX_KVS_VAL_LENGTH);
 
         free(key_ptr);
         kvs_list_size[st_type]--;
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.hpp
similarity index 100%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.h
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.hpp
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
index 5d9616e42..ca4a2a59a 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp
@@ -55,7 +55,7 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     char my_ip[MAX_KVS_VAL_LENGTH];
     char* point_to_space;
 
-    if ((fp = popen(CHECKER_IP, READ_ONLY)) == NULL) {
+    if ((fp = popen(GET_IP_CMD, READ_ONLY)) == NULL) {
         printf("Can't get host IP\n");
         exit(1);
     }
@@ -92,6 +92,11 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
     }
 
     server_addresses = (struct sockaddr_in*)malloc((num_listeners) * sizeof(struct sockaddr_in));
+    if (server_addresses == NULL) {
+        printf("\nmemory allocation failed \n");
+        res = -1;
+        goto exit;
+    }
 
     /*get listener addresses*/
     for (i = 0, j = 0; i < num_listeners; i++, j++) {
@@ -107,7 +112,23 @@ int pmi_listener::collect_sock_addr(std::shared_ptr<helper> h) {
             i--;
             continue;
         }
-        server_addresses[i].sin_port = strtol(point_to_port, NULL, 10);
+
+        if ((server_addresses[i].sin_port = strtol(point_to_port, NULL, 10)) == 0) {
+            /* if a conversion error occurred, display a message and exit */
+            if (errno == EINVAL) {
+                printf("\nconversion error occurred from: %hu\n", server_addresses[i].sin_port);
+                res = -1;
+                goto exit;
+            }
+
+            /* if the value provided was out of range, display a warning message */
+            if (errno == ERANGE) {
+                printf("\nthe value provided was out of range, value: %hu\n",
+                       server_addresses[i].sin_port);
+                res = -1;
+                goto exit;
+            }
+        }
         server_addresses[i].sin_family = AF_INET;
 
         if (inet_pton(AF_INET, sock_addr_str[j], &(server_addresses[i].sin_addr)) <= 0) {
@@ -163,8 +184,9 @@ int pmi_listener::run_listener(std::shared_ptr<helper> h) {
         char* point_to_space;
         struct timeval timeout;
         timeout.tv_sec = LISTENER_TIMEOUT;
+        timeout.tv_usec = 0;
 
-        if ((fp = popen(CHECKER_IP, READ_ONLY)) == NULL) {
+        if ((fp = popen(GET_IP_CMD, READ_ONLY)) == NULL) {
             printf("Can't get host IP\n");
             exit(1);
         }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp
index 87e44670f..e845ea50d 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp
@@ -16,7 +16,7 @@
 #ifndef LISTENER_H_INCLUDED
 #define LISTENER_H_INCLUDED
 
-#include "helper.h"
+#include "helper.hpp"
 
 class pmi_listener {
 public:
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.c b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.cpp
similarity index 89%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.c
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.cpp
index f2e567581..712c69562 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.c
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.cpp
@@ -16,7 +16,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "rank_list.h"
+#include "rank_list.hpp"
 
 void rank_list_sort(rank_list_t* list) {
     rank_list_t* left = list;
@@ -26,7 +26,7 @@ void rank_list_sort(rank_list_t* list) {
         right = left->next;
         while (right != NULL) {
             if (left->rank > right->rank) {
-                size_t tmp_i = left->rank;
+                int tmp_i = left->rank;
                 left->rank = right->rank;
                 right->rank = tmp_i;
             }
@@ -48,7 +48,7 @@ void rank_list_clean(rank_list_t** list) {
     *list = NULL;
 }
 
-size_t rank_list_contains(rank_list_t* list, size_t rank) {
+size_t rank_list_contains(rank_list_t* list, int rank) {
     rank_list_t* cur_list = list;
 
     while (cur_list != NULL) {
@@ -81,9 +81,13 @@ void rank_list_keep_first_n(rank_list_t** origin_list, size_t n) {
         (*origin_list) = NULL;
 }
 
-void rank_list_add(rank_list_t** origin_list, size_t rank) {
+void rank_list_add(rank_list_t** origin_list, int rank) {
     if ((*origin_list) == NULL) {
         (*origin_list) = (rank_list_t*)malloc(sizeof(rank_list_t));
+        if ((*origin_list) == NULL) {
+            printf("Memory allocation failed\n");
+            return;
+        }
         (*origin_list)->next = NULL;
         (*origin_list)->rank = rank;
     }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.hpp
similarity index 87%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.h
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.hpp
index 97d8a1a81..064e244d6 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/rank_list.hpp
@@ -20,11 +20,11 @@
 extern "C" {
 #endif
 typedef struct rank_list {
-    size_t rank;
+    int rank;
     struct rank_list* next;
 } rank_list_t;
 
-size_t rank_list_contains(rank_list_t* list, size_t rank);
+size_t rank_list_contains(rank_list_t* list, int rank);
 
 void rank_list_clean(rank_list_t** list);
 
@@ -32,7 +32,7 @@ void rank_list_sort(rank_list_t* list);
 
 void rank_list_keep_first_n(rank_list_t** origin_list, size_t n);
 
-void rank_list_add(rank_list_t** origin_list, size_t rank);
+void rank_list_add(rank_list_t** origin_list, int rank);
 
 #ifdef __cplusplus
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.c b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp
similarity index 96%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.c
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp
index 32a460be3..b72c8727e 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.c
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.cpp
@@ -19,8 +19,9 @@
 #include <ctype.h>
 #include <unistd.h>
 
-#include "request_wrappers_k8s.h"
 #include "def.h"
+#include "util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp"
+#include "request_wrappers_k8s.hpp"
 
 #define JOB_NAME "CCL_JOB_NAME"
 
@@ -114,7 +115,7 @@ void json_get_val(FILE* fp, const char** keys, size_t keys_count, char* val) {
         res[strlen(res) - 1] = '\0';
         last_char = res[strlen(res) - 1];
     }
-    STR_COPY(val, res, MAX_KVS_VAL_LENGTH);
+    kvs_str_copy(val, res, MAX_KVS_VAL_LENGTH);
     while (fgets(cur_kvs_str, MAX_KVS_STR_LENGTH, fp)) {
     }
 }
@@ -222,7 +223,10 @@ void get_my_job_name(const char* connect_api_template) {
             pod_name,
             get_kvs_val);
 
-    fp = popen(run_str, READ_ONLY);
+    if ((fp = popen(run_str, READ_ONLY)) == NULL) {
+        printf("Can't get %s", strerror(errno));
+        exit(1);
+    }
     CHECK_FGETS(fgets(job_name, MAX_KVS_NAME_LENGTH, fp), job_name);
     pclose(fp);
     if (job_name[0] == NULL_CHAR) {
@@ -355,24 +359,33 @@ size_t request_k8s_kvs_finalize(size_t is_master) {
 
 size_t get_by_template(char*** kvs_entry,
                        const char* request,
-                       const char* template,
+                       const char* template_str,
                        int count,
                        int max_count) {
     FILE* fp;
     char get_val[REQUEST_POSTFIX_SIZE];
     char run_str[RUN_REQUEST_SIZE];
-    size_t i;
+    int i;
 
     if (*kvs_entry != NULL)
         free(*kvs_entry);
 
     *kvs_entry = (char**)malloc(sizeof(char*) * count);
-    for (i = 0; i < count; i++)
+    if (*kvs_entry == NULL) {
+        printf("Memory allocation failed\n");
+        exit(1);
+    }
+    for (i = 0; i < count; i++) {
         (*kvs_entry)[i] = (char*)malloc(sizeof(char) * max_count);
+        if ((*kvs_entry)[i] == NULL) {
+            printf("Memory allocation failed\n");
+            exit(1);
+        }
+    }
 
     i = 0;
 
-    SET_STR(get_val, REQUEST_POSTFIX_SIZE, CONCAT_TWO_COMMAND_TEMPLATE, request, template);
+    SET_STR(get_val, REQUEST_POSTFIX_SIZE, CONCAT_TWO_COMMAND_TEMPLATE, request, template_str);
     SET_STR(run_str, RUN_REQUEST_SIZE, run_get_template, get_val);
     if ((fp = popen(run_str, READ_ONLY)) == NULL) {
         printf("Can't get by template\n");
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.hpp
similarity index 100%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.h
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/request_wrappers_k8s.hpp
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
index 23addcc7f..5e947e01d 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp
@@ -16,7 +16,7 @@
 #include "atl/util/pm/pmi_resizable_rt/pmi_resizable.h"
 #include "util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h"
 
-static size_t root_rank = 0;
+static int root_rank = 0;
 static size_t is_new_root = 0;
 static size_t ask_only_framework = 0;
 static size_t finalized = 0;
@@ -31,9 +31,9 @@ void Call_Hard_finilize(int sig) {
     pmi_object->Hard_finilize(sig);
 }
 
-kvs_resize_action_t pmi_resizable::default_checker(size_t comm_size) {
+kvs_resize_action_t pmi_resizable::default_checker(int comm_size) {
     char* comm_size_to_start_env;
-    size_t comm_size_to_start;
+    int comm_size_to_start;
 
     comm_size_to_start_env = getenv(CCL_WORLD_SIZE_ENV);
 
@@ -47,7 +47,7 @@ kvs_resize_action_t pmi_resizable::default_checker(size_t comm_size) {
     return KVS_RA_WAIT;
 }
 
-kvs_resize_action_t pmi_resizable::call_resize_fn(size_t comm_size) {
+kvs_resize_action_t pmi_resizable::call_resize_fn(int comm_size) {
     if (resize_function != nullptr)
         return resize_function(comm_size);
 
@@ -56,8 +56,8 @@ kvs_resize_action_t pmi_resizable::call_resize_fn(size_t comm_size) {
 
 int pmi_resizable::PMIR_Update(void) {
     char up_idx_str[MAX_KVS_VAL_LENGTH];
-    size_t prev_new_ranks_count = 0;
-    size_t prev_killed_ranks_count = 0;
+    int prev_new_ranks_count = 0;
+    int prev_killed_ranks_count = 0;
     int prev_idx = -1;
     kvs_resize_action_t answer;
     rank_list_t* dead_up_idx = NULL;
@@ -100,7 +100,7 @@ int pmi_resizable::PMIR_Update(void) {
 
                     //                    while (int_list_is_contained(killed_ranks, root_rank) == 1)
                     {
-                        size_t old_root = root_rank;
+                        int old_root = root_rank;
                         h->get_new_root(&root_rank);
 
                         if (my_rank == root_rank && old_root != root_rank)
@@ -151,7 +151,8 @@ int pmi_resizable::PMIR_Update(void) {
             if (!is_first_collect || ask_only_framework == 1)
                 answer = call_resize_fn(count_pods - killed_ranks_count + new_ranks_count);
             else {
-                if (h->get_replica_size() != count_pods - killed_ranks_count + new_ranks_count)
+                if ((int)(h->get_replica_size()) !=
+                    count_pods - killed_ranks_count + new_ranks_count)
                     answer = KVS_RA_WAIT;
                 else
                     answer = KVS_RA_RUN;
@@ -202,7 +203,7 @@ int pmi_resizable::PMIR_Update(void) {
 void pmi_resizable::Hard_finilize(int sig) {
     char rank_str[INT_STR_SIZE];
 
-    SET_STR(rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, my_rank);
+    SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
     h->set_value(KVS_DEAD_POD, my_hostname, rank_str);
 
@@ -214,7 +215,7 @@ void pmi_resizable::Hard_finilize(int sig) {
         old_act.sa_handler(sig);
 }
 
-int pmi_resizable::PMIR_Main_Addr_Reserv(char* main_addr) {
+int pmi_resizable::PMIR_Main_Addr_Reserve(char* main_addr) {
     h->main_server_address_reserve(main_addr);
     return 0;
 }
@@ -272,7 +273,7 @@ int pmi_resizable::PMIR_Finalize(void) {
 
     applied = 0;
 
-    SET_STR(rank_str, INT_STR_SIZE, SIZE_T_TEMPLATE, my_rank);
+    SET_STR(rank_str, INT_STR_SIZE, RANK_TEMPLATE, my_rank);
 
     h->remove_name_key(KVS_POD_NUM, rank_str);
 
@@ -313,18 +314,18 @@ int pmi_resizable::PMIR_Barrier(void) {
     return 0;
 }
 
-int pmi_resizable::PMIR_Get_size(size_t* size) {
+int pmi_resizable::PMIR_Get_size(int* size) {
     *size = count_pods;
     return 0;
 }
 
-int pmi_resizable::PMIR_Get_rank(size_t* rank) {
+int pmi_resizable::PMIR_Get_rank(int* rank) {
     *rank = my_rank;
     return 0;
 }
 
 int pmi_resizable::PMIR_KVS_Get_my_name(char* kvs_name, size_t length) {
-    STR_COPY(kvs_name, KVS_NAME, length);
+    kvs_str_copy(kvs_name, KVS_NAME, length);
     return 0;
 }
 
@@ -372,15 +373,15 @@ int pmi_resizable::PMIR_Wait_notification(void) {
     return listener.run_listener(h);
 }
 
-size_t pmi_resizable::get_rank() {
+int pmi_resizable::get_rank() {
     return rank;
 }
 
-size_t pmi_resizable::get_size() {
+int pmi_resizable::get_size() {
     return size;
 }
 
-size_t pmi_resizable::get_thread() {
+size_t pmi_resizable::get_local_thread_idx() {
     return 0;
 }
 size_t pmi_resizable::get_local_kvs_id() {
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.h
index 85a6eac0c..b9ef14a62 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.h
@@ -44,17 +44,17 @@ typedef enum {
     KVS_RA_RUN = 1,
     KVS_RA_FINALIZE = 2,
 } kvs_resize_action_t;
-typedef kvs_resize_action_t (*pmir_resize_fn_t)(size_t comm_size);
+typedef kvs_resize_action_t (*pmir_resize_fn_t)(int comm_size);
 
-int PMIR_API PMIR_Main_Addr_Reserv(char* main_addr);
+int PMIR_API PMIR_Main_Addr_Reserve(char* main_addr);
 
 int PMIR_API PMIR_Init(const char* main_addr);
 
 int PMIR_API PMIR_Finalize(void);
 
-int PMIR_API PMIR_Get_size(size_t* size);
+int PMIR_API PMIR_Get_size(int* size);
 
-int PMIR_API PMIR_Get_rank(size_t* rank);
+int PMIR_API PMIR_Get_rank(int* rank);
 
 int PMIR_API PMIR_KVS_Get_my_name(char* kvs_name, size_t length);
 
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.c b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.cpp
similarity index 86%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.c
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.cpp
index da5bdb652..30fe48722 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.c
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.cpp
@@ -16,7 +16,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "shift_list.h"
+#include "shift_list.hpp"
 
 void shift_list_clean(shift_list_t** list) {
     shift_list_t* cur_list = (*list);
@@ -29,10 +29,14 @@ void shift_list_clean(shift_list_t** list) {
     (*list) = NULL;
 }
 
-void shift_list_add(shift_list_t** list, size_t old_rank, size_t new_rank, change_type_t type) {
+void shift_list_add(shift_list_t** list, int old_rank, int new_rank, change_type_t type) {
     shift_list_t* cur_list;
     if ((*list) == NULL) {
         (*list) = (shift_list_t*)malloc(sizeof(shift_list_t));
+        if ((*list) == NULL) {
+            printf("Memory allocation failed\n");
+            return;
+        }
         cur_list = (*list);
     }
     else {
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.hpp
similarity index 88%
rename from src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.h
rename to src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.hpp
index 7983f4dcc..59f3d6a5e 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/shift_list.hpp
@@ -27,8 +27,8 @@ typedef enum change_type {
 } change_type_t;
 
 typedef struct shift_rank {
-    size_t old_rank;
-    size_t new_rank;
+    int old_rank;
+    int new_rank;
     change_type_t type;
 } shift_rank_t;
 
@@ -39,7 +39,7 @@ typedef struct shift_list {
 
 void shift_list_clean(shift_list_t** list);
 
-void shift_list_add(shift_list_t** list, size_t old_rank, size_t new_rank, change_type_t type);
+void shift_list_add(shift_list_t** list, int old_rank, int new_rank, change_type_t type);
 
 #ifdef __cplusplus
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_rt.c b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_rt.c
index 58b770e2f..4cfc14dc3 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_rt.c
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_rt.c
@@ -23,7 +23,7 @@
 
 #include "pm_rt.h"
 
-#define RESIZABLE_PMI_RT_KEY_FORMAT "%s-%zu"
+#define RESIZABLE_PMI_RT_KEY_FORMAT "%s-%d"
 
 typedef struct resizable_pm_rt_context {
     pm_rt_desc_t pmrt_desc;
@@ -69,7 +69,7 @@ static void resizable_pmirt_barrier(pm_rt_desc_t *pmrt_desc) {
 
 static atl_status_t resizable_pmirt_kvs_put(pm_rt_desc_t *pmrt_desc,
                                             char *kvs_key,
-                                            size_t proc_idx,
+                                            int proc_idx,
                                             const void *kvs_val,
                                             size_t kvs_val_len) {
     int ret;
@@ -109,7 +109,7 @@ static atl_status_t resizable_pmirt_kvs_put(pm_rt_desc_t *pmrt_desc,
 
 static atl_status_t resizable_pmirt_kvs_get(pm_rt_desc_t *pmrt_desc,
                                             char *kvs_key,
-                                            size_t proc_idx,
+                                            int proc_idx,
                                             void *kvs_val,
                                             size_t kvs_val_len) {
     int ret;
@@ -140,7 +140,7 @@ static atl_status_t resizable_pmirt_kvs_get(pm_rt_desc_t *pmrt_desc,
     return ATL_STATUS_SUCCESS;
 }
 
-static atl_status_t resizable_pmirt_update(size_t *proc_idx, size_t *proc_count) {
+static atl_status_t resizable_pmirt_update(int *proc_idx, int *proc_count) {
     int ret;
     ret = PMIR_Update();
     if (ret != PMIR_SUCCESS)
@@ -184,8 +184,8 @@ pm_rt_kvs_ops_t resizable_kvs_ops = {
     .get = resizable_pmirt_kvs_get,
 };
 
-atl_status_t resizable_pmirt_init(size_t *proc_idx,
-                                  size_t *proc_count,
+atl_status_t resizable_pmirt_init(int *proc_idx,
+                                  int *proc_count,
                                   pm_rt_desc_t **pmrt_desc,
                                   const char *main_addr) {
     int ret;
@@ -260,8 +260,8 @@ atl_status_t resizable_pmirt_init(size_t *proc_idx,
     return ATL_STATUS_FAILURE;
 }
 
-atl_status_t resizable_pmirt_main_addr_reserv(char *main_addr) {
-    int ret = PMIR_Main_Addr_Reserv(main_addr);
+atl_status_t resizable_pmirt_main_addr_reserve(char *main_addr) {
+    int ret = PMIR_Main_Addr_Reserve(main_addr);
 
     if (ret)
         return ATL_STATUS_FAILURE;
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
index 1a03c6a43..b797f5c3c 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.cpp
@@ -16,23 +16,24 @@
 #include <unistd.h>
 
 #include "util/pm/pmi_resizable_rt/pmi_resizable/def.h"
-#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.h"
+#include "util/pm/pmi_resizable_rt/pmi_resizable/kvs_keeper.hpp"
 #include "pmi_resizable_simple.h"
 #include "util/pm/codec/pm_rt_codec.h"
 
-#define RESIZABLE_PMI_RT_KEY_FORMAT "%s-%zu"
-#define DEVICES_PER_THREAD          "DEVICES_PER_THREAD"
+#define RESIZABLE_PMI_RT_KEY_FORMAT "%s-%d"
+#define RANKS_PER_THREAD            "RANKS_PER_THREAD"
 #define PROCESS_THREAD_NAME         "PROCESS_THREAD_NAME"
-#define REQUESTED_RANK_TO_NAME      "REQUESTED_RANK_TO_NAME"
-#define GLOBAL_NAME_TO_RANK         "GLOBAL_NAME_TO_RANK"
-#define GLOBAL_RANK_TO_NAME         "GLOBAL_RANK_TO_NAME"
-#define LOCAL_KVS_ID                "LOCAL_KVS_ID"
 
-pmi_resizable_simple::pmi_resizable_simple(size_t size,
-                                           const std::vector<size_t>& ranks,
+#define REQUESTED_RANK_TO_NAME "REQUESTED_RANK_TO_NAME"
+#define GLOBAL_NAME_TO_RANK    "GLOBAL_NAME_TO_RANK"
+#define GLOBAL_RANK_TO_NAME    "GLOBAL_RANK_TO_NAME"
+#define LOCAL_KVS_ID           "LOCAL_KVS_ID"
+
+pmi_resizable_simple::pmi_resizable_simple(int size,
+                                           const std::vector<int>& ranks,
                                            std::shared_ptr<ikvs_wrapper> k,
                                            const char* main_addr)
-        : dev_count(size),
+        : total_rank_count(size),
           ranks(ranks),
           k(k) {
     max_keylen = MAX_KVS_KEY_LENGTH;
@@ -47,14 +48,14 @@ int pmi_resizable_simple::is_pm_resize_enabled() {
 atl_status_t pmi_resizable_simple::pmrt_init(const char* main_addr) {
     (void)main_addr;
     char* connection_timeout_str = getenv("CCL_KVS_GET_TIMEOUT");
-    if (connection_timeout_str)
-    {
+    if (connection_timeout_str) {
         connection_timeout = atoi(connection_timeout_str);
     }
     local_id = 0;
     val_storage = (char*)calloc(1, max_vallen);
     if (!val_storage)
         return ATL_STATUS_FAILURE;
+    /*TODO: add sort, ranks should increase continuously*/
     if (ranks[0] == 0) {
         size_t tmp_local_id = get_local_kvs_id();
         tmp_local_id++;
@@ -68,18 +69,18 @@ atl_status_t pmi_resizable_simple::pmrt_init(const char* main_addr) {
 }
 
 void pmi_resizable_simple::make_requested_info() {
-    register_my_first_rank_and_dev_count();
-    get_requested_thread_num_and_threads_count();
+    register_first_rank_idx_and_rank_count();
+    assign_thread_idx_and_fill_ranks_per_thread_map();
 
     local_id = get_local_kvs_id();
     register_my_proc_name();
-    get_my_proc_num_and_proc_count();
-    get_local_thread_num();
+    get_my_proc_idx_and_proc_count();
+    calculate_local_thread_idx();
     remove_initial_data();
-    pmrt_barrier();
+    pmrt_barrier_full();
 }
 
-atl_status_t pmi_resizable_simple::pmrt_main_addr_reserv(char* main_addr) {
+atl_status_t pmi_resizable_simple::pmrt_main_addr_reserve(char* main_addr) {
     return ATL_STATUS_UNSUPPORTED;
 }
 
@@ -99,9 +100,9 @@ void pmi_resizable_simple::pmrt_finalize() {
     is_finalized = true;
     free(val_storage);
 
-    if (getenv("CCL_PMI_FORCE_FINALIZE"))
-    {
-        printf("skip pmi_resizable_simple::pmrt_finalize\n"); fflush(stdout);
+    if (getenv("CCL_PMI_FORCE_FINALIZE")) {
+        printf("skip pmi_resizable_simple::pmrt_finalize\n");
+        fflush(stdout);
         return;
     }
 
@@ -120,7 +121,7 @@ void pmi_resizable_simple::pmrt_barrier() {
 
     SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num);
 
-    kvs_set_value(KVS_BARRIER, std::to_string(requested_rank_num).c_str(), barrier_num_str);
+    kvs_set_value(KVS_BARRIER, std::to_string(assigned_proc_idx).c_str(), barrier_num_str);
 
     min_barrier_num = get_barrier_idx();
     while (min_barrier_num != barrier_num) {
@@ -131,9 +132,44 @@ void pmi_resizable_simple::pmrt_barrier() {
     if (barrier_num > BARRIER_NUM_MAX)
         barrier_num = 0;
 }
+void pmi_resizable_simple::pmrt_barrier_full() {  // barrier keyed by assigned_thread_idx
+    size_t min_barrier_num;
+    char barrier_num_str[INT_STR_SIZE];
+
+    SET_STR(barrier_num_str, INT_STR_SIZE, SIZE_T_TEMPLATE, barrier_num_full);
+
+    kvs_set_value(KVS_BARRIER_FULL, std::to_string(assigned_thread_idx).c_str(), barrier_num_str);
+
+    min_barrier_num = get_barrier_full_idx();
+    while (min_barrier_num != barrier_num_full) {  // poll until the minimum published value is ours
+        min_barrier_num = get_barrier_full_idx();
+    }
+
+    barrier_num_full++;
+    if (barrier_num_full > BARRIER_NUM_MAX)  // wrap the counter once it passes the maximum
+        barrier_num_full = 0;
+}
+
+size_t pmi_resizable_simple::get_barrier_full_idx() {  // returns the smallest KVS_BARRIER_FULL value
+    size_t thread_count = ranks_per_thread_map.size();  // keys "0".."thread_count-1" are read below
+
+    kvs_get_value(KVS_BARRIER_FULL, std::to_string(0).c_str(), val_storage);
 
+    size_t min_barrier_idx = atoi(val_storage);  // seed the minimum with the value stored for key "0"
+    size_t barrier_idx;
+    for (size_t i = 1; i < thread_count; i++) {
+        kvs_get_value(KVS_BARRIER_FULL, std::to_string(i).c_str(), val_storage);
+
+        barrier_idx = atoi(val_storage);
+
+        if (min_barrier_idx > barrier_idx)
+            min_barrier_idx = barrier_idx;
+    }
+
+    return min_barrier_idx;
+}
 atl_status_t pmi_resizable_simple::pmrt_kvs_put(char* kvs_key,
-                                                size_t proc_idx,
+                                                int proc_idx,
                                                 const void* kvs_val,
                                                 size_t kvs_val_len) {
     int ret;
@@ -155,7 +191,7 @@ atl_status_t pmi_resizable_simple::pmrt_kvs_put(char* kvs_key,
 }
 
 atl_status_t pmi_resizable_simple::pmrt_kvs_get(char* kvs_key,
-                                                size_t proc_idx,
+                                                int proc_idx,
                                                 void* kvs_val,
                                                 size_t kvs_val_len) {
     int ret;
@@ -174,16 +210,16 @@ atl_status_t pmi_resizable_simple::pmrt_kvs_get(char* kvs_key,
     return ATL_STATUS_SUCCESS;
 }
 
-size_t pmi_resizable_simple::get_size() {
-    return threads_per_rank.size();
+int pmi_resizable_simple::get_size() {
+    return threads_per_proc.size();
 }
 
-size_t pmi_resizable_simple::get_rank() {
-    return requested_rank_num;
+int pmi_resizable_simple::get_rank() {
+    return assigned_proc_idx;
 }
 
-size_t pmi_resizable_simple::get_thread() {
-    return local_thread_num;
+size_t pmi_resizable_simple::get_local_thread_idx() {
+    return local_thread_idx;
 }
 
 int pmi_resizable_simple::kvs_set_value(const char* kvs_name, const char* key, const char* value) {
@@ -199,12 +235,15 @@ int pmi_resizable_simple::kvs_get_value(const char* kvs_name, const char* key, c
     size_t connection_time = 0;
     start_time = time(NULL);
     while (k->kvs_get_value_by_name_key(result_kvs_name.c_str(), key, value) == 0 &&
-        connection_time < connection_timeout) {
+           connection_time < connection_timeout) {
         connection_time = time(NULL) - start_time;
     }
     if (connection_time >= connection_timeout) {
         printf("KVS get error: timeout limit (%zu > %zu), prefix: %s, key %s\n",
-            connection_time, connection_timeout, result_kvs_name.c_str(), key);
+               connection_time,
+               connection_timeout,
+               result_kvs_name.c_str(),
+               key);
         exit(1);
     }
 
@@ -214,10 +253,9 @@ int pmi_resizable_simple::kvs_get_value(const char* kvs_name, const char* key, c
 int pmi_resizable_simple::kvs_iget_value(const char* kvs_name, const char* key, char* value) {
     std::string result_kvs_name = std::string(kvs_name) + std::to_string(local_id);
     return k->kvs_get_value_by_name_key(result_kvs_name.c_str(), key, value);
-    ;
 }
 size_t pmi_resizable_simple::get_barrier_idx() {
-    size_t proc_count = threads_per_rank.size();
+    size_t proc_count = threads_per_proc.size();
 
     kvs_get_value(KVS_BARRIER, std::to_string(0).c_str(), val_storage);
 
@@ -235,23 +273,23 @@ size_t pmi_resizable_simple::get_barrier_idx() {
     return min_barrier_idx;
 }
 
-void pmi_resizable_simple::register_my_first_rank_and_dev_count() {
+void pmi_resizable_simple::register_first_rank_idx_and_rank_count() {
     kvs_set_value(
-        DEVICES_PER_THREAD, std::to_string(ranks[0]).c_str(), std::to_string(ranks.size()).c_str());
+        RANKS_PER_THREAD, std::to_string(ranks[0]).c_str(), std::to_string(ranks.size()).c_str());
 }
 
-void pmi_resizable_simple::get_requested_thread_num_and_threads_count() {
-    size_t total_dev_count = 0;
-    size_t devises;
-    while (total_dev_count < dev_count) {
-        if (total_dev_count == ranks[0]) {
-            requested_thread_num = devises_per_thread.size();
+void pmi_resizable_simple::assign_thread_idx_and_fill_ranks_per_thread_map() {
+    int rank_count = 0;
+    int ranks_per_thread;
+    while (rank_count < total_rank_count) {
+        if (rank_count == ranks[0]) {
+            assigned_thread_idx = ranks_per_thread_map.size();
         }
-        kvs_get_value(DEVICES_PER_THREAD, std::to_string(total_dev_count).c_str(), val_storage);
+        kvs_get_value(RANKS_PER_THREAD, std::to_string(rank_count).c_str(), val_storage);
 
-        devises = atoi(val_storage);
-        devises_per_thread.push_back(devises);
-        total_dev_count += devises;
+        ranks_per_thread = atoi(val_storage);
+        ranks_per_thread_map.push_back(ranks_per_thread);
+        rank_count += ranks_per_thread;
     }
 }
 
@@ -266,46 +304,45 @@ void pmi_resizable_simple::register_my_proc_name() {
     }
     my_proccess_name = std::string(hostname) + std::to_string(my_pid);
 
-    kvs_set_value(PROCESS_THREAD_NAME,
-                  std::to_string(requested_thread_num).c_str(),
-                  my_proccess_name.c_str());
+    kvs_set_value(
+        PROCESS_THREAD_NAME, std::to_string(assigned_thread_idx).c_str(), my_proccess_name.c_str());
 }
 
-void pmi_resizable_simple::get_my_proc_num_and_proc_count() {
-    std::map<std::string, size_t> proc_name_to_rank;
-    std::map<std::string, size_t>::iterator it;
-    size_t rank;
-    for (size_t i = 0; i < devises_per_thread.size(); i++) {
+void pmi_resizable_simple::get_my_proc_idx_and_proc_count() {
+    std::map<std::string, int> proc_name_to_rank;
+    std::map<std::string, int>::iterator it;
+    int rank;
+    for (size_t i = 0; i < ranks_per_thread_map.size(); i++) {
         kvs_get_value(PROCESS_THREAD_NAME, std::to_string(i).c_str(), val_storage);
 
         it = proc_name_to_rank.find(val_storage);
         if (it == proc_name_to_rank.end()) {
-            rank = threads_per_rank.size();
+            rank = threads_per_proc.size();
             if (!my_proccess_name.compare(val_storage)) {
-                requested_rank_num = rank;
-                if (requested_thread_num == i) {
+                assigned_proc_idx = rank;
+                if (assigned_thread_idx == i) {
                     kvs_set_value(REQUESTED_RANK_TO_NAME,
-                                  std::to_string(requested_rank_num).c_str(),
+                                  std::to_string(assigned_proc_idx).c_str(),
                                   my_proccess_name.c_str());
                 }
             }
             proc_name_to_rank[val_storage] = rank;
-            threads_per_rank[rank].push_back(i);
+            threads_per_proc[rank].push_back(i);
         }
         else {
-            threads_per_rank[it->second].push_back(i);
+            threads_per_proc[it->second].push_back(i);
         }
     }
 }
 
-void pmi_resizable_simple::get_local_thread_num() {
-    local_thread_num = 0;
-    for (auto it = threads_per_rank[requested_rank_num].begin();
-         it != threads_per_rank[requested_rank_num].end();
+void pmi_resizable_simple::calculate_local_thread_idx() {
+    local_thread_idx = 0;
+    for (auto it = threads_per_proc[assigned_proc_idx].begin();
+         it != threads_per_proc[assigned_proc_idx].end();
          it++) {
-        if (requested_thread_num == *it)
+        if (assigned_thread_idx == *it)
             break;
-        local_thread_num++;
+        local_thread_idx++;
     }
 }
 
@@ -314,7 +351,7 @@ void pmi_resizable_simple::make_map_requested2global() {
     char process_name[MAX_KVS_VAL_LENGTH];
     size_t size = get_size();
     requested2global.resize(size);
-    pmrt_barrier();
+    pmrt_barrier_full();
     for (size_t i = 0; i < size; i++) {
         kvs_get_value(REQUESTED_RANK_TO_NAME, std::to_string(i).c_str(), process_name);
         if (kvs_iget_value(GLOBAL_NAME_TO_RANK, process_name, global_rank_str) == 0) {
@@ -336,7 +373,7 @@ void pmi_resizable_simple::make_map_requested2global() {
         }
         requested2global[i] = atoi(global_rank_str);
     }
-    pmrt_barrier();
+    pmrt_barrier_full();
 }
 
 size_t pmi_resizable_simple::get_local_kvs_id() {
@@ -357,7 +394,7 @@ pmi_resizable_simple::~pmi_resizable_simple() {
         pmrt_finalize();
 }
 void pmi_resizable_simple::remove_initial_data() {
-    std::string result_kvs_name = std::string(DEVICES_PER_THREAD) + std::to_string(0);
+    std::string result_kvs_name = std::string(RANKS_PER_THREAD) + std::to_string(0);
     remove_val(result_kvs_name.c_str(), std::to_string(ranks[0]).c_str(), ST_CLIENT);
     k->kvs_remove_name_key(result_kvs_name.c_str(), std::to_string(ranks[0]).c_str());
 }
diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h
index 9f3544198..86d0a2f04 100644
--- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h
+++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h
@@ -41,8 +41,8 @@
 class pmi_resizable_simple final : public ipmi {
 public:
     pmi_resizable_simple() = delete;
-    pmi_resizable_simple(size_t dev_count,
-                         const std::vector<size_t>& ranks,
+    pmi_resizable_simple(int total_rank_count,
+                         const std::vector<int>& ranks,
                          std::shared_ptr<ikvs_wrapper> k,
                          const char* main_addr = nullptr);
 
@@ -50,7 +50,7 @@ class pmi_resizable_simple final : public ipmi {
 
     int is_pm_resize_enabled() override;
 
-    atl_status_t pmrt_main_addr_reserv(char* main_addr) override;
+    atl_status_t pmrt_main_addr_reserve(char* main_addr) override;
 
     atl_status_t pmrt_set_resize_function(atl_resize_fn_t resize_fn) override;
 
@@ -61,68 +61,77 @@ class pmi_resizable_simple final : public ipmi {
     void pmrt_barrier() override;
 
     atl_status_t pmrt_kvs_put(char* kvs_key,
-                              size_t proc_idx,
+                              int proc_idx,
                               const void* kvs_val,
                               size_t kvs_val_len) override;
 
     atl_status_t pmrt_kvs_get(char* kvs_key,
-                              size_t proc_idx,
+                              int proc_idx,
                               void* kvs_val,
                               size_t kvs_val_len) override;
 
-    size_t get_size() override;
+    int get_size() override;
 
-    size_t get_rank() override;
+    int get_rank() override;
 
-    size_t get_thread() override;
+    size_t get_local_thread_idx() override;
 
     size_t get_local_kvs_id() override;
 
     void set_local_kvs_id(size_t local_kvs_id) override;
 
-    size_t get_threads_count() override {
-        return threads_per_rank[requested_rank_num].size();
+    size_t get_threads_per_process() override {
+        return threads_per_proc[assigned_proc_idx].size();
     }
 
-    size_t get_devices_per_rank_count() override {
+    size_t get_ranks_per_process() override {
         size_t res = 0;
-        std::list<size_t>& threads = threads_per_rank[requested_rank_num];
-        for (auto it = threads.begin(); it != threads.end(); it++) {
-            res += devises_per_thread[*it];
+        std::list<size_t>& thread_idxs = threads_per_proc[assigned_proc_idx];
+        for (auto it = thread_idxs.begin(); it != thread_idxs.end(); it++) {
+            res += ranks_per_thread_map[*it];
         }
         return res;
     }
+
     void pmrt_finalize() override;
 
 private:
     bool is_finalized{ false };
     atl_status_t pmrt_init(const char* main_addr = nullptr);
+
     int kvs_set_value(const char* kvs_name, const char* key, const char* value);
     int kvs_get_value(const char* kvs_name, const char* key, char* value);
     int kvs_iget_value(const char* kvs_name, const char* key, char* value);
+
     size_t get_barrier_idx();
-    void register_my_first_rank_and_dev_count();
-    void get_requested_thread_num_and_threads_count();
+    size_t get_barrier_full_idx();
+
+    void calculate_local_thread_idx();
+    void register_first_rank_idx_and_rank_count();
+    void assign_thread_idx_and_fill_ranks_per_thread_map();
     void register_my_proc_name();
-    void get_my_proc_num_and_proc_count();
-    void get_local_thread_num();
+    void get_my_proc_idx_and_proc_count();
     void make_requested_info();
     void remove_initial_data();
     void make_map_requested2global();
-    size_t dev_count;
-    size_t requested_rank_num;
-    size_t requested_thread_num;
-    size_t local_thread_num;
+    void pmrt_barrier_full();
+
+    int total_rank_count;
+    int assigned_proc_idx;
+
+    size_t assigned_thread_idx;
+    size_t local_thread_idx;
     std::string my_proccess_name;
-    std::vector<size_t> ranks;
-    std::vector<size_t> devises_per_thread;
-    std::map<size_t, std::list<size_t>> threads_per_rank;
+    std::vector<int> ranks;
+    std::vector<size_t> ranks_per_thread_map;
+    std::map<size_t, std::list<size_t>> threads_per_proc;
     std::shared_ptr<ikvs_wrapper> k;
     size_t max_keylen;
     size_t max_vallen;
     char* val_storage = nullptr;
     size_t barrier_num = 0;
-    std::vector<size_t> requested2global;
+    size_t barrier_num_full = 0;
+    std::vector<int> requested2global;
     size_t local_id;
-    size_t connection_timeout = 120;
+    size_t connection_timeout = 120; /* in seconds */
 };
diff --git a/src/atl/util/pm/pmi_rt/pmi/CMakeLists.txt b/src/atl/util/pm/pmi_rt/pmi/CMakeLists.txt
index 2880ae331..5917c927f 100755
--- a/src/atl/util/pm/pmi_rt/pmi/CMakeLists.txt
+++ b/src/atl/util/pm/pmi_rt/pmi/CMakeLists.txt
@@ -13,35 +13,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-#builds pmi
-
-set(PMI_SRC
-    simple_pmiutil.c
-    simple_pmi.c)
-
-set(COMMON_PMI_INC_DIRS
-    ${PROJECT_SOURCE_DIR}/src/atl/util/pm/pmi_rt/pmi)
-
-#special library that holds objects only
-add_library(pmi-objects OBJECT ${PMI_SRC})
-set_target_properties(pmi-objects PROPERTIES POSITION_INDEPENDENT_CODE 1)
-target_include_directories(pmi-objects PUBLIC ${COMMON_PMI_INC_DIRS})
-target_compile_definitions(pmi-objects PRIVATE HAVE_UNISTD_H HAVE_STDLIB_H HAVE_STRING_H HAVE_STRINGS_H)
-
-#shared lib
-add_library(pmi SHARED $<TARGET_OBJECTS:pmi-objects>)
-target_include_directories(pmi PUBLIC INTERFACE ${COMMON_PMI_INC_DIRS})
-if (NOT LIB_PMI_SO_VERSION AND NOT LIB_PMI_MAJOR_VERSION)
-        set_target_properties(pmi PROPERTIES VERSION 1 SOVERSION 1.0)
-else()
-        set_target_properties(pmi PROPERTIES VERSION ${LIB_PMI_SO_VERSION} SOVERSION ${LIB_PMI_MAJOR_VERSION})
-endif()
-set_target_properties(pmi PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
-
-install(TARGETS pmi LIBRARY DESTINATION ${CCL_INSTALL_LIB})
-
-#static lib
-add_library(pmi-static STATIC $<TARGET_OBJECTS:pmi-objects>)
-set_target_properties(pmi-static PROPERTIES OUTPUT_NAME pmi)
-set_target_properties(pmi-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
-install(TARGETS pmi-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB})
+#builds pmi
+
+set(PMI_SRC
+    simple_pmiutil.c
+    simple_pmi.c)
+
+set(COMMON_PMI_INC_DIRS
+    ${PROJECT_SOURCE_DIR}/src/atl/util/pm/pmi_rt/pmi)
+
+#special library that holds objects only
+add_library(pmi-objects OBJECT ${PMI_SRC})
+set_target_properties(pmi-objects PROPERTIES POSITION_INDEPENDENT_CODE 1)
+target_include_directories(pmi-objects PUBLIC ${COMMON_PMI_INC_DIRS})
+target_compile_definitions(pmi-objects PRIVATE HAVE_UNISTD_H HAVE_STDLIB_H HAVE_STRING_H HAVE_STRINGS_H)
+
+#shared lib
+add_library(pmi SHARED $<TARGET_OBJECTS:pmi-objects>)
+target_include_directories(pmi PUBLIC INTERFACE ${COMMON_PMI_INC_DIRS})
+if (NOT LIB_PMI_SO_VERSION AND NOT LIB_PMI_MAJOR_VERSION)
+        set_target_properties(pmi PROPERTIES VERSION 1 SOVERSION 1.0)
+else()
+        set_target_properties(pmi PROPERTIES VERSION ${LIB_PMI_SO_VERSION} SOVERSION ${LIB_PMI_MAJOR_VERSION})
+endif()
+set_target_properties(pmi PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
+
+install(TARGETS pmi LIBRARY DESTINATION ${CCL_INSTALL_LIB})
+
+#static lib
+add_library(pmi-static STATIC $<TARGET_OBJECTS:pmi-objects>)
+set_target_properties(pmi-static PROPERTIES OUTPUT_NAME pmi)
+set_target_properties(pmi-static PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CCL_BUILD_DIR})
+install(TARGETS pmi-static ARCHIVE DESTINATION ${CCL_INSTALL_LIB})
diff --git a/src/atl/util/pm/pmi_rt/pmi/simple_pmiutil.c b/src/atl/util/pm/pmi_rt/pmi/simple_pmiutil.c
index 102d301d8..59702c88f 100644
--- a/src/atl/util/pm/pmi_rt/pmi/simple_pmiutil.c
+++ b/src/atl/util/pm/pmi_rt/pmi/simple_pmiutil.c
@@ -94,9 +94,17 @@ void PMIU_printf(int print_flag, const char *fmt, ...) {
             if (p) {
                 MPL_snprintf(filename, sizeof(filename), "testclient-%s.out", p);
                 logfile = fopen(filename, "w");
+                if (logfile == NULL) {
+                    printf("Error opening file %s \n", strerror(errno));
+                    return;
+                }
             }
             else {
                 logfile = fopen("testserver.out", "w");
+                if (logfile == NULL) {
+                    printf("Error opening file %s \n", strerror(errno));
+                    return;
+                }
             }
         }
         else
diff --git a/src/atl/util/pm/pmi_rt/pmi_rt.c b/src/atl/util/pm/pmi_rt/pmi_rt.c
index 3d8dd52e9..836cb95cc 100644
--- a/src/atl/util/pm/pmi_rt/pmi_rt.c
+++ b/src/atl/util/pm/pmi_rt/pmi_rt.c
@@ -21,7 +21,7 @@
 
 #include "util/pm/pm_rt.h"
 
-#define PMI_RT_KEY_FORMAT "%s-%zu"
+#define PMI_RT_KEY_FORMAT "%s-%d"
 
 typedef struct pmi_pm_rt_context {
     pm_rt_desc_t pmrt_desc;
@@ -58,7 +58,7 @@ static void pmirt_finalize(pm_rt_desc_t *pmrt_desc) {
 
 static atl_status_t pmirt_kvs_put(pm_rt_desc_t *pmrt_desc,
                                   char *kvs_key,
-                                  size_t proc_idx,
+                                  int proc_idx,
                                   const void *kvs_val,
                                   size_t kvs_val_len) {
     int ret;
@@ -96,7 +96,7 @@ static atl_status_t pmirt_kvs_put(pm_rt_desc_t *pmrt_desc,
 
 static atl_status_t pmirt_kvs_get(pm_rt_desc_t *pmrt_desc,
                                   char *kvs_key,
-                                  size_t proc_idx,
+                                  int proc_idx,
                                   void *kvs_val,
                                   size_t kvs_val_len) {
     int ret;
@@ -137,7 +137,7 @@ static void pmirt_barrier(pm_rt_desc_t *pmrt_desc) {
     (void)PMI_Barrier();
 }
 
-atl_status_t pmirt_update(size_t *proc_idx, size_t *proc_count) {
-    PMI_Get_size((int *)proc_idx);
-    PMI_Get_rank((int *)proc_count);
+atl_status_t pmirt_update(int *proc_idx, int *proc_count) {
+    PMI_Get_rank(proc_idx); /* rank of this process (was swapped with the size query) */
+    PMI_Get_size(proc_count); /* number of processes in the job */
     return ATL_STATUS_SUCCESS;
@@ -159,7 +159,7 @@ pm_rt_kvs_ops_t kvs_ops = {
     .get = pmirt_kvs_get,
 };
 
-atl_status_t pmirt_init(size_t *proc_idx, size_t *proc_count, pm_rt_desc_t **pmrt_desc) {
+atl_status_t pmirt_init(int *proc_idx, int *proc_count, pm_rt_desc_t **pmrt_desc) {
     int ret, spawned, max_kvsnamelen;
     int proc_idx_tmp, proc_count_tmp;
 
diff --git a/src/atl/util/pm/pmi_rt/pmi_simple.cpp b/src/atl/util/pm/pmi_rt/pmi_simple.cpp
index 9ac01602c..6e14b8529 100644
--- a/src/atl/util/pm/pmi_rt/pmi_simple.cpp
+++ b/src/atl/util/pm/pmi_rt/pmi_simple.cpp
@@ -24,7 +24,7 @@ pmi_simple::pmi_simple() {
     pmirt_init(&rank, &size, &pmrt_desc);
 }
 
-atl_status_t pmi_simple::pmrt_main_addr_reserv(char *main_addr) {
-    printf("Function main_addr_reserv unsupported yet for simple pmi\n");
+atl_status_t pmi_simple::pmrt_main_addr_reserve(char *main_addr) {
+    printf("Function main_addr_reserve unsupported yet for simple pmi\n");
     return ATL_STATUS_FAILURE;
 }
@@ -54,28 +54,28 @@ void pmi_simple::pmrt_barrier() {
 }
 
 atl_status_t pmi_simple::pmrt_kvs_put(char *kvs_key,
-                                      size_t proc_idx,
+                                      int proc_idx,
                                       const void *kvs_val,
                                       size_t kvs_val_len) {
     return pmirt_kvs_put(pmrt_desc, kvs_key, proc_idx, kvs_val, kvs_val_len);
 }
 
 atl_status_t pmi_simple::pmrt_kvs_get(char *kvs_key,
-                                      size_t proc_idx,
+                                      int proc_idx,
                                       void *kvs_val,
                                       size_t kvs_val_len) {
     return pmirt_kvs_get(pmrt_desc, kvs_key, proc_idx, kvs_val, kvs_val_len);
 }
 
-size_t pmi_simple::get_rank() {
+int pmi_simple::get_rank() {
     return rank;
 }
 
-size_t pmi_simple::get_size() {
+int pmi_simple::get_size() {
     return size;
 }
 
-size_t pmi_simple::get_thread() {
+size_t pmi_simple::get_local_thread_idx() {
     return 0;
 }
 size_t pmi_simple::get_local_kvs_id() {
diff --git a/src/atl/util/pm/pmi_rt/pmi_simple.h b/src/atl/util/pm/pmi_rt/pmi_simple.h
index 87d50d40a..27d8b0571 100644
--- a/src/atl/util/pm/pmi_rt/pmi_simple.h
+++ b/src/atl/util/pm/pmi_rt/pmi_simple.h
@@ -23,7 +23,7 @@ class pmi_simple final : public ipmi {
 
     int is_pm_resize_enabled() override;
 
-    atl_status_t pmrt_main_addr_reserv(char *main_addr) override;
+    atl_status_t pmrt_main_addr_reserve(char *main_addr) override;
 
     atl_status_t pmrt_set_resize_function(atl_resize_fn_t resize_fn) override;
 
@@ -36,36 +36,36 @@ class pmi_simple final : public ipmi {
     void pmrt_barrier() override;
 
     atl_status_t pmrt_kvs_put(char *kvs_key,
-                              size_t proc_idx,
+                              int proc_idx,
                               const void *kvs_val,
                               size_t kvs_val_len) override;
 
     atl_status_t pmrt_kvs_get(char *kvs_key,
-                              size_t proc_idx,
+                              int proc_idx,
                               void *kvs_val,
                               size_t kvs_val_len) override;
 
-    size_t get_rank() override;
+    int get_rank() override;
 
-    size_t get_size() override;
+    int get_size() override;
 
-    size_t get_thread() override;
+    size_t get_local_thread_idx() override;
 
     size_t get_local_kvs_id() override;
 
     void set_local_kvs_id(size_t local_kvs_id) override;
 
-    size_t get_threads_count() override {
+    size_t get_threads_per_process() override {
         return 1;
     }
 
-    size_t get_devices_per_rank_count() override {
+    size_t get_ranks_per_process() override {
         return 1;
     }
 
 private:
-    size_t rank;
-    size_t size;
+    int rank;
+    int size;
     pm_rt_desc_t *pmrt_desc = nullptr;
     bool is_finalized{ false };
 };
diff --git a/src/ccl.cpp b/src/ccl.cpp
deleted file mode 100644
index 8bf262359..000000000
--- a/src/ccl.cpp
+++ /dev/null
@@ -1,475 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/global/global.hpp"
-#include "common/stream/stream.hpp"
-#include "exec/exec.hpp"
-
-// ccl_status_t ccl_set_resize_fn(ccl_resize_fn_t callback)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         return ccl::global_data::get().executor->create_listener(callback);
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t ccl_init()
-// {
-//     try
-//     {
-//         ccl::global_data::get().init();
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t ccl_finalize()
-// {
-//     try
-//     {
-//         ccl::global_data::get().reset();
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_get_version(ccl::library_version* version)
-// {
-//     if (!version)
-//     {
-//         return ccl_status_invalid_arguments;
-//     }
-
-//     version->major = CCL_MAJOR_VERSION;
-//     version->minor = CCL_MINOR_VERSION;
-//     version->update = CCL_UPDATE_VERSION;
-//     version->product_status = CCL_PRODUCT_STATUS;
-//     version->build_date = CCL_PRODUCT_BUILD_DATE;
-//     version->full = CCL_PRODUCT_FULL;
-
-//     return ccl_status_success;
-// }
-
-// ccl_status_t CCL_API ccl_wait(ccl_request_t req)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             LOG_ERROR("empty request");
-//             return ccl_status_success;
-//         }
-
-//         auto request = static_cast<ccl_request*>(req);
-//         ccl_wait_impl(ccl::global_data::get().executor.get(), request);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_test(ccl_request_t req, int* is_completed)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             LOG_ERROR("empty request");
-//             if (is_completed)
-//             {
-//                 *is_completed = 1;
-//             }
-//             return ccl_status_success;
-//         }
-
-//         auto request = static_cast<ccl_request*>(req);
-//         auto completed = ccl_test_impl(ccl::global_data::get().executor.get(), request);
-//         *is_completed = static_cast<int>(completed);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t ccl_comm_create(ccl_comm_t* comm, const ccl_comm_attr_t* attr)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     CCL_ASSERT(comm);
-//     try
-//     {
-//         ccl::global_data& data = ccl::global_data::get();
-//         ccl_comm* comm_ptr = nullptr;
-
-//         if (!attr)
-//         {
-//             LOG_DEBUG("create communicator as copy of global communicator");
-//             comm_ptr = new ccl_comm(data.comm->rank(),
-//                                     data.comm->size(),
-//                                     data.comm_ids->acquire(),
-//                                     ccl::global_data::get().atl);
-//         }
-//         else
-//         {
-//             LOG_DEBUG("create communicator with coll_attr");
-//             comm_ptr = ccl_comm::create_with_color(attr->color,
-//                                                    data.comm_ids.get(),
-//                                                    data.comm.get());
-//         }
-
-//         *comm = static_cast<void*>(comm_ptr);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t ccl_comm_free(ccl_comm_t comm)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     CCL_ASSERT(comm);
-//     LOG_DEBUG("free communicator ", comm);
-//     try
-//     {
-//         delete static_cast<ccl_comm*>(comm);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_get_comm_rank(ccl_comm_t comm, size_t* rank)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     if (!rank)
-//         return ccl_status_invalid_arguments;
-
-//     try
-//     {
-//         auto comm_ptr = (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get();
-//         *rank = comm_ptr->rank();
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_get_comm_size(ccl_comm_t comm, size_t* size)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     if (!size)
-//         return ccl_status_invalid_arguments;
-
-//     try
-//     {
-//         auto comm_ptr = (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get();
-//         *size = comm_ptr->size();
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t ccl_datatype_create(ccl_datatype_t* dtype, const ccl_datatype_attr_t* attr)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     CCL_ASSERT(dtype);
-//     LOG_DEBUG("create datatype");
-//     try
-//     {
-//         *dtype = ccl::global_data::get().dtypes->create(attr);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_get_datatype_size(ccl_datatype_t dtype, size_t* size)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     if (!size)
-//         return ccl_status_invalid_arguments;
-
-//     try
-//     {
-//         *size = ccl::global_data::get().dtypes->get(dtype).size();
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_datatype_free(ccl_datatype_t dtype)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     LOG_DEBUG("free datatype ", dtype);
-//     try
-//     {
-//         ccl::global_data::get().dtypes->free(dtype);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t ccl_stream_create(ccl_stream_type_t type,
-//                                void* native_stream,
-//                                ccl_stream_t* stream)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     CCL_ASSERT(stream);
-//     try
-//     {
-// //TODO
-// #if 0
-//         LOG_DEBUG("create stream by type: ", type);
-// #ifdef MULTI_GPU_SUPPORT
-//     #ifdef CCL_ENABLE_SYCL
-//             *stream = static_cast<void*>(stream_provider_dispatcher::create(*static_cast<cl::sycl::queue*>(native_stream)).release());
-//     #else
-//             *stream = static_cast<void*>(stream_provider_dispatcher::create(*static_cast<ze_command_queue_handle_t*>(native_stream)).release());
-//     #endif
-// #else
-//     #ifdef CCL_ENABLE_SYCL
-//         if( type != ccl_stream_host)
-//         {
-//             *stream = static_cast<void*>(stream_provider_dispatcher::create(*static_cast<cl::sycl::queue*>(native_stream)).release());
-//         }
-//         else
-//     #endif
-//         {
-//             *stream = static_cast<void*>(stream_provider_dispatcher::create(native_stream).release());
-//         }
-
-//         //for legacy stream: override type for 'host' related queue
-//         static_cast<ccl_stream*>(*stream)->type = type;
-// #endif
-// #endif
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t ccl_stream_free(ccl_stream_t stream)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     CCL_ASSERT(stream);
-//     LOG_DEBUG("free stream ", stream);
-//     try
-//     {
-//         delete static_cast<const ccl_stream*>(stream);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_allgatherv(
-//     const void* send_buf,
-//     size_t send_count,
-//     void* recv_buf,
-//     const size_t* recv_counts,
-//     ccl_datatype_t dtype,
-//     const ccl_coll_attr_t* attr,
-//     ccl_comm_t comm,
-//     ccl_stream_t stream,
-//     ccl_request_t* req)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             return ccl_status_invalid_arguments;
-//         }
-//         auto request = ccl_allgatherv_impl(send_buf, send_count, recv_buf, recv_counts, dtype, attr,
-//                                            (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get(),
-//                                            static_cast<const ccl_stream*>(stream));
-//         *req = static_cast<ccl_request_t>(request);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_allreduce(
-//     const void* send_buf,
-//     void* recv_buf,
-//     size_t count,
-//     ccl_datatype_t dtype,
-//     ccl_reduction_t reduction,
-//     const ccl_coll_attr_t* attr,
-//     ccl_comm_t comm,
-//     ccl_stream_t stream,
-//     ccl_request_t* req)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             return ccl_status_invalid_arguments;
-//         }
-//         auto request = ccl_allreduce_impl(send_buf, recv_buf, count, dtype, static_cast<ccl::reduction>(reduction), attr,
-//                                           (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get(),
-//                                           static_cast<const ccl_stream*>(stream));
-//         *req = static_cast<ccl_request_t>(request);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_alltoall(
-//     const void* send_buf,
-//     void* recv_buf,
-//     size_t count,
-//     ccl_datatype_t dtype,
-//     const ccl_coll_attr_t* attr,
-//     ccl_comm_t comm,
-//     ccl_stream_t stream,
-//     ccl_request_t* req)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             return ccl_status_invalid_arguments;
-//         }
-//         auto request = ccl_alltoall_impl(send_buf, recv_buf, count, dtype, attr,
-//                                          (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get(),
-//                                          static_cast<const ccl_stream*>(stream));
-//         *req = static_cast<ccl_request_t>(request);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_alltoallv(
-//     const void* send_buf,
-//     const size_t* send_counts,
-//     void* recv_buf,
-//     const size_t* recv_counts,
-//     ccl_datatype_t dtype,
-//     const ccl_coll_attr_t* attr,
-//     ccl_comm_t comm,
-//     ccl_stream_t stream,
-//     ccl_request_t* req)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             return ccl_status_invalid_arguments;
-//         }
-//         auto request = ccl_alltoallv_impl(send_buf, send_counts, recv_buf, recv_counts, dtype, attr,
-//                                           (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get(),
-//                                           static_cast<const ccl_stream*>(stream));
-//         *req = static_cast<ccl_request_t>(request);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_barrier(ccl_comm_t comm, ccl_stream_t stream)
-// {
-//     try
-//     {
-//         ccl_barrier_impl((comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get(),
-//                          static_cast<const ccl_stream*>(stream));
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_bcast(
-//     void* buf,
-//     size_t count,
-//     ccl_datatype_t dtype,
-//     size_t root,
-//     const ccl_coll_attr_t* attr,
-//     ccl_comm_t comm,
-//     ccl_stream_t stream,
-//     ccl_request_t* req)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             return ccl_status_invalid_arguments;
-//         }
-//         auto request = ccl_broadcast_impl(buf, count, dtype, root, attr,
-//                                       (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get(),
-//                                       static_cast<const ccl_stream*>(stream));
-//         *req = static_cast<ccl_request_t>(request);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_reduce(
-//     const void* send_buf,
-//     void* recv_buf,
-//     size_t count,
-//     ccl_datatype_t dtype,
-//     ccl_reduction_t reduction,
-//     size_t root,
-//     const ccl_coll_attr_t* attr,
-//     ccl_comm_t comm,
-//     ccl_stream_t stream,
-//     ccl_request_t* req)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             return ccl_status_invalid_arguments;
-//         }
-//         auto request = ccl_reduce_impl(send_buf, recv_buf, count, dtype, static_cast<ccl::reduction>(reduction), root, attr,
-//                                        (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get(),
-//                                        static_cast<const ccl_stream*>(stream));
-//         *req = static_cast<ccl_request_t>(request);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
-
-// ccl_status_t CCL_API ccl_sparse_allreduce(const void* send_ind_buf, size_t send_ind_count,
-//                                           const void* send_val_buf, size_t send_val_count,
-//                                           void* recv_ind_buf, size_t recv_ind_count,
-//                                           void* recv_val_buf, size_t recv_val_count,
-//                                           ccl_datatype_t index_dtype,
-//                                           ccl_datatype_t value_dtype,
-//                                           ccl_reduction_t reduction,
-//                                           const ccl_coll_attr_t* attr,
-//                                           ccl_comm_t comm,
-//                                           ccl_stream_t stream,
-//                                           ccl_request_t* req)
-// {
-//     CCL_CHECK_IS_BLOCKED();
-//     try
-//     {
-//         if (!req)
-//         {
-//             return ccl_status_invalid_arguments;
-//         }
-//         auto request = ccl_sparse_allreduce_impl(send_ind_buf, send_ind_count,
-//                                                  send_val_buf, send_val_count,
-//                                                  recv_ind_buf, recv_ind_count,
-//                                                  recv_val_buf, recv_val_count,
-//                                                  index_dtype, value_dtype,
-//                                                  static_cast<ccl::reduction>(reduction), attr,
-//                                                  (comm) ? static_cast<ccl_comm*>(comm) : ccl::global_data::get().comm.get(),
-//                                                  static_cast<const ccl_stream*>(stream));
-//         *req = static_cast<ccl_request_t>(request);
-//         return ccl_status_success;
-//     }
-//     COMMON_CATCH_BLOCK();
-// }
diff --git a/src/ccl_api_functions.cpp b/src/ccl_api_functions.cpp
index 718084907..16e97688f 100644
--- a/src/ccl_api_functions.cpp
+++ b/src/ccl_api_functions.cpp
@@ -13,1243 +13,1316 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_environment.hpp"
-#include "oneapi/ccl/ccl_api_functions.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-
-#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-#include "common/comm/comm_interface.hpp"
-#endif //#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-
-#include "ccl_api_functions_generators.hpp"
-#include "common/global/global.hpp"
-#include "ccl_gpu_module.hpp"
-
-namespace ccl {
-
-/**
- * A structure that is a friend of the passed object
- * and which allows access to the internal representation of this object
- */
-struct impl_dispatch {
-    template <class Object>
-    const typename Object::impl_value_t& operator()(const Object& obj) {
-        return obj.get_impl();
-    }
-};
-
-#ifdef MULTI_GPU_SUPPORT
-/* register a gpu module */
-void register_gpu_module(std::string kernel_dir_path)
-{
-    // allgatherv
-    if (!kernel_dir_path.empty())
-    {
-        if(*kernel_dir_path.rbegin() != '/')
-        {
-            kernel_dir_path += '/';
-        }
-    }
-    LOG_INFO("SPV Kernels found directory: ", kernel_dir_path);
-    std::string kernel_path = kernel_dir_path + "ring_allgatherv.spv";
-    register_gpu_module_source(kernel_path.c_str(),
-                                ccl::device_topology_type::ring,
-                                ccl_coll_allgatherv);
-    // register__gpu_module_source("kernels/a2a_allgatherv.spv",
-    //                             ccl::device_topology_type::a2a,
-    //                             ccl_coll_allgatherv);
-    // alltoallv
-    kernel_path = kernel_dir_path + "ring_alltoallv.spv";
-    register_gpu_module_source(kernel_path.c_str(),
-                               ccl::device_topology_type::ring,
-                               ccl_coll_alltoallv);
-    // register_gpu_module_source("kernels/a2a_alltoallv.spv",
-    //                             ccl::device_topology_type::a2a,
-    //                             ccl_coll_alltoallv);
-    // allreduce
-    kernel_path = kernel_dir_path + "ring_allreduce.spv";
-    register_gpu_module_source(kernel_path.c_str(),
-                                ccl::device_topology_type::ring,
-                                ccl_coll_allreduce);
-    kernel_path = kernel_dir_path + "a2a_allreduce.spv";
-    register_gpu_module_source(kernel_path.c_str(),
-                                ccl::device_topology_type::a2a,
-                                ccl_coll_allreduce);
-    // bcast
-    kernel_path = kernel_dir_path + "ring_bcast.spv";
-    register_gpu_module_source(kernel_path.c_str(),
-                               ccl::device_topology_type::ring,
-                               ccl_coll_bcast);
-    kernel_path = kernel_dir_path + "a2a_bcast.spv";
-    register_gpu_module_source(kernel_path.c_str(),
-                               ccl::device_topology_type::a2a,
-                               ccl_coll_bcast);
-    // reduce
-    kernel_path = kernel_dir_path + "ring_reduce.spv";
-    register_gpu_module_source(kernel_path.c_str(),
-                                ccl::device_topology_type::ring,
-                                ccl_coll_reduce);
-    // register_gpu_module_source("kernels/a2a_reduce.spv",
-    //                            ccl_topology_class_t::a2a_algo_class,
-    //                            ccl_coll_reduce);
-}
-#endif //MULTI_GPU_SUPPORT
-
-void CCL_API init() {
-    auto& env = environment::instance();
-    (void)env;
-#ifdef MULTI_GPU_SUPPORT
-    const auto& env_object = ccl::global_data::env();
-
-    //WA
-    if (!env_object.kernel_path.empty())
-    {
-        register_gpu_module(env_object.kernel_path);
-    }
-#endif //MULTI_GPU_SUPPORT
-}
-
-/******************** ENVIRONMENT ********************/
-
-library_version CCL_API get_library_version() {
-    return environment::instance().get_library_version();
-}
-
-/* datatype */
-datatype CCL_API register_datatype(const datatype_attr& attr) {
-    return environment::instance().register_datatype(attr);
-}
-
-void CCL_API deregister_datatype(datatype dtype) {
-    return environment::instance().deregister_datatype(dtype);
-}
-
-size_t CCL_API get_datatype_size(datatype dtype) {
-    return environment::instance().get_datatype_size(dtype);
-}
-
-/* KVS */
-shared_ptr_class<kvs> CCL_API create_main_kvs() {
-    return environment::instance().create_main_kvs();
-}
-
-shared_ptr_class<kvs> CCL_API create_kvs(const kvs::address_type& addr) {
-    return environment::instance().create_kvs(addr);
-}
-
-/* device */
-device CCL_API create_device()
-{
-    static empty_t empty {};
-    return environment::instance().create_device(empty);
-}
-
-/* context */
-context CCL_API create_context()
-{
-    static empty_t empty {};
-    return environment::instance().create_context(empty);
-}
-
-/* stream */
-stream CCL_API create_stream()
-{
-    return default_stream;
-}
-
-#ifdef CCL_ENABLE_SYCL
-communicator create_single_device_communicator(const size_t comm_size,
-                                                      const size_t rank,
-                                                      const cl::sycl::device& device,
-                                                      const cl::sycl::context& context,
-                                                      shared_ptr_class<kvs_interface> kvs) {
-    return environment::instance().create_single_device_communicator(
-        comm_size, rank, device, context, kvs);
-}
-#endif // CCL_ENABLE_SYCL
-
-// communicator create_single_device_communicator(const size_t world_size,
-//                                     const size_t rank,
-//                                     cl::sycl::queue queue,
-//                                     shared_ptr_class<kvs_interface> kvs) const;
-
-// template<class DeviceSelectorType>
-// communicator create_single_device_communicator(const size_t world_size,
-//                                     const size_t rank,
-//                                     const DeviceSelectorType& selector,
-//                                     shared_ptr_class<kvs_interface> kvs) const
-// {
-//     return return environment::instance().create_single_device_communicator(world_size, rank, cl::sycl::device(selector), kvs);
-// }
-
-#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-
-vector_class<communicator> split_device_communicators(
-    const vector_class<pair_class<communicator, comm_split_attr>>& attrs) {
-    // TODO not implemented
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-
-    // return environment::instance().split_device_communicators(attrs);
-    return {};
-}
-
-#endif //#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-
-namespace preview {
-
-/* communicator */
-communicator CCL_API create_communicator() {
-    return environment::instance().create_communicator();
-}
-
-communicator CCL_API create_communicator(const size_t size, shared_ptr_class<kvs_interface> kvs) {
-    return environment::instance().create_communicator(size, kvs);
-}
-
-} // namespace preview
-
-communicator CCL_API create_communicator(const size_t size,
-                                         const size_t rank,
-                                         shared_ptr_class<kvs_interface> kvs) {
-    return environment::instance().create_communicator(size, rank, kvs);
-}
-
-/******************** COMMUNICATOR ********************/
-
-/* allgatherv */
-CCL_API event allgatherv(const void* send_buf,
-                           size_t send_count,
-                           void* recv_buf,
-                           const vector_class<size_t>& recv_counts,
-                           datatype dtype,
-                           const communicator& comm,
-                           const stream& op_stream,
-                           const allgatherv_attr& attr,
-                           const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
-}
-
-CCL_API event allgatherv(const void* send_buf,
-                           size_t send_count,
-                           void* recv_buf,
-                           const vector_class<size_t>& recv_counts,
-                           datatype dtype,
-                           const communicator& comm,
-                           const allgatherv_attr& attr,
-                           const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_buf, recv_counts, dtype, disp(default_stream), attr, deps);
-}
-
-CCL_API event allgatherv(const void* send_buf,
-                           size_t send_count,
-                           const vector_class<void*>& recv_bufs,
-                           const vector_class<size_t>& recv_counts,
-                           datatype dtype,
-                           const communicator& comm,
-                           const stream& op_stream,
-                           const allgatherv_attr& attr,
-                           const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_bufs, recv_counts, dtype, disp(op_stream), attr, deps);
-}
-
-CCL_API event allgatherv(const void* send_buf,
-                           size_t send_count,
-                           const vector_class<void*>& recv_bufs,
-                           const vector_class<size_t>& recv_counts,
-                           datatype dtype,
-                           const communicator& comm,
-                           const allgatherv_attr& attr,
-                           const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_bufs, recv_counts, dtype, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event allgatherv(const BufferType* send_buf,
-                   size_t send_count,
-                   BufferType* recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const stream& op_stream,
-                   const allgatherv_attr& attr,
-                   const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_buf, recv_counts, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event allgatherv(const BufferType* send_buf,
-                   size_t send_count,
-                   BufferType* recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const allgatherv_attr& attr,
-                   const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_buf, recv_counts, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event allgatherv(const BufferType* send_buf,
-                   size_t send_count,
-                   vector_class<BufferType*>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const stream& op_stream,
-                   const allgatherv_attr& attr,
-                   const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_bufs, recv_counts, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event allgatherv(const BufferType* send_buf,
-                   size_t send_count,
-                   vector_class<BufferType*>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const allgatherv_attr& attr,
-                   const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_bufs, recv_counts, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event allgatherv(const BufferObjectType& send_buf,
-                   size_t send_count,
-                   BufferObjectType& recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const stream& op_stream,
-                   const allgatherv_attr& attr,
-                   const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_buf, recv_counts, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event allgatherv(const BufferObjectType& send_buf,
-                   size_t send_count,
-                   BufferObjectType& recv_buf,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const allgatherv_attr& attr,
-                   const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_buf, recv_counts, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event allgatherv(const BufferObjectType& send_buf,
-                   size_t send_count,
-                   vector_class<ccl::reference_wrapper_class<BufferObjectType>>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const stream& op_stream,
-                   const allgatherv_attr& attr,
-                   const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_bufs, recv_counts, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event allgatherv(const BufferObjectType& send_buf,
-                   size_t send_count,
-                   vector_class<ccl::reference_wrapper_class<BufferObjectType>>& recv_bufs,
-                   const vector_class<size_t>& recv_counts,
-                   const communicator& comm,
-                   const allgatherv_attr& attr,
-                   const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allgatherv(
-        send_buf, send_count, recv_bufs, recv_counts, disp(default_stream), attr, deps);
-}
-
-/* allreduce */
-CCL_API event allreduce(const void* send_buf,
-                          void* recv_buf,
-                          size_t count,
-                          datatype dtype,
-                          reduction reduction,
-                          const communicator& comm,
-                          const stream& op_stream,
-                          const allreduce_attr& attr,
-                          const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allreduce(
-        send_buf, recv_buf, count, dtype, reduction, disp(op_stream), attr, deps);
-}
-
-CCL_API event allreduce(const void* send_buf,
-                          void* recv_buf,
-                          size_t count,
-                          datatype dtype,
-                          reduction reduction,
-                          const communicator& comm,
-                          const allreduce_attr& attr,
-                          const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allreduce(
-        send_buf, recv_buf, count, dtype, reduction, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event allreduce(const BufferType* send_buf,
-                  BufferType* recv_buf,
-                  size_t count,
-                  reduction reduction,
-                  const communicator& comm,
-                  const stream& op_stream,
-                  const allreduce_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event allreduce(const BufferType* send_buf,
-                  BufferType* recv_buf,
-                  size_t count,
-                  reduction reduction,
-                  const communicator& comm,
-                  const allreduce_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event allreduce(const BufferObjectType& send_buf,
-                  BufferObjectType& recv_buf,
-                  size_t count,
-                  reduction reduction,
-                  const communicator& comm,
-                  const stream& op_stream,
-                  const allreduce_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event allreduce(const BufferObjectType& send_buf,
-                  BufferObjectType& recv_buf,
-                  size_t count,
-                  reduction reduction,
-                  const communicator& comm,
-                  const allreduce_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(default_stream), attr, deps);
-}
-
-/* alltoall */
-CCL_API event alltoall(const void* send_buf,
-                         void* recv_buf,
-                         size_t count,
-                         datatype dtype,
-                         const communicator& comm,
-                         const stream& op_stream,
-                         const alltoall_attr& attr,
-                         const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(op_stream), attr, deps);
-}
-
-CCL_API event alltoall(const void* send_buf,
-                         void* recv_buf,
-                         size_t count,
-                         datatype dtype,
-                         const communicator& comm,
-                         const alltoall_attr& attr,
-                         const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(default_stream), attr, deps);
-}
-
-CCL_API event alltoall(const vector_class<void*>& send_buf,
-                         const vector_class<void*>& recv_buf,
-                         size_t count,
-                         datatype dtype,
-                         const communicator& comm,
-                         const stream& op_stream,
-                         const alltoall_attr& attr,
-                         const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event alltoall(const BufferType* send_buf,
-                 BufferType* recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const stream& op_stream,
-                 const alltoall_attr& attr,
-                 const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event alltoall(const BufferType* send_buf,
-                 BufferType* recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const alltoall_attr& attr,
-                 const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event alltoall(const vector_class<BufferType*>& send_buf,
-                 const vector_class<BufferType*>& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const stream& op_stream,
-                 const alltoall_attr& attr,
-                 const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event alltoall(const vector_class<BufferType*>& send_buf,
-                 const vector_class<BufferType*>& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const alltoall_attr& attr,
-                 const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event alltoall(const BufferObjectType& send_buf,
-                 BufferObjectType& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const stream& op_stream,
-                 const alltoall_attr& attr,
-                 const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event alltoall(const BufferObjectType& send_buf,
-                 BufferObjectType& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const alltoall_attr& attr,
-                 const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf,
-                 const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const stream& op_stream,
-                 const alltoall_attr& attr,
-                 const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf,
-                 const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf,
-                 size_t count,
-                 const communicator& comm,
-                 const alltoall_attr& attr,
-                 const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
-}
-
-/* alltoallv */
-CCL_API event alltoallv(const void* send_buf,
-                          const vector_class<size_t>& send_counts,
-                          void* recv_buf,
-                          const vector_class<size_t>& recv_counts,
-                          datatype dtype,
-                          const communicator& comm,
-                          const stream& op_stream,
-                          const alltoallv_attr& attr,
-                          const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_buf, send_counts, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
-}
-
-CCL_API event alltoallv(const void* send_buf,
-                          const vector_class<size_t>& send_counts,
-                          void* recv_buf,
-                          const vector_class<size_t>& recv_counts,
-                          datatype dtype,
-                          const communicator& comm,
-                          const alltoallv_attr& attr,
-                          const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_buf, send_counts, recv_buf, recv_counts, dtype, disp(default_stream), attr, deps);
-}
-
-CCL_API event alltoallv(const vector_class<void*>& send_bufs,
-                          const vector_class<size_t>& send_counts,
-                          const vector_class<void*>& recv_bufs,
-                          const vector_class<size_t>& recv_counts,
-                          datatype dtype,
-                          const communicator& comm,
-                          const stream& op_stream,
-                          const alltoallv_attr& attr,
-                          const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_bufs, send_counts, recv_bufs, recv_counts, dtype, disp(op_stream), attr, deps);
-}
-
-CCL_API event alltoallv(const vector_class<void*>& send_bufs,
-                          const vector_class<size_t>& send_counts,
-                          const vector_class<void*>& recv_bufs,
-                          const vector_class<size_t>& recv_counts,
-                          datatype dtype,
-                          const communicator& comm,
-                          const alltoallv_attr& attr,
-                          const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_bufs, send_counts, recv_bufs, recv_counts, dtype, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event alltoallv(const BufferType* send_buf,
-                  const vector_class<size_t>& send_counts,
-                  BufferType* recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const stream& op_stream,
-                  const alltoallv_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_buf, send_counts, recv_buf, recv_counts, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event alltoallv(const BufferType* send_buf,
-                  const vector_class<size_t>& send_counts,
-                  BufferType* recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const alltoallv_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_buf, send_counts, recv_buf, recv_counts, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event alltoallv(const vector_class<BufferType*>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<BufferType*>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const stream& op_stream,
-                  const alltoallv_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_bufs, send_counts, recv_bufs, recv_counts, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event alltoallv(const vector_class<BufferType*>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<BufferType*>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const alltoallv_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_bufs, send_counts, recv_bufs, recv_counts, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event alltoallv(const BufferObjectType& send_buf,
-                  const vector_class<size_t>& send_counts,
-                  BufferObjectType& recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const stream& op_stream,
-                  const alltoallv_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_buf, send_counts, recv_buf, recv_counts, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event alltoallv(const BufferObjectType& send_buf,
-                  const vector_class<size_t>& send_counts,
-                  BufferObjectType& recv_buf,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const alltoallv_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_buf, send_counts, recv_buf, recv_counts, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const stream& op_stream,
-                  const alltoallv_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_bufs, send_counts, recv_bufs, recv_counts, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
-                  const vector_class<size_t>& send_counts,
-                  const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
-                  const vector_class<size_t>& recv_counts,
-                  const communicator& comm,
-                  const alltoallv_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->alltoallv(
-        send_bufs, send_counts, recv_bufs, recv_counts, disp(default_stream), attr, deps);
-}
-
-/* barrier */
-CCL_API event barrier(const communicator& comm,
-                        const stream& op_stream,
-                        const barrier_attr& attr,
-                        const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->barrier(disp(op_stream), attr, deps);
-}
-
-CCL_API event barrier(const communicator& comm,
-                        const barrier_attr& attr,
-                        const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->barrier(disp(default_stream), attr, deps);
-}
-
-/* broadcast */
-CCL_API event broadcast(void* buf,
-                          size_t count,
-                          datatype dtype,
-                          size_t root,
-                          const communicator& comm,
-                          const stream& op_stream,
-                          const broadcast_attr& attr,
-                          const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->bcast(buf, count, dtype, root, disp(op_stream), attr, deps);
-}
-
-CCL_API event broadcast(void* buf,
-                          size_t count,
-                          datatype dtype,
-                          size_t root,
-                          const communicator& comm,
-                          const broadcast_attr& attr,
-                          const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->bcast(buf, count, dtype, root, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event broadcast(BufferType* buf,
-                  size_t count,
-                  size_t root,
-                  const communicator& comm,
-                  const stream& op_stream,
-                  const broadcast_attr& attr,
-                  const vector_class<event>& deps)
-
-{
-    impl_dispatch disp;
-    return disp(comm)->bcast(buf, count, root, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event broadcast(BufferType* buf,
-                  size_t count,
-                  size_t root,
-                  const communicator& comm,
-                  const broadcast_attr& attr,
-                  const vector_class<event>& deps)
-
-{
-    impl_dispatch disp;
-    return disp(comm)->bcast(buf, count, root, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event broadcast(BufferObjectType& buf,
-                  size_t count,
-                  size_t root,
-                  const communicator& comm,
-                  const stream& op_stream,
-                  const broadcast_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->bcast(buf, count, root, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event broadcast(BufferObjectType& buf,
-                  size_t count,
-                  size_t root,
-                  const communicator& comm,
-                  const broadcast_attr& attr,
-                  const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->bcast(buf, count, root, disp(default_stream), attr, deps);
-}
-
-/* reduce */
-CCL_API event reduce(const void* send_buf,
-                       void* recv_buf,
-                       size_t count,
-                       datatype dtype,
-                       reduction reduction,
-                       size_t root,
-                       const communicator& comm,
-                       const stream& op_stream,
-                       const reduce_attr& attr,
-                       const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce(
-        send_buf, recv_buf, count, dtype, reduction, root, disp(op_stream), attr, deps);
-}
-
-CCL_API event reduce(const void* send_buf,
-                       void* recv_buf,
-                       size_t count,
-                       datatype dtype,
-                       reduction reduction,
-                       size_t root,
-                       const communicator& comm,
-                       const reduce_attr& attr,
-                       const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce(
-        send_buf, recv_buf, count, dtype, reduction, root, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event reduce(const BufferType* send_buf,
-               BufferType* recv_buf,
-               size_t count,
-               reduction reduction,
-               size_t root,
-               const communicator& comm,
-               const stream& op_stream,
-               const reduce_attr& attr,
-               const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce(
-        send_buf, recv_buf, count, reduction, root, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event reduce(const BufferType* send_buf,
-               BufferType* recv_buf,
-               size_t count,
-               reduction reduction,
-               size_t root,
-               const communicator& comm,
-               const reduce_attr& attr,
-               const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce(
-        send_buf, recv_buf, count, reduction, root, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event reduce(const BufferObjectType& send_buf,
-               BufferObjectType& recv_buf,
-               size_t count,
-               reduction reduction,
-               size_t root,
-               const communicator& comm,
-               const stream& op_stream,
-               const reduce_attr& attr,
-               const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce(
-        send_buf, recv_buf, count, reduction, root, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event reduce(const BufferObjectType& send_buf,
-               BufferObjectType& recv_buf,
-               size_t count,
-               reduction reduction,
-               size_t root,
-               const communicator& comm,
-               const reduce_attr& attr,
-               const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce(
-        send_buf, recv_buf, count, reduction, root, disp(default_stream), attr, deps);
-}
-
-/* reduce_scatter */
-CCL_API event reduce_scatter(const void* send_buf,
-                               void* recv_buf,
-                               size_t recv_count,
-                               datatype dtype,
-                               reduction reduction,
-                               const communicator& comm,
-                               const stream& op_stream,
-                               const reduce_scatter_attr& attr,
-                               const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce_scatter(
-        send_buf, recv_buf, recv_count, dtype, reduction, disp(op_stream), attr, deps);
-}
-
-CCL_API event reduce_scatter(const void* send_buf,
-                               void* recv_buf,
-                               size_t recv_count,
-                               datatype dtype,
-                               reduction reduction,
-                               const communicator& comm,
-                               const reduce_scatter_attr& attr,
-                               const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce_scatter(
-        send_buf, recv_buf, recv_count, dtype, reduction, disp(default_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event reduce_scatter(const BufferType* send_buf,
-                       BufferType* recv_buf,
-                       size_t recv_count,
-                       reduction reduction,
-                       const communicator& comm,
-                       const stream& op_stream,
-                       const reduce_scatter_attr& attr,
-                       const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce_scatter(
-        send_buf, recv_buf, recv_count, reduction, disp(op_stream), attr, deps);
-}
-
-template <class BufferType, typename T>
-event reduce_scatter(const BufferType* send_buf,
-                       BufferType* recv_buf,
-                       size_t recv_count,
-                       reduction reduction,
-                       const communicator& comm,
-                       const reduce_scatter_attr& attr,
-                       const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce_scatter(
-        send_buf, recv_buf, recv_count, reduction, disp(default_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event reduce_scatter(const BufferObjectType& send_buf,
-                       BufferObjectType& recv_buf,
-                       size_t recv_count,
-                       reduction reduction,
-                       const communicator& comm,
-                       const stream& op_stream,
-                       const reduce_scatter_attr& attr,
-                       const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce_scatter(
-        send_buf, recv_buf, recv_count, reduction, disp(op_stream), attr, deps);
-}
-
-template <class BufferObjectType, typename T>
-event reduce_scatter(const BufferObjectType& send_buf,
-                       BufferObjectType& recv_buf,
-                       size_t recv_count,
-                       reduction reduction,
-                       const communicator& comm,
-                       const reduce_scatter_attr& attr,
-                       const vector_class<event>& deps) {
-    impl_dispatch disp;
-    return disp(comm)->reduce_scatter(
-        send_buf, recv_buf, recv_count, reduction, disp(default_stream), attr, deps);
-}
-
-namespace preview {
-
-/* sparse_allreduce */
-CCL_API ccl::event sparse_allreduce(const void* send_ind_buf,
-                                      size_t send_ind_count,
-                                      const void* send_val_buf,
-                                      size_t send_val_count,
-                                      void* recv_ind_buf,
-                                      size_t recv_ind_count,
-                                      void* recv_val_buf,
-                                      size_t recv_val_count,
-                                      ccl::datatype index_dtype,
-                                      ccl::datatype value_dtype,
-                                      ccl::reduction reduction,
-                                      const ccl::communicator& comm,
-                                      const ccl::stream& op_stream,
-                                      const ccl::sparse_allreduce_attr& attr,
-                                      const ccl::vector_class<ccl::event>& deps) {
-    ccl::impl_dispatch disp;
-    return disp(comm)->sparse_allreduce(send_ind_buf,
-                                        send_ind_count,
-                                        send_val_buf,
-                                        send_val_count,
-                                        recv_ind_buf,
-                                        recv_ind_count,
-                                        recv_val_buf,
-                                        recv_val_count,
-                                        index_dtype,
-                                        value_dtype,
-                                        reduction,
-                                        disp(op_stream),
-                                        attr,
-                                        deps);
-}
-
-CCL_API ccl::event sparse_allreduce(const void* send_ind_buf,
-                                      size_t send_ind_count,
-                                      const void* send_val_buf,
-                                      size_t send_val_count,
-                                      void* recv_ind_buf,
-                                      size_t recv_ind_count,
-                                      void* recv_val_buf,
-                                      size_t recv_val_count,
-                                      ccl::datatype index_dtype,
-                                      ccl::datatype value_dtype,
-                                      ccl::reduction reduction,
-                                      const ccl::communicator& comm,
-                                      const ccl::sparse_allreduce_attr& attr,
-                                      const ccl::vector_class<ccl::event>& deps) {
-    ccl::impl_dispatch disp;
-    return disp(comm)->sparse_allreduce(send_ind_buf,
-                                        send_ind_count,
-                                        send_val_buf,
-                                        send_val_count,
-                                        recv_ind_buf,
-                                        recv_ind_count,
-                                        recv_val_buf,
-                                        recv_val_count,
-                                        index_dtype,
-                                        value_dtype,
-                                        reduction,
-                                        disp(default_stream),
-                                        attr,
-                                        deps);
-}
-
-template <class IndexBufferType, class ValueBufferType, typename T>
-ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf,
-                              size_t send_ind_count,
-                              const ValueBufferType* send_val_buf,
-                              size_t send_val_count,
-                              IndexBufferType* recv_ind_buf,
-                              size_t recv_ind_count,
-                              ValueBufferType* recv_val_buf,
-                              size_t recv_val_count,
-                              ccl::reduction reduction,
-                              const ccl::communicator& comm,
-                              const ccl::stream& op_stream,
-                              const ccl::sparse_allreduce_attr& attr,
-                              const ccl::vector_class<ccl::event>& deps) {
-    ccl::impl_dispatch disp;
-    return disp(comm)->sparse_allreduce(send_ind_buf,
-                                        send_ind_count,
-                                        send_val_buf,
-                                        send_val_count,
-                                        recv_ind_buf,
-                                        recv_ind_count,
-                                        recv_val_buf,
-                                        recv_val_count,
-                                        reduction,
-                                        disp(op_stream),
-                                        attr,
-                                        deps);
-}
-
-template <class IndexBufferType, class ValueBufferType, typename T>
-ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf,
-                              size_t send_ind_count,
-                              const ValueBufferType* send_val_buf,
-                              size_t send_val_count,
-                              IndexBufferType* recv_ind_buf,
-                              size_t recv_ind_count,
-                              ValueBufferType* recv_val_buf,
-                              size_t recv_val_count,
-                              ccl::reduction reduction,
-                              const ccl::communicator& comm,
-                              const ccl::sparse_allreduce_attr& attr,
-                              const ccl::vector_class<ccl::event>& deps) {
-    ccl::impl_dispatch disp;
-    return disp(comm)->sparse_allreduce(send_ind_buf,
-                                        send_ind_count,
-                                        send_val_buf,
-                                        send_val_count,
-                                        recv_ind_buf,
-                                        recv_ind_count,
-                                        recv_val_buf,
-                                        recv_val_count,
-                                        reduction,
-                                        disp(default_stream),
-                                        attr,
-                                        deps);
-}
-
-// template <class IndexBufferObjectType, class ValueBufferObjectType, typename T>
-// ccl::event
-// sparse_allreduce(const IndexBufferObjectType& send_ind_buf,
-//                  size_t send_ind_count,
-//                  const ValueBufferObjectType& send_val_buf,
-//                  size_t send_val_count,
-//                  IndexBufferObjectType& recv_ind_buf,
-//                  size_t recv_ind_count,
-//                  ValueBufferObjectType& recv_val_buf,
-//                  size_t recv_val_count,
-//                  ccl::reduction reduction,
-//                  const ccl::communicator& comm,
-//                  const ccl::stream& op_stream,
-//                  const ccl::sparse_allreduce_attr& attr,
-//                  const ccl::vector_class<ccl::event>& deps)
-// {
-//     ccl::impl_dispatch disp;
-//     return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count,
-//                                         send_val_buf, send_val_count,
-//                                         recv_ind_buf, recv_ind_count,
-//                                         recv_val_buf, recv_val_count,
-//                                         reduction,
-//                                         disp(op_stream), attr, deps);
-// }
-//
-// template <class IndexBufferObjectType, class ValueBufferObjectType, typename T>
-// ccl::event
-// sparse_allreduce(const IndexBufferObjectType& send_ind_buf,
-//                  size_t send_ind_count,
-//                  const ValueBufferObjectType& send_val_buf,
-//                  size_t send_val_count,
-//                  IndexBufferObjectType& recv_ind_buf,
-//                  size_t recv_ind_count,
-//                  ValueBufferObjectType& recv_val_buf,
-//                  size_t recv_val_count,
-//                  ccl::reduction reduction,
-//                  const ccl::communicator& comm,
-//                  const ccl::sparse_allreduce_attr& attr,
-//                  const ccl::vector_class<ccl::event>& deps)
-// {
-//     ccl::impl_dispatch disp;
-//     return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count,
-//                                         send_val_buf, send_val_count,
-//                                         recv_ind_buf, recv_ind_count,
-//                                         recv_val_buf, recv_val_count,
-//                                         reduction,
-//                                         disp(default_stream), attr, deps);
-// }
-
-} // namespace preview
-
-// API force instantiations for Operations
-API_DEVICE_COMM_OP_PTR_EXPLICIT_INSTANTIATION(char);
-API_DEVICE_COMM_OP_PTR_EXPLICIT_INSTANTIATION(int);
-API_DEVICE_COMM_OP_PTR_EXPLICIT_INSTANTIATION(int64_t);
-API_DEVICE_COMM_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t);
-API_DEVICE_COMM_OP_PTR_EXPLICIT_INSTANTIATION(float);
-API_DEVICE_COMM_OP_PTR_EXPLICIT_INSTANTIATION(double);
-
-#ifdef CCL_ENABLE_SYCL
-#ifndef COMMA
-#define COMMA ,
-#endif
-API_DEVICE_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<char COMMA 1>);
-API_DEVICE_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int COMMA 1>);
-API_DEVICE_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>);
-API_DEVICE_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<uint64_t COMMA 1>);
-API_DEVICE_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<float COMMA 1>);
-API_DEVICE_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<double COMMA 1>);
-#undef COMMA
-#endif // CCL_ENABLE_SYCL
-
-namespace preview {
-
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(char, char);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(char, int);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(char, ccl::bf16);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(char, float);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(char, double);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(char, int64_t);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(char, uint64_t);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int, char);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int, int);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int, ccl::bf16);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int, float);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int, double);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int, int64_t);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int, uint64_t);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, char);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, int);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, ccl::bf16);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, float);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, double);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, int64_t);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, uint64_t);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t, char);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t, int);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t, ccl::bf16);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t, float);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t, double);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t, int64_t);
-API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t, uint64_t);
-
-// #ifdef CCL_ENABLE_SYCL
-// #ifndef COMMA
-// #define COMMA ,
-// #endif
-// API_DEVICE_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int COMMA 1>,
-//                                                      cl::sycl::buffer<float COMMA 1>);
-// API_DEVICE_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int COMMA 1>,
-//                                                      cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-// API_DEVICE_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>,
-//                                                      cl::sycl::buffer<float COMMA 1>);
-// API_DEVICE_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>,
-//                                                      cl::sycl::buffer<ccl::bf16 COMMA 1>);
-// #undef COMMA
-// #endif //CCL_ENABLE_SYCL
-
-} // namespace preview
-
-} // namespace ccl
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/environment.hpp"
+#include "oneapi/ccl/api_functions.hpp"
+#include "common/comm/host_communicator/host_communicator.hpp"
+#include "oneapi/ccl/exception.hpp"
+
+#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
+#include "common/comm/comm_interface.hpp"
+#endif //#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
+
+#include "ccl_api_functions_generators.hpp"
+#include "common/global/global.hpp"
+#include "ccl_gpu_module.hpp"
+
+namespace ccl {
+
+namespace v1 {
+
+/**
+ * Helper functor that is a friend of the object passed to it: operator() returns a
+ * const reference to that object's internal implementation, i.e. obj.get_impl().
+ */
+struct impl_dispatch {
+    template <class Object>
+    const typename Object::impl_value_t& operator()(const Object& obj) {
+        return obj.get_impl();
+    }
+};
+
+#ifdef MULTI_GPU_SUPPORT
+/* register the ring-topology GPU kernel modules (ring_*.spv) from kernel_dir_path */
+void register_gpu_module(std::string kernel_dir_path) {
+    if (!kernel_dir_path.empty()) {
+        if (*kernel_dir_path.rbegin() != '/') { // append the missing trailing '/'
+            kernel_dir_path += '/';
+        }
+    }
+    LOG_INFO("SPV Kernels found directory: ", kernel_dir_path);
+
+    /*
+     * TODO:
+     * Important: fix the kernels' data type generation, then uncomment
+     * the registration module.
+     */
+
+    // allgatherv
+    std::string kernel_path = kernel_dir_path + "ring_allgatherv.spv";
+    register_gpu_module_source(
+        kernel_path.c_str(), ccl::device_topology_type::ring, ccl_coll_allgatherv);
+    // kernel_path = kernel_dir_path + "a2a_allgatherv.spv";
+    // register_gpu_module_source(kernel_path.c_str(),
+    //                             ccl::device_topology_type::a2a,
+    //                             ccl_coll_allgatherv);
+    // alltoallv
+    kernel_path = kernel_dir_path + "ring_alltoallv.spv";
+    register_gpu_module_source(
+        kernel_path.c_str(), ccl::device_topology_type::ring, ccl_coll_alltoallv);
+    // register_gpu_module_source("kernels/a2a_alltoallv.spv",
+    //                             ccl::device_topology_type::a2a,
+    //                             ccl_coll_alltoallv);
+    // allreduce
+    kernel_path = kernel_dir_path + "ring_allreduce.spv";
+    register_gpu_module_source(
+        kernel_path.c_str(), ccl::device_topology_type::ring, ccl_coll_allreduce);
+    // kernel_path = kernel_dir_path + "a2a_allreduce.spv";
+    // register_gpu_module_source(kernel_path.c_str(),
+    //                             ccl::device_topology_type::a2a,
+    //                             ccl_coll_allreduce);
+    // bcast
+    kernel_path = kernel_dir_path + "ring_bcast.spv";
+    register_gpu_module_source(
+        kernel_path.c_str(), ccl::device_topology_type::ring, ccl_coll_bcast);
+    // kernel_path = kernel_dir_path + "a2a_bcast.spv";
+    // register_gpu_module_source(kernel_path.c_str(),
+    //                            ccl::device_topology_type::a2a,
+    //                            ccl_coll_bcast);
+    kernel_path = kernel_dir_path + "ring_reduce.spv"; // reduce
+    register_gpu_module_source(
+        kernel_path.c_str(), ccl::device_topology_type::ring, ccl_coll_reduce);
+    // kernel_path = kernel_dir_path + "a2a_reduce.spv";
+    // register_gpu_module_source(kernel_path.c_str(),
+    //                            ccl::device_topology_type::a2a,
+    //                            ccl_coll_reduce);
+}
+#endif //MULTI_GPU_SUPPORT
+
+void CCL_API init(const init_attr& attr) {
+    auto& env = detail::environment::instance(); // env is not used further in this function
+    (void)env; // the cast silences the unused-variable warning
+#ifdef MULTI_GPU_SUPPORT
+    const auto& env_object = ccl::global_data::env();
+
+    // WA (workaround?): register the GPU kernels only when a kernel path is configured
+    if (!env_object.kernel_path.empty()) {
+        register_gpu_module(env_object.kernel_path);
+    }
+#endif //MULTI_GPU_SUPPORT
+}
+
+/******************** ENVIRONMENT ********************/
+
+library_version CCL_API get_library_version() {
+    return detail::environment::get_library_version();
+}
+
+/* datatype */
+datatype CCL_API register_datatype(const datatype_attr& attr) {
+    return detail::environment::instance().register_datatype(attr);
+}
+
+void CCL_API deregister_datatype(datatype dtype) {
+    return detail::environment::instance().deregister_datatype(dtype);
+}
+
+size_t CCL_API get_datatype_size(datatype dtype) {
+    return detail::environment::instance().get_datatype_size(dtype);
+}
+
+/* KVS */
+shared_ptr_class<kvs> CCL_API create_main_kvs(const kvs_attr& attr) {
+    return detail::environment::instance().create_main_kvs(attr);
+}
+
+shared_ptr_class<kvs> CCL_API create_kvs(const kvs::address_type& addr, const kvs_attr& attr) {
+    return detail::environment::instance().create_kvs(addr, attr);
+}
+
+/* device */
+device CCL_API create_device() {
+    static empty_t empty{};
+    return detail::environment::instance().create_device(empty);
+}
+
+/* context */
+context CCL_API create_context() {
+    static empty_t empty{};
+    return detail::environment::instance().create_context(empty);
+}
+
+/* stream */
+stream CCL_API create_stream() {
+    return default_stream;
+}
+
+#ifdef CCL_ENABLE_SYCL
+communicator create_single_device_communicator(const int comm_size,
+                                               const int rank,
+                                               const cl::sycl::device& device,
+                                               const cl::sycl::context& context,
+                                               shared_ptr_class<kvs_interface> kvs) {
+    return detail::environment::instance().create_single_device_communicator(
+        comm_size, rank, device, context, kvs);
+}
+#endif // CCL_ENABLE_SYCL
+
+// communicator create_single_device_communicator(const size_t world_size,
+//                                     const int rank,
+//                                     cl::sycl::queue queue,
+//                                     shared_ptr_class<kvs_interface> kvs) const;
+
+// template<class DeviceSelectorType>
+// communicator create_single_device_communicator(const size_t world_size,
+//                                     const int rank,
+//                                     const DeviceSelectorType& selector,
+//                                     shared_ptr_class<kvs_interface> kvs) const
+// {
+//     return detail::environment::instance().create_single_device_communicator(world_size, rank, cl::sycl::device(selector), kvs);
+// }
+
+} // namespace v1
+
+namespace preview {
+
+vector_class<communicator> split_communicators(
+    const vector_class<pair_class<communicator, comm_split_attr>>& attrs) {
+    // TODO: not implemented yet - always throws, so the trailing return is never reached
+    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
+
+    // return detail::environment::instance().split_device_communicators(attrs);
+    return {};
+}
+
+/* communicator */
+communicator CCL_API create_communicator(const comm_attr& attr) {
+    return ccl::detail::environment::instance().create_communicator(attr);
+}
+
+communicator CCL_API create_communicator(const int size,
+                                         shared_ptr_class<kvs_interface> kvs,
+                                         const comm_attr& attr) {
+    return ccl::detail::environment::instance().create_communicator(size, kvs, attr);
+}
+
+} // namespace preview
+
+namespace v1 {
+
+communicator CCL_API create_communicator(const int size,
+                                         const int rank,
+                                         shared_ptr_class<kvs_interface> kvs,
+                                         const comm_attr& attr) {
+    return detail::environment::instance().create_communicator(size, rank, kvs, attr);
+}
+
+/******************** COMMUNICATOR ********************/
+
+#define CHECK_DEPS(deps) /* throws ccl::exception unless deps is empty */ \
+    do { \
+        if (!(deps).empty()) { \
+            throw ccl::exception( \
+                std::string(__PRETTY_FUNCTION__) + \
+                " - handling a vector of events that the operation should depend on is not implemented"); \
+        } \
+    } while (0)
+
+/* allgatherv */
+CCL_API event allgatherv(const void* send_buf,
+                         size_t send_count,
+                         void* recv_buf,
+                         const vector_class<size_t>& recv_counts,
+                         datatype dtype,
+                         const communicator& comm,
+                         const stream& op_stream,
+                         const allgatherv_attr& attr,
+                         const vector_class<event>& deps) {
+    CHECK_DEPS(deps); // throws ccl::exception if deps is non-empty (not implemented yet)
+    impl_dispatch disp;
+    return disp(comm)->allgatherv( // forwards to the communicator's implementation object
+        send_buf, send_count, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
+}
+
+CCL_API event allgatherv(const void* send_buf,
+                         size_t send_count,
+                         void* recv_buf,
+                         const vector_class<size_t>& recv_counts,
+                         datatype dtype,
+                         const communicator& comm,
+                         const allgatherv_attr& attr,
+                         const vector_class<event>& deps) {
+    CHECK_DEPS(deps); // throws ccl::exception if deps is non-empty (not implemented yet)
+    impl_dispatch disp;
+    return disp(comm)->allgatherv( // overload without op_stream: default_stream is passed
+        send_buf, send_count, recv_buf, recv_counts, dtype, disp(default_stream), attr, deps);
+}
+
+CCL_API event allgatherv(const void* send_buf,
+                         size_t send_count,
+                         const vector_class<void*>& recv_bufs,
+                         const vector_class<size_t>& recv_counts,
+                         datatype dtype,
+                         const communicator& comm,
+                         const stream& op_stream,
+                         const allgatherv_attr& attr,
+                         const vector_class<event>& deps) { // void* + dtype, vector of recv_bufs
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_bufs, recv_counts, dtype, disp(op_stream), attr, deps);
+}
+
+CCL_API event allgatherv(const void* send_buf,
+                         size_t send_count,
+                         const vector_class<void*>& recv_bufs,
+                         const vector_class<size_t>& recv_counts,
+                         datatype dtype,
+                         const communicator& comm,
+                         const allgatherv_attr& attr,
+                         const vector_class<event>& deps) { // vector of recv_bufs, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_bufs, recv_counts, dtype, disp(default_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event allgatherv(const BufferType* send_buf,
+                 size_t send_count,
+                 BufferType* recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const stream& op_stream,
+                 const allgatherv_attr& attr,
+                 const vector_class<event>& deps) { // BufferType* buffers, no dtype argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_buf, recv_counts, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event allgatherv(const BufferType* send_buf,
+                 size_t send_count,
+                 BufferType* recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const allgatherv_attr& attr,
+                 const vector_class<event>& deps) { // BufferType* buffers, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_buf, recv_counts, disp(default_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event allgatherv(const BufferType* send_buf,
+                 size_t send_count,
+                 vector_class<BufferType*>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const stream& op_stream,
+                 const allgatherv_attr& attr,
+                 const vector_class<event>& deps) { // vector of BufferType* recv_bufs (non-const ref)
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_bufs, recv_counts, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event allgatherv(const BufferType* send_buf,
+                 size_t send_count,
+                 vector_class<BufferType*>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const allgatherv_attr& attr,
+                 const vector_class<event>& deps) { // vector of BufferType* recv_bufs, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_bufs, recv_counts, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event allgatherv(const BufferObjectType& send_buf,
+                 size_t send_count,
+                 BufferObjectType& recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const stream& op_stream,
+                 const allgatherv_attr& attr,
+                 const vector_class<event>& deps) { // buffers passed as BufferObjectType references
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_buf, recv_counts, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event allgatherv(const BufferObjectType& send_buf,
+                 size_t send_count,
+                 BufferObjectType& recv_buf,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const allgatherv_attr& attr,
+                 const vector_class<event>& deps) { // BufferObjectType references, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_buf, recv_counts, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event allgatherv(const BufferObjectType& send_buf,
+                 size_t send_count,
+                 vector_class<ccl::reference_wrapper_class<BufferObjectType>>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const stream& op_stream,
+                 const allgatherv_attr& attr,
+                 const vector_class<event>& deps) { // recv_bufs is a vector of reference wrappers
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_bufs, recv_counts, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event allgatherv(const BufferObjectType& send_buf,
+                 size_t send_count,
+                 vector_class<ccl::reference_wrapper_class<BufferObjectType>>& recv_bufs,
+                 const vector_class<size_t>& recv_counts,
+                 const communicator& comm,
+                 const allgatherv_attr& attr,
+                 const vector_class<event>& deps) { // reference-wrapper recv_bufs, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->allgatherv(
+        send_buf, send_count, recv_bufs, recv_counts, disp(default_stream), attr, deps);
+}
+
+/* allreduce */
+CCL_API event allreduce(const void* send_buf,
+                        void* recv_buf,
+                        size_t count,
+                        datatype dtype,
+                        reduction reduction,
+                        const communicator& comm,
+                        const stream& op_stream,
+                        const allreduce_attr& attr,
+                        const vector_class<event>& deps) { // void* buffers + dtype, explicit stream
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->allreduce(
+        send_buf, recv_buf, count, dtype, reduction, disp(op_stream), attr, deps);
+}
+
+CCL_API event allreduce(const void* send_buf,
+                        void* recv_buf,
+                        size_t count,
+                        datatype dtype,
+                        reduction reduction,
+                        const communicator& comm,
+                        const allreduce_attr& attr,
+                        const vector_class<event>& deps) { // void* buffers + dtype, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->allreduce(
+        send_buf, recv_buf, count, dtype, reduction, disp(default_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event allreduce(const BufferType* send_buf,
+                BufferType* recv_buf,
+                size_t count,
+                reduction reduction,
+                const communicator& comm,
+                const stream& op_stream,
+                const allreduce_attr& attr,
+                const vector_class<event>& deps) { // BufferType* buffers, no dtype argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event allreduce(const BufferType* send_buf,
+                BufferType* recv_buf,
+                size_t count,
+                reduction reduction,
+                const communicator& comm,
+                const allreduce_attr& attr,
+                const vector_class<event>& deps) { // BufferType* buffers, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->allreduce(
+        send_buf, recv_buf, count, reduction, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event allreduce(const BufferObjectType& send_buf,
+                BufferObjectType& recv_buf,
+                size_t count,
+                reduction reduction,
+                const communicator& comm,
+                const stream& op_stream,
+                const allreduce_attr& attr,
+                const vector_class<event>& deps) { // buffers passed as BufferObjectType references
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->allreduce(send_buf, recv_buf, count, reduction, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event allreduce(const BufferObjectType& send_buf,
+                BufferObjectType& recv_buf,
+                size_t count,
+                reduction reduction,
+                const communicator& comm,
+                const allreduce_attr& attr,
+                const vector_class<event>& deps) { // BufferObjectType references, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->allreduce(
+        send_buf, recv_buf, count, reduction, disp(default_stream), attr, deps);
+}
+
+/* alltoall */
+CCL_API event alltoall(const void* send_buf,
+                       void* recv_buf,
+                       size_t count,
+                       datatype dtype,
+                       const communicator& comm,
+                       const stream& op_stream,
+                       const alltoall_attr& attr,
+                       const vector_class<event>& deps) { // void* buffers + dtype, explicit stream
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(op_stream), attr, deps);
+}
+
+CCL_API event alltoall(const void* send_buf,
+                       void* recv_buf,
+                       size_t count,
+                       datatype dtype,
+                       const communicator& comm,
+                       const alltoall_attr& attr,
+                       const vector_class<event>& deps) { // void* buffers + dtype, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(default_stream), attr, deps);
+}
+
+CCL_API event alltoall(const vector_class<void*>& send_buf,
+                       const vector_class<void*>& recv_buf,
+                       size_t count,
+                       datatype dtype,
+                       const communicator& comm,
+                       const stream& op_stream,
+                       const alltoall_attr& attr,
+                       const vector_class<event>& deps) { // vectors of void* buffers + dtype
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, dtype, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event alltoall(const BufferType* send_buf,
+               BufferType* recv_buf,
+               size_t count,
+               const communicator& comm,
+               const stream& op_stream,
+               const alltoall_attr& attr,
+               const vector_class<event>& deps) { // BufferType* buffers, no dtype argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event alltoall(const BufferType* send_buf,
+               BufferType* recv_buf,
+               size_t count,
+               const communicator& comm,
+               const alltoall_attr& attr,
+               const vector_class<event>& deps) { // BufferType* buffers, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event alltoall(const vector_class<BufferType*>& send_buf,
+               const vector_class<BufferType*>& recv_buf,
+               size_t count,
+               const communicator& comm,
+               const stream& op_stream,
+               const alltoall_attr& attr,
+               const vector_class<event>& deps) { // vectors of BufferType* buffers
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event alltoall(const vector_class<BufferType*>& send_buf,
+               const vector_class<BufferType*>& recv_buf,
+               size_t count,
+               const communicator& comm,
+               const alltoall_attr& attr,
+               const vector_class<event>& deps) { // vectors of BufferType*, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event alltoall(const BufferObjectType& send_buf,
+               BufferObjectType& recv_buf,
+               size_t count,
+               const communicator& comm,
+               const stream& op_stream,
+               const alltoall_attr& attr,
+               const vector_class<event>& deps) { // buffers passed as BufferObjectType references
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event alltoall(const BufferObjectType& send_buf,
+               BufferObjectType& recv_buf,
+               size_t count,
+               const communicator& comm,
+               const alltoall_attr& attr,
+               const vector_class<event>& deps) { // BufferObjectType references, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf,
+               const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf,
+               size_t count,
+               const communicator& comm,
+               const stream& op_stream,
+               const alltoall_attr& attr,
+               const vector_class<event>& deps) { // vectors of reference-wrapped buffer objects
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event alltoall(const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf,
+               const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf,
+               size_t count,
+               const communicator& comm,
+               const alltoall_attr& attr,
+               const vector_class<event>& deps) { // reference-wrapper vectors, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoall(send_buf, recv_buf, count, disp(default_stream), attr, deps);
+}
+
+/* alltoallv */
+CCL_API event alltoallv(const void* send_buf,
+                        const vector_class<size_t>& send_counts,
+                        void* recv_buf,
+                        const vector_class<size_t>& recv_counts,
+                        datatype dtype,
+                        const communicator& comm,
+                        const stream& op_stream,
+                        const alltoallv_attr& attr,
+                        const vector_class<event>& deps) { // void* buffers + dtype, explicit stream
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoallv(
+        send_buf, send_counts, recv_buf, recv_counts, dtype, disp(op_stream), attr, deps);
+}
+
+CCL_API event alltoallv(const void* send_buf,
+                        const vector_class<size_t>& send_counts,
+                        void* recv_buf,
+                        const vector_class<size_t>& recv_counts,
+                        datatype dtype,
+                        const communicator& comm,
+                        const alltoallv_attr& attr,
+                        const vector_class<event>& deps) { // void* buffers + dtype, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoallv(
+        send_buf, send_counts, recv_buf, recv_counts, dtype, disp(default_stream), attr, deps);
+}
+
+CCL_API event alltoallv(const vector_class<void*>& send_bufs,
+                        const vector_class<size_t>& send_counts,
+                        const vector_class<void*>& recv_bufs,
+                        const vector_class<size_t>& recv_counts,
+                        datatype dtype,
+                        const communicator& comm,
+                        const stream& op_stream,
+                        const alltoallv_attr& attr,
+                        const vector_class<event>& deps) { // vectors of void* buffers + dtype
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoallv(
+        send_bufs, send_counts, recv_bufs, recv_counts, dtype, disp(op_stream), attr, deps);
+}
+
+CCL_API event alltoallv(const vector_class<void*>& send_bufs,
+                        const vector_class<size_t>& send_counts,
+                        const vector_class<void*>& recv_bufs,
+                        const vector_class<size_t>& recv_counts,
+                        datatype dtype,
+                        const communicator& comm,
+                        const alltoallv_attr& attr,
+                        const vector_class<event>& deps) { // vectors of void*, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoallv(
+        send_bufs, send_counts, recv_bufs, recv_counts, dtype, disp(default_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event alltoallv(const BufferType* send_buf,
+                const vector_class<size_t>& send_counts,
+                BufferType* recv_buf,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const stream& op_stream,
+                const alltoallv_attr& attr,
+                const vector_class<event>& deps) { // BufferType* buffers, no dtype argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoallv(
+        send_buf, send_counts, recv_buf, recv_counts, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event alltoallv(const BufferType* send_buf,
+                const vector_class<size_t>& send_counts,
+                BufferType* recv_buf,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const alltoallv_attr& attr,
+                const vector_class<event>& deps) { // BufferType* buffers, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoallv(
+        send_buf, send_counts, recv_buf, recv_counts, disp(default_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event alltoallv(const vector_class<BufferType*>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<BufferType*>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const stream& op_stream,
+                const alltoallv_attr& attr,
+                const vector_class<event>& deps) { // vectors of BufferType* buffers
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoallv(
+        send_bufs, send_counts, recv_bufs, recv_counts, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event alltoallv(const vector_class<BufferType*>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<BufferType*>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const alltoallv_attr& attr,
+                const vector_class<event>& deps) { // vectors of BufferType*, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoallv(
+        send_bufs, send_counts, recv_bufs, recv_counts, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event alltoallv(const BufferObjectType& send_buf,
+                const vector_class<size_t>& send_counts,
+                BufferObjectType& recv_buf,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const stream& op_stream,
+                const alltoallv_attr& attr,
+                const vector_class<event>& deps) { // buffers passed as BufferObjectType references
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoallv(
+        send_buf, send_counts, recv_buf, recv_counts, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event alltoallv(const BufferObjectType& send_buf,
+                const vector_class<size_t>& send_counts,
+                BufferObjectType& recv_buf,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const alltoallv_attr& attr,
+                const vector_class<event>& deps) { // BufferObjectType references, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoallv(
+        send_buf, send_counts, recv_buf, recv_counts, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const stream& op_stream,
+                const alltoallv_attr& attr,
+                const vector_class<event>& deps) { // vectors of reference-wrapped buffer objects
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->alltoallv(
+        send_bufs, send_counts, recv_bufs, recv_counts, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event alltoallv(const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs,
+                const vector_class<size_t>& send_counts,
+                const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs,
+                const vector_class<size_t>& recv_counts,
+                const communicator& comm,
+                const alltoallv_attr& attr,
+                const vector_class<event>& deps) { // reference-wrapper vectors, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->alltoallv(
+        send_bufs, send_counts, recv_bufs, recv_counts, disp(default_stream), attr, deps);
+}
+
+/* barrier */
+CCL_API event barrier(const communicator& comm,
+                      const stream& op_stream,
+                      const barrier_attr& attr,
+                      const vector_class<event>& deps) { // overload with an explicit stream
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->barrier(disp(op_stream), attr, deps);
+}
+
+CCL_API event barrier(const communicator& comm,
+                      const barrier_attr& attr,
+                      const vector_class<event>& deps) { // overload without a stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->barrier(disp(default_stream), attr, deps);
+}
+
+/* broadcast */
+CCL_API event broadcast(void* buf,
+                        size_t count,
+                        datatype dtype,
+                        int root,
+                        const communicator& comm,
+                        const stream& op_stream,
+                        const broadcast_attr& attr,
+                        const vector_class<event>& deps) { // void* buffer + dtype, explicit stream
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->bcast(buf, count, dtype, root, disp(op_stream), attr, deps); // member is named bcast
+}
+
+CCL_API event broadcast(void* buf,
+                        size_t count,
+                        datatype dtype,
+                        int root,
+                        const communicator& comm,
+                        const broadcast_attr& attr,
+                        const vector_class<event>& deps) { // void* buffer + dtype, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->bcast(buf, count, dtype, root, disp(default_stream), attr, deps); // member is named bcast
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event broadcast(BufferType* buf,
+                size_t count,
+                int root,
+                const communicator& comm,
+                const stream& op_stream,
+                const broadcast_attr& attr,
+                const vector_class<event>& deps)
+
+{ // BufferType* buffer, no dtype argument, explicit stream
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->bcast(buf, count, root, disp(op_stream), attr, deps); // member is named bcast
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event broadcast(BufferType* buf,
+                size_t count,
+                int root,
+                const communicator& comm,
+                const broadcast_attr& attr,
+                const vector_class<event>& deps)
+
+{ // BufferType* buffer, no dtype argument, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->bcast(buf, count, root, disp(default_stream), attr, deps); // member is named bcast
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event broadcast(BufferObjectType& buf,
+                size_t count,
+                int root,
+                const communicator& comm,
+                const stream& op_stream,
+                const broadcast_attr& attr,
+                const vector_class<event>& deps) { // buffer passed as a BufferObjectType reference
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->bcast(buf, count, root, disp(op_stream), attr, deps); // member is named bcast
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event broadcast(BufferObjectType& buf,
+                size_t count,
+                int root,
+                const communicator& comm,
+                const broadcast_attr& attr,
+                const vector_class<event>& deps) { // BufferObjectType reference, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->bcast(buf, count, root, disp(default_stream), attr, deps); // member is named bcast
+}
+
+/* reduce */
+CCL_API event reduce(const void* send_buf,
+                     void* recv_buf,
+                     size_t count,
+                     datatype dtype,
+                     reduction reduction,
+                     int root,
+                     const communicator& comm,
+                     const stream& op_stream,
+                     const reduce_attr& attr,
+                     const vector_class<event>& deps) { // void* buffers + dtype, explicit stream
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->reduce(
+        send_buf, recv_buf, count, dtype, reduction, root, disp(op_stream), attr, deps);
+}
+
+CCL_API event reduce(const void* send_buf,
+                     void* recv_buf,
+                     size_t count,
+                     datatype dtype,
+                     reduction reduction,
+                     int root,
+                     const communicator& comm,
+                     const reduce_attr& attr,
+                     const vector_class<event>& deps) { // void* buffers + dtype, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->reduce(
+        send_buf, recv_buf, count, dtype, reduction, root, disp(default_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event reduce(const BufferType* send_buf,
+             BufferType* recv_buf,
+             size_t count,
+             reduction reduction,
+             int root,
+             const communicator& comm,
+             const stream& op_stream,
+             const reduce_attr& attr,
+             const vector_class<event>& deps) { // BufferType* buffers, no dtype argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->reduce(
+        send_buf, recv_buf, count, reduction, root, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event reduce(const BufferType* send_buf,
+             BufferType* recv_buf,
+             size_t count,
+             reduction reduction,
+             int root,
+             const communicator& comm,
+             const reduce_attr& attr,
+             const vector_class<event>& deps) { // BufferType* buffers, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->reduce(
+        send_buf, recv_buf, count, reduction, root, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event reduce(const BufferObjectType& send_buf,
+             BufferObjectType& recv_buf,
+             size_t count,
+             reduction reduction,
+             int root,
+             const communicator& comm,
+             const stream& op_stream,
+             const reduce_attr& attr,
+             const vector_class<event>& deps) { // buffers passed as BufferObjectType references
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->reduce(
+        send_buf, recv_buf, count, reduction, root, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event reduce(const BufferObjectType& send_buf,
+             BufferObjectType& recv_buf,
+             size_t count,
+             reduction reduction,
+             int root,
+             const communicator& comm,
+             const reduce_attr& attr,
+             const vector_class<event>& deps) { // BufferObjectType references, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->reduce(
+        send_buf, recv_buf, count, reduction, root, disp(default_stream), attr, deps);
+}
+
+/* reduce_scatter */
+CCL_API event reduce_scatter(const void* send_buf,
+                             void* recv_buf,
+                             size_t recv_count,
+                             datatype dtype,
+                             reduction reduction,
+                             const communicator& comm,
+                             const stream& op_stream,
+                             const reduce_scatter_attr& attr,
+                             const vector_class<event>& deps) { // void* buffers + dtype
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->reduce_scatter(
+        send_buf, recv_buf, recv_count, dtype, reduction, disp(op_stream), attr, deps);
+}
+
+CCL_API event reduce_scatter(const void* send_buf,
+                             void* recv_buf,
+                             size_t recv_count,
+                             datatype dtype,
+                             reduction reduction,
+                             const communicator& comm,
+                             const reduce_scatter_attr& attr,
+                             const vector_class<event>& deps) { // void* + dtype, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->reduce_scatter(
+        send_buf, recv_buf, recv_count, dtype, reduction, disp(default_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event reduce_scatter(const BufferType* send_buf,
+                     BufferType* recv_buf,
+                     size_t recv_count,
+                     reduction reduction,
+                     const communicator& comm,
+                     const stream& op_stream,
+                     const reduce_scatter_attr& attr,
+                     const vector_class<event>& deps) { // BufferType* buffers, no dtype argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->reduce_scatter(
+        send_buf, recv_buf, recv_count, reduction, disp(op_stream), attr, deps);
+}
+
+template <class BufferType, typename T> // T is not referenced in this definition
+event reduce_scatter(const BufferType* send_buf,
+                     BufferType* recv_buf,
+                     size_t recv_count,
+                     reduction reduction,
+                     const communicator& comm,
+                     const reduce_scatter_attr& attr,
+                     const vector_class<event>& deps) { // BufferType* buffers, no stream argument
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and default_stream (declared elsewhere) below
+    return disp(comm)->reduce_scatter(
+        send_buf, recv_buf, recv_count, reduction, disp(default_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T> // T is not referenced in this definition
+event reduce_scatter(const BufferObjectType& send_buf,
+                     BufferObjectType& recv_buf,
+                     size_t recv_count,
+                     reduction reduction,
+                     const communicator& comm,
+                     const stream& op_stream,
+                     const reduce_scatter_attr& attr,
+                     const vector_class<event>& deps) { // BufferObjectType reference buffers
+    CHECK_DEPS(deps); // throws ccl::exception unless deps is empty
+    impl_dispatch disp; // disp(x) is applied to comm and op_stream in the call below
+    return disp(comm)->reduce_scatter(
+        send_buf, recv_buf, recv_count, reduction, disp(op_stream), attr, deps);
+}
+
+template <class BufferObjectType, typename T>
+event reduce_scatter(const BufferObjectType& send_buf,
+                     BufferObjectType& recv_buf,
+                     size_t recv_count,
+                     reduction reduction,
+                     const communicator& comm,
+                     const reduce_scatter_attr& attr,
+                     const vector_class<event>& deps) {
+    CHECK_DEPS(deps);
+    impl_dispatch disp;
+    return disp(comm)->reduce_scatter(
+        send_buf, recv_buf, recv_count, reduction, disp(default_stream), attr, deps);
+}
+
+} // namespace v1
+
+namespace preview {
+
+/* sparse_allreduce */
+CCL_API ccl::event sparse_allreduce(const void* send_ind_buf,
+                                    size_t send_ind_count,
+                                    const void* send_val_buf,
+                                    size_t send_val_count,
+                                    void* recv_ind_buf,
+                                    size_t recv_ind_count,
+                                    void* recv_val_buf,
+                                    size_t recv_val_count,
+                                    ccl::datatype index_dtype,
+                                    ccl::datatype value_dtype,
+                                    ccl::reduction reduction,
+                                    const ccl::communicator& comm,
+                                    const ccl::stream& op_stream,
+                                    const ccl::sparse_allreduce_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
+    CHECK_DEPS(deps);
+    ccl::impl_dispatch disp;
+    return disp(comm)->sparse_allreduce(send_ind_buf,
+                                        send_ind_count,
+                                        send_val_buf,
+                                        send_val_count,
+                                        recv_ind_buf,
+                                        recv_ind_count,
+                                        recv_val_buf,
+                                        recv_val_count,
+                                        index_dtype,
+                                        value_dtype,
+                                        reduction,
+                                        disp(op_stream),
+                                        attr,
+                                        deps);
+}
+
+CCL_API ccl::event sparse_allreduce(const void* send_ind_buf,
+                                    size_t send_ind_count,
+                                    const void* send_val_buf,
+                                    size_t send_val_count,
+                                    void* recv_ind_buf,
+                                    size_t recv_ind_count,
+                                    void* recv_val_buf,
+                                    size_t recv_val_count,
+                                    ccl::datatype index_dtype,
+                                    ccl::datatype value_dtype,
+                                    ccl::reduction reduction,
+                                    const ccl::communicator& comm,
+                                    const ccl::sparse_allreduce_attr& attr,
+                                    const ccl::vector_class<ccl::event>& deps) {
+    CHECK_DEPS(deps);
+    ccl::impl_dispatch disp;
+    return disp(comm)->sparse_allreduce(send_ind_buf,
+                                        send_ind_count,
+                                        send_val_buf,
+                                        send_val_count,
+                                        recv_ind_buf,
+                                        recv_ind_count,
+                                        recv_val_buf,
+                                        recv_val_count,
+                                        index_dtype,
+                                        value_dtype,
+                                        reduction,
+                                        disp(default_stream),
+                                        attr,
+                                        deps);
+}
+
+template <class IndexBufferType, class ValueBufferType, typename T>
+ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf,
+                            size_t send_ind_count,
+                            const ValueBufferType* send_val_buf,
+                            size_t send_val_count,
+                            IndexBufferType* recv_ind_buf,
+                            size_t recv_ind_count,
+                            ValueBufferType* recv_val_buf,
+                            size_t recv_val_count,
+                            ccl::reduction reduction,
+                            const ccl::communicator& comm,
+                            const ccl::stream& op_stream,
+                            const ccl::sparse_allreduce_attr& attr,
+                            const ccl::vector_class<ccl::event>& deps) {
+    CHECK_DEPS(deps);
+    ccl::impl_dispatch disp;
+    return disp(comm)->sparse_allreduce(send_ind_buf,
+                                        send_ind_count,
+                                        send_val_buf,
+                                        send_val_count,
+                                        recv_ind_buf,
+                                        recv_ind_count,
+                                        recv_val_buf,
+                                        recv_val_count,
+                                        reduction,
+                                        disp(op_stream),
+                                        attr,
+                                        deps);
+}
+
+template <class IndexBufferType, class ValueBufferType, typename T>
+ccl::event sparse_allreduce(const IndexBufferType* send_ind_buf,
+                            size_t send_ind_count,
+                            const ValueBufferType* send_val_buf,
+                            size_t send_val_count,
+                            IndexBufferType* recv_ind_buf,
+                            size_t recv_ind_count,
+                            ValueBufferType* recv_val_buf,
+                            size_t recv_val_count,
+                            ccl::reduction reduction,
+                            const ccl::communicator& comm,
+                            const ccl::sparse_allreduce_attr& attr,
+                            const ccl::vector_class<ccl::event>& deps) {
+    CHECK_DEPS(deps);
+    ccl::impl_dispatch disp;
+    return disp(comm)->sparse_allreduce(send_ind_buf,
+                                        send_ind_count,
+                                        send_val_buf,
+                                        send_val_count,
+                                        recv_ind_buf,
+                                        recv_ind_count,
+                                        recv_val_buf,
+                                        recv_val_count,
+                                        reduction,
+                                        disp(default_stream),
+                                        attr,
+                                        deps);
+}
+
+// template <class IndexBufferObjectType, class ValueBufferObjectType, typename T>
+// ccl::event
+// sparse_allreduce(const IndexBufferObjectType& send_ind_buf,
+//                  size_t send_ind_count,
+//                  const ValueBufferObjectType& send_val_buf,
+//                  size_t send_val_count,
+//                  IndexBufferObjectType& recv_ind_buf,
+//                  size_t recv_ind_count,
+//                  ValueBufferObjectType& recv_val_buf,
+//                  size_t recv_val_count,
+//                  ccl::reduction reduction,
+//                  const ccl::communicator& comm,
+//                  const ccl::stream& op_stream,
+//                  const ccl::sparse_allreduce_attr& attr,
+//                  const ccl::vector_class<ccl::event>& deps)
+// {
+//     CHECK_DEPS(deps);
+//     ccl::impl_dispatch disp;
+//     return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count,
+//                                         send_val_buf, send_val_count,
+//                                         recv_ind_buf, recv_ind_count,
+//                                         recv_val_buf, recv_val_count,
+//                                         reduction,
+//                                         disp(op_stream), attr, deps);
+// }
+//
+// template <class IndexBufferObjectType, class ValueBufferObjectType, typename T>
+// ccl::event
+// sparse_allreduce(const IndexBufferObjectType& send_ind_buf,
+//                  size_t send_ind_count,
+//                  const ValueBufferObjectType& send_val_buf,
+//                  size_t send_val_count,
+//                  IndexBufferObjectType& recv_ind_buf,
+//                  size_t recv_ind_count,
+//                  ValueBufferObjectType& recv_val_buf,
+//                  size_t recv_val_count,
+//                  ccl::reduction reduction,
+//                  const ccl::communicator& comm,
+//                  const ccl::sparse_allreduce_attr& attr,
+//                  const ccl::vector_class<ccl::event>& deps)
+// {
+//     CHECK_DEPS(deps);
+//     ccl::impl_dispatch disp;
+//     return disp(comm)->sparse_allreduce(send_ind_buf, send_ind_count,
+//                                         send_val_buf, send_val_count,
+//                                         recv_ind_buf, recv_ind_count,
+//                                         recv_val_buf, recv_val_count,
+//                                         reduction,
+//                                         disp(default_stream), attr, deps);
+// }
+
+} // namespace preview
+
+namespace v1 {
+
+// Force explicit instantiation of the operation API templates for every supported buffer type
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(int8_t);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(uint8_t);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(int16_t);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(uint16_t);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(int32_t);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(uint32_t);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(int64_t);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(uint64_t);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(float);
+API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(double);
+
+#ifdef CCL_ENABLE_SYCL
+#ifndef COMMA
+#define COMMA ,
+#endif
+
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int8_t COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<uint8_t COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int16_t COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<uint16_t COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int32_t COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<uint32_t COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<uint64_t COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<float COMMA 1>);
+API_COMM_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<double COMMA 1>);
+
+#undef COMMA
+#endif // CCL_ENABLE_SYCL
+
+} // namespace v1
+
+namespace preview {
+
+API_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int32_t, float);
+API_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int32_t, ccl::bfloat16);
+API_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, float);
+API_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(int64_t, ccl::bfloat16);
+
+// #ifdef CCL_ENABLE_SYCL
+// #ifndef COMMA
+// #define COMMA ,
+// #endif
+// API_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int32_t COMMA 1>,
+//                                                      cl::sycl::buffer<float COMMA 1>);
+// API_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int32_t COMMA 1>,
+//                                                      cl::sycl::buffer<ccl::bfloat16 COMMA 1>);
+
+// API_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>,
+//                                                      cl::sycl::buffer<float COMMA 1>);
+// API_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>,
+//                                                      cl::sycl::buffer<ccl::bfloat16 COMMA 1>);
+// #undef COMMA
+// #endif //CCL_ENABLE_SYCL
+
+} // namespace preview
+
+} // namespace ccl
diff --git a/src/ccl_api_functions_generators.hpp b/src/ccl_api_functions_generators.hpp
index b02c2bfd1..0fc150dbb 100644
--- a/src/ccl_api_functions_generators.hpp
+++ b/src/ccl_api_functions_generators.hpp
@@ -13,424 +13,426 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-namespace ccl {
-
-#define CREATE_OP_ATTR_INSTANTIATION(attr) template attr CCL_API create_operation_attr<attr>();
-
-/******************** DEVICE COMMUNICATOR ********************/
-
-/**
- * Generating API types for collective operations
- * of the device communicator class (communicator)
- */
-#define API_DEVICE_COMM_OP_PTR_EXPLICIT_INSTANTIATION(BufferType) \
-\
-    template event CCL_API allgatherv(const BufferType* send_buf, \
-                                        size_t send_count, \
-                                        BufferType* recv_buf, \
-                                        const vector_class<size_t>& recv_counts, \
-                                        const communicator& comm, \
-                                        const stream& op_stream, \
-                                        const allgatherv_attr& attr, \
-                                        const vector_class<event>& deps); \
-\
-    template event CCL_API allgatherv(const BufferType* send_buf, \
-                                        size_t send_count, \
-                                        BufferType* recv_buf, \
-                                        const vector_class<size_t>& recv_counts, \
-                                        const communicator& comm, \
-                                        const allgatherv_attr& attr, \
-                                        const vector_class<event>& deps); \
-\
-    template event CCL_API allgatherv(const BufferType* send_buf, \
-                                        size_t send_count, \
-                                        vector_class<BufferType*>& recv_bufs, \
-                                        const vector_class<size_t>& recv_counts, \
-                                        const communicator& comm, \
-                                        const stream& op_stream, \
-                                        const allgatherv_attr& attr, \
-                                        const vector_class<event>& deps); \
-\
-    template event CCL_API allgatherv(const BufferType* send_buf, \
-                                        size_t send_count, \
-                                        vector_class<BufferType*>& recv_bufs, \
-                                        const vector_class<size_t>& recv_counts, \
-                                        const communicator& comm, \
-                                        const allgatherv_attr& attr, \
-                                        const vector_class<event>& deps); \
-\
-    template event CCL_API allreduce(const BufferType* send_buf, \
-                                       BufferType* recv_buf, \
-                                       size_t count, \
-                                       reduction reduction, \
-                                       const communicator& comm, \
-                                       const stream& op_stream, \
-                                       const allreduce_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API allreduce(const BufferType* send_buf, \
-                                       BufferType* recv_buf, \
-                                       size_t count, \
-                                       reduction reduction, \
-                                       const communicator& comm, \
-                                       const allreduce_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API alltoall(const BufferType* send_buf, \
-                                      BufferType* recv_buf, \
-                                      size_t count, \
-                                      const communicator& comm, \
-                                      const stream& op_stream, \
-                                      const alltoall_attr& attr, \
-                                      const vector_class<event>& deps); \
-\
-    template event CCL_API alltoall(const BufferType* send_buf, \
-                                      BufferType* recv_buf, \
-                                      size_t count, \
-                                      const communicator& comm, \
-                                      const alltoall_attr& attr, \
-                                      const vector_class<event>& deps); \
-\
-    template event CCL_API alltoall(const vector_class<BufferType*>& send_buf, \
-                                      const vector_class<BufferType*>& recv_buf, \
-                                      size_t count, \
-                                      const communicator& comm, \
-                                      const stream& op_stream, \
-                                      const alltoall_attr& attr, \
-                                      const vector_class<event>& deps); \
-\
-    template event CCL_API alltoall(const vector_class<BufferType*>& send_buf, \
-                                      const vector_class<BufferType*>& recv_buf, \
-                                      size_t count, \
-                                      const communicator& comm, \
-                                      const alltoall_attr& attr, \
-                                      const vector_class<event>& deps); \
-\
-    template event CCL_API alltoallv(const BufferType* send_buf, \
-                                       const vector_class<size_t>& send_counts, \
-                                       BufferType* recv_buf, \
-                                       const vector_class<size_t>& recv_counts, \
-                                       const communicator& comm, \
-                                       const stream& op_stream, \
-                                       const alltoallv_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API alltoallv(const BufferType* send_buf, \
-                                       const vector_class<size_t>& send_counts, \
-                                       BufferType* recv_buf, \
-                                       const vector_class<size_t>& recv_counts, \
-                                       const communicator& comm, \
-                                       const alltoallv_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API alltoallv(const vector_class<BufferType*>& send_bufs, \
-                                       const vector_class<size_t>& send_counts, \
-                                       const vector_class<BufferType*>& recv_bufs, \
-                                       const vector_class<size_t>& recv_counts, \
-                                       const communicator& comm, \
-                                       const stream& op_stream, \
-                                       const alltoallv_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API alltoallv(const vector_class<BufferType*>& send_bufs, \
-                                       const vector_class<size_t>& send_counts, \
-                                       const vector_class<BufferType*>& recv_bufs, \
-                                       const vector_class<size_t>& recv_counts, \
-                                       const communicator& comm, \
-                                       const alltoallv_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API broadcast(BufferType* buf, \
-                                       size_t count, \
-                                       size_t root, \
-                                       const communicator& comm, \
-                                       const stream& op_stream, \
-                                       const broadcast_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API broadcast(BufferType* buf, \
-                                       size_t count, \
-                                       size_t root, \
-                                       const communicator& comm, \
-                                       const broadcast_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API reduce(const BufferType* send_buf, \
-                                    BufferType* recv_buf, \
-                                    size_t count, \
-                                    reduction reduction, \
-                                    size_t root, \
-                                    const communicator& comm, \
-                                    const stream& op_stream, \
-                                    const reduce_attr& attr, \
-                                    const vector_class<event>& deps); \
-\
-    template event CCL_API reduce(const BufferType* send_buf, \
-                                    BufferType* recv_buf, \
-                                    size_t count, \
-                                    reduction reduction, \
-                                    size_t root, \
-                                    const communicator& comm, \
-                                    const reduce_attr& attr, \
-                                    const vector_class<event>& deps); \
-\
-    template event CCL_API reduce_scatter(const BufferType* send_buf, \
-                                            BufferType* recv_buf, \
-                                            size_t recv_count, \
-                                            reduction reduction, \
-                                            const communicator& comm, \
-                                            const stream& op_stream, \
-                                            const reduce_scatter_attr& attr, \
-                                            const vector_class<event>& deps); \
-\
-    template event CCL_API reduce_scatter(const BufferType* send_buf, \
-                                            BufferType* recv_buf, \
-                                            size_t recv_count, \
-                                            reduction reduction, \
-                                            const communicator& comm, \
-                                            const reduce_scatter_attr& attr, \
-                                            const vector_class<event>& deps);
-
-#define API_DEVICE_COMM_OP_REF_EXPLICIT_INSTANTIATION(BufferObjectType) \
-\
-    template event CCL_API allgatherv(const BufferObjectType& send_buf, \
-                                        size_t send_count, \
-                                        BufferObjectType& recv_buf, \
-                                        const vector_class<size_t>& recv_counts, \
-                                        const communicator& comm, \
-                                        const stream& op_stream, \
-                                        const allgatherv_attr& attr, \
-                                        const vector_class<event>& deps); \
-\
-    template event CCL_API allgatherv(const BufferObjectType& send_buf, \
-                                        size_t send_count, \
-                                        BufferObjectType& recv_buf, \
-                                        const vector_class<size_t>& recv_counts, \
-                                        const communicator& comm, \
-                                        const allgatherv_attr& attr, \
-                                        const vector_class<event>& deps); \
-\
-    template event CCL_API allgatherv( \
-        const BufferObjectType& send_buf, \
-        size_t send_count, \
-        vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs, \
-        const vector_class<size_t>& recv_counts, \
-        const communicator& comm, \
-        const stream& op_stream, \
-        const allgatherv_attr& attr, \
-        const vector_class<event>& deps); \
-\
-    template event CCL_API allgatherv( \
-        const BufferObjectType& send_buf, \
-        size_t send_count, \
-        vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs, \
-        const vector_class<size_t>& recv_counts, \
-        const communicator& comm, \
-        const allgatherv_attr& attr, \
-        const vector_class<event>& deps); \
-\
-    template event CCL_API allreduce(const BufferObjectType& send_buf, \
-                                       BufferObjectType& recv_buf, \
-                                       size_t count, \
-                                       reduction reduction, \
-                                       const communicator& comm, \
-                                       const stream& op_stream, \
-                                       const allreduce_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API allreduce(const BufferObjectType& send_buf, \
-                                       BufferObjectType& recv_buf, \
-                                       size_t count, \
-                                       reduction reduction, \
-                                       const communicator& comm, \
-                                       const allreduce_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API alltoall(const BufferObjectType& send_buf, \
-                                      BufferObjectType& recv_buf, \
-                                      size_t count, \
-                                      const communicator& comm, \
-                                      const stream& op_stream, \
-                                      const alltoall_attr& attr, \
-                                      const vector_class<event>& deps); \
-\
-    template event CCL_API alltoall(const BufferObjectType& send_buf, \
-                                      BufferObjectType& recv_buf, \
-                                      size_t count, \
-                                      const communicator& comm, \
-                                      const alltoall_attr& attr, \
-                                      const vector_class<event>& deps); \
-\
-    template event CCL_API alltoall( \
-        const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf, \
-        const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf, \
-        size_t count, \
-        const communicator& comm, \
-        const stream& op_stream, \
-        const alltoall_attr& attr, \
-        const vector_class<event>& deps); \
-\
-    template event CCL_API alltoall( \
-        const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf, \
-        const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf, \
-        size_t count, \
-        const communicator& comm, \
-        const alltoall_attr& attr, \
-        const vector_class<event>& deps); \
-\
-    template event CCL_API alltoallv(const BufferObjectType& send_buf, \
-                                       const vector_class<size_t>& send_counts, \
-                                       BufferObjectType& recv_buf, \
-                                       const vector_class<size_t>& recv_counts, \
-                                       const communicator& comm, \
-                                       const stream& op_stream, \
-                                       const alltoallv_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API alltoallv(const BufferObjectType& send_buf, \
-                                       const vector_class<size_t>& send_counts, \
-                                       BufferObjectType& recv_buf, \
-                                       const vector_class<size_t>& recv_counts, \
-                                       const communicator& comm, \
-                                       const alltoallv_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API alltoallv( \
-        const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs, \
-        const vector_class<size_t>& send_counts, \
-        const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs, \
-        const vector_class<size_t>& recv_counts, \
-        const communicator& comm, \
-        const stream& op_stream, \
-        const alltoallv_attr& attr, \
-        const vector_class<event>& deps); \
-\
-    template event CCL_API alltoallv( \
-        const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs, \
-        const vector_class<size_t>& send_counts, \
-        const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs, \
-        const vector_class<size_t>& recv_counts, \
-        const communicator& comm, \
-        const alltoallv_attr& attr, \
-        const vector_class<event>& deps); \
-\
-    template event CCL_API broadcast(BufferObjectType& buf, \
-                                       size_t count, \
-                                       size_t root, \
-                                       const communicator& comm, \
-                                       const stream& op_stream, \
-                                       const broadcast_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API broadcast(BufferObjectType& buf, \
-                                       size_t count, \
-                                       size_t root, \
-                                       const communicator& comm, \
-                                       const broadcast_attr& attr, \
-                                       const vector_class<event>& deps); \
-\
-    template event CCL_API reduce(const BufferObjectType& send_buf, \
-                                    BufferObjectType& recv_buf, \
-                                    size_t count, \
-                                    reduction reduction, \
-                                    size_t root, \
-                                    const communicator& comm, \
-                                    const stream& op_stream, \
-                                    const reduce_attr& attr, \
-                                    const vector_class<event>& deps); \
-\
-    template event CCL_API reduce(const BufferObjectType& send_buf, \
-                                    BufferObjectType& recv_buf, \
-                                    size_t count, \
-                                    reduction reduction, \
-                                    size_t root, \
-                                    const communicator& comm, \
-                                    const reduce_attr& attr, \
-                                    const vector_class<event>& deps); \
-\
-    template event CCL_API reduce_scatter(const BufferObjectType& send_buf, \
-                                            BufferObjectType& recv_buf, \
-                                            size_t recv_count, \
-                                            reduction reduction, \
-                                            const communicator& comm, \
-                                            const stream& op_stream, \
-                                            const reduce_scatter_attr& attr, \
-                                            const vector_class<event>& deps); \
-\
-    template event CCL_API reduce_scatter(const BufferObjectType& send_buf, \
-                                            BufferObjectType& recv_buf, \
-                                            size_t recv_count, \
-                                            reduction reduction, \
-                                            const communicator& comm, \
-                                            const reduce_scatter_attr& attr, \
-                                            const vector_class<event>& deps);
-
-namespace preview {
-
-#define API_DEVICE_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(index_type, value_type) \
-\
-    template ccl::event CCL_API sparse_allreduce(const index_type* send_ind_buf, \
-                                                   size_t send_ind_count, \
-                                                   const value_type* send_val_buf, \
-                                                   size_t send_val_count, \
-                                                   index_type* recv_ind_buf, \
-                                                   size_t recv_ind_count, \
-                                                   value_type* recv_val_buf, \
-                                                   size_t recv_val_count, \
-                                                   ccl::reduction reduction, \
-                                                   const ccl::communicator& comm, \
-                                                   const ccl::stream& op_stream, \
-                                                   const ccl::sparse_allreduce_attr& attr, \
-                                                   const ccl::vector_class<ccl::event>& deps); \
-\
-    template ccl::event CCL_API sparse_allreduce(const index_type* send_ind_buf, \
-                                                   size_t send_ind_count, \
-                                                   const value_type* send_val_buf, \
-                                                   size_t send_val_count, \
-                                                   index_type* recv_ind_buf, \
-                                                   size_t recv_ind_count, \
-                                                   value_type* recv_val_buf, \
-                                                   size_t recv_val_count, \
-                                                   ccl::reduction reduction, \
-                                                   const ccl::communicator& comm, \
-                                                   const ccl::sparse_allreduce_attr& attr, \
-                                                   const ccl::vector_class<ccl::event>& deps);
-
-/*
-#define API_DEVICE_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(index_object_type, value_object_type) \
-\
-template ccl::event CCL_API \
-sparse_allreduce(const index_object_type& send_ind_buf, \
-                 size_t send_ind_count, \
-                 const value_object_type& send_val_buf, \
-                 size_t send_val_count, \
-                 index_object_type& recv_ind_buf, \
-                 size_t recv_ind_count, \
-                 value_object_type& recv_val_buf, \
-                 size_t recv_val_count, \
-                 ccl::reduction reduction, \
-                 const ccl::communicator& comm, \
-                 const ccl::stream& op_stream, \
-                 const ccl::sparse_allreduce_attr& attr, \
-                 const ccl::vector_class<event>& deps); \
-\
-template ccl::event CCL_API \
-sparse_allreduce(const index_object_type& send_ind_buf, \
-                 size_t send_ind_count, \
-                 const value_object_type& send_val_buf, \
-                 size_t send_val_count, \
-                 index_object_type& recv_ind_buf, \
-                 size_t recv_ind_count, \
-                 value_object_type& recv_val_buf, \
-                 size_t recv_val_count, \
-                 ccl::reduction reduction, \
-                 const ccl::communicator& comm, \
-                 const ccl::sparse_allreduce_attr& attr, \
-                 const ccl::vector_class<event>& deps);
-*/
-
-} // namespace preview
-
-} // namespace ccl
+#pragma once
+
+namespace ccl {
+
+namespace v1 {
+
+/******************** COMMUNICATOR ********************/
+
+/**
+ * Macros that explicitly instantiate the collective operation function
+ * templates taking a communicator, for a given buffer type
+ */
+#define API_COMM_OP_PTR_EXPLICIT_INSTANTIATION(BufferType) \
+\
+    template event CCL_API allgatherv(const BufferType* send_buf, \
+                                      size_t send_count, \
+                                      BufferType* recv_buf, \
+                                      const vector_class<size_t>& recv_counts, \
+                                      const communicator& comm, \
+                                      const stream& op_stream, \
+                                      const allgatherv_attr& attr, \
+                                      const vector_class<event>& deps); \
+\
+    template event CCL_API allgatherv(const BufferType* send_buf, \
+                                      size_t send_count, \
+                                      BufferType* recv_buf, \
+                                      const vector_class<size_t>& recv_counts, \
+                                      const communicator& comm, \
+                                      const allgatherv_attr& attr, \
+                                      const vector_class<event>& deps); \
+\
+    template event CCL_API allgatherv(const BufferType* send_buf, \
+                                      size_t send_count, \
+                                      vector_class<BufferType*>& recv_bufs, \
+                                      const vector_class<size_t>& recv_counts, \
+                                      const communicator& comm, \
+                                      const stream& op_stream, \
+                                      const allgatherv_attr& attr, \
+                                      const vector_class<event>& deps); \
+\
+    template event CCL_API allgatherv(const BufferType* send_buf, \
+                                      size_t send_count, \
+                                      vector_class<BufferType*>& recv_bufs, \
+                                      const vector_class<size_t>& recv_counts, \
+                                      const communicator& comm, \
+                                      const allgatherv_attr& attr, \
+                                      const vector_class<event>& deps); \
+\
+    template event CCL_API allreduce(const BufferType* send_buf, \
+                                     BufferType* recv_buf, \
+                                     size_t count, \
+                                     reduction reduction, \
+                                     const communicator& comm, \
+                                     const stream& op_stream, \
+                                     const allreduce_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API allreduce(const BufferType* send_buf, \
+                                     BufferType* recv_buf, \
+                                     size_t count, \
+                                     reduction reduction, \
+                                     const communicator& comm, \
+                                     const allreduce_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API alltoall(const BufferType* send_buf, \
+                                    BufferType* recv_buf, \
+                                    size_t count, \
+                                    const communicator& comm, \
+                                    const stream& op_stream, \
+                                    const alltoall_attr& attr, \
+                                    const vector_class<event>& deps); \
+\
+    template event CCL_API alltoall(const BufferType* send_buf, \
+                                    BufferType* recv_buf, \
+                                    size_t count, \
+                                    const communicator& comm, \
+                                    const alltoall_attr& attr, \
+                                    const vector_class<event>& deps); \
+\
+    template event CCL_API alltoall(const vector_class<BufferType*>& send_buf, \
+                                    const vector_class<BufferType*>& recv_buf, \
+                                    size_t count, \
+                                    const communicator& comm, \
+                                    const stream& op_stream, \
+                                    const alltoall_attr& attr, \
+                                    const vector_class<event>& deps); \
+\
+    template event CCL_API alltoall(const vector_class<BufferType*>& send_buf, \
+                                    const vector_class<BufferType*>& recv_buf, \
+                                    size_t count, \
+                                    const communicator& comm, \
+                                    const alltoall_attr& attr, \
+                                    const vector_class<event>& deps); \
+\
+    template event CCL_API alltoallv(const BufferType* send_buf, \
+                                     const vector_class<size_t>& send_counts, \
+                                     BufferType* recv_buf, \
+                                     const vector_class<size_t>& recv_counts, \
+                                     const communicator& comm, \
+                                     const stream& op_stream, \
+                                     const alltoallv_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API alltoallv(const BufferType* send_buf, \
+                                     const vector_class<size_t>& send_counts, \
+                                     BufferType* recv_buf, \
+                                     const vector_class<size_t>& recv_counts, \
+                                     const communicator& comm, \
+                                     const alltoallv_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API alltoallv(const vector_class<BufferType*>& send_bufs, \
+                                     const vector_class<size_t>& send_counts, \
+                                     const vector_class<BufferType*>& recv_bufs, \
+                                     const vector_class<size_t>& recv_counts, \
+                                     const communicator& comm, \
+                                     const stream& op_stream, \
+                                     const alltoallv_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API alltoallv(const vector_class<BufferType*>& send_bufs, \
+                                     const vector_class<size_t>& send_counts, \
+                                     const vector_class<BufferType*>& recv_bufs, \
+                                     const vector_class<size_t>& recv_counts, \
+                                     const communicator& comm, \
+                                     const alltoallv_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API broadcast(BufferType* buf, \
+                                     size_t count, \
+                                     int root, \
+                                     const communicator& comm, \
+                                     const stream& op_stream, \
+                                     const broadcast_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API broadcast(BufferType* buf, \
+                                     size_t count, \
+                                     int root, \
+                                     const communicator& comm, \
+                                     const broadcast_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API reduce(const BufferType* send_buf, \
+                                  BufferType* recv_buf, \
+                                  size_t count, \
+                                  reduction reduction, \
+                                  int root, \
+                                  const communicator& comm, \
+                                  const stream& op_stream, \
+                                  const reduce_attr& attr, \
+                                  const vector_class<event>& deps); \
+\
+    template event CCL_API reduce(const BufferType* send_buf, \
+                                  BufferType* recv_buf, \
+                                  size_t count, \
+                                  reduction reduction, \
+                                  int root, \
+                                  const communicator& comm, \
+                                  const reduce_attr& attr, \
+                                  const vector_class<event>& deps); \
+\
+    template event CCL_API reduce_scatter(const BufferType* send_buf, \
+                                          BufferType* recv_buf, \
+                                          size_t recv_count, \
+                                          reduction reduction, \
+                                          const communicator& comm, \
+                                          const stream& op_stream, \
+                                          const reduce_scatter_attr& attr, \
+                                          const vector_class<event>& deps); \
+\
+    template event CCL_API reduce_scatter(const BufferType* send_buf, \
+                                          BufferType* recv_buf, \
+                                          size_t recv_count, \
+                                          reduction reduction, \
+                                          const communicator& comm, \
+                                          const reduce_scatter_attr& attr, \
+                                          const vector_class<event>& deps);
+
+#define API_COMM_OP_REF_EXPLICIT_INSTANTIATION(BufferObjectType) \
+\
+    template event CCL_API allgatherv(const BufferObjectType& send_buf, \
+                                      size_t send_count, \
+                                      BufferObjectType& recv_buf, \
+                                      const vector_class<size_t>& recv_counts, \
+                                      const communicator& comm, \
+                                      const stream& op_stream, \
+                                      const allgatherv_attr& attr, \
+                                      const vector_class<event>& deps); \
+\
+    template event CCL_API allgatherv(const BufferObjectType& send_buf, \
+                                      size_t send_count, \
+                                      BufferObjectType& recv_buf, \
+                                      const vector_class<size_t>& recv_counts, \
+                                      const communicator& comm, \
+                                      const allgatherv_attr& attr, \
+                                      const vector_class<event>& deps); \
+\
+    template event CCL_API allgatherv( \
+        const BufferObjectType& send_buf, \
+        size_t send_count, \
+        vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs, \
+        const vector_class<size_t>& recv_counts, \
+        const communicator& comm, \
+        const stream& op_stream, \
+        const allgatherv_attr& attr, \
+        const vector_class<event>& deps); \
+\
+    template event CCL_API allgatherv( \
+        const BufferObjectType& send_buf, \
+        size_t send_count, \
+        vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs, \
+        const vector_class<size_t>& recv_counts, \
+        const communicator& comm, \
+        const allgatherv_attr& attr, \
+        const vector_class<event>& deps); \
+\
+    template event CCL_API allreduce(const BufferObjectType& send_buf, \
+                                     BufferObjectType& recv_buf, \
+                                     size_t count, \
+                                     reduction reduction, \
+                                     const communicator& comm, \
+                                     const stream& op_stream, \
+                                     const allreduce_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API allreduce(const BufferObjectType& send_buf, \
+                                     BufferObjectType& recv_buf, \
+                                     size_t count, \
+                                     reduction reduction, \
+                                     const communicator& comm, \
+                                     const allreduce_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API alltoall(const BufferObjectType& send_buf, \
+                                    BufferObjectType& recv_buf, \
+                                    size_t count, \
+                                    const communicator& comm, \
+                                    const stream& op_stream, \
+                                    const alltoall_attr& attr, \
+                                    const vector_class<event>& deps); \
+\
+    template event CCL_API alltoall(const BufferObjectType& send_buf, \
+                                    BufferObjectType& recv_buf, \
+                                    size_t count, \
+                                    const communicator& comm, \
+                                    const alltoall_attr& attr, \
+                                    const vector_class<event>& deps); \
+\
+    template event CCL_API alltoall( \
+        const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf, \
+        const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf, \
+        size_t count, \
+        const communicator& comm, \
+        const stream& op_stream, \
+        const alltoall_attr& attr, \
+        const vector_class<event>& deps); \
+\
+    template event CCL_API alltoall( \
+        const vector_class<reference_wrapper_class<BufferObjectType>>& send_buf, \
+        const vector_class<reference_wrapper_class<BufferObjectType>>& recv_buf, \
+        size_t count, \
+        const communicator& comm, \
+        const alltoall_attr& attr, \
+        const vector_class<event>& deps); \
+\
+    template event CCL_API alltoallv(const BufferObjectType& send_buf, \
+                                     const vector_class<size_t>& send_counts, \
+                                     BufferObjectType& recv_buf, \
+                                     const vector_class<size_t>& recv_counts, \
+                                     const communicator& comm, \
+                                     const stream& op_stream, \
+                                     const alltoallv_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API alltoallv(const BufferObjectType& send_buf, \
+                                     const vector_class<size_t>& send_counts, \
+                                     BufferObjectType& recv_buf, \
+                                     const vector_class<size_t>& recv_counts, \
+                                     const communicator& comm, \
+                                     const alltoallv_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API alltoallv( \
+        const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs, \
+        const vector_class<size_t>& send_counts, \
+        const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs, \
+        const vector_class<size_t>& recv_counts, \
+        const communicator& comm, \
+        const stream& op_stream, \
+        const alltoallv_attr& attr, \
+        const vector_class<event>& deps); \
+\
+    template event CCL_API alltoallv( \
+        const vector_class<reference_wrapper_class<BufferObjectType>>& send_bufs, \
+        const vector_class<size_t>& send_counts, \
+        const vector_class<reference_wrapper_class<BufferObjectType>>& recv_bufs, \
+        const vector_class<size_t>& recv_counts, \
+        const communicator& comm, \
+        const alltoallv_attr& attr, \
+        const vector_class<event>& deps); \
+\
+    template event CCL_API broadcast(BufferObjectType& buf, \
+                                     size_t count, \
+                                     int root, \
+                                     const communicator& comm, \
+                                     const stream& op_stream, \
+                                     const broadcast_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API broadcast(BufferObjectType& buf, \
+                                     size_t count, \
+                                     int root, \
+                                     const communicator& comm, \
+                                     const broadcast_attr& attr, \
+                                     const vector_class<event>& deps); \
+\
+    template event CCL_API reduce(const BufferObjectType& send_buf, \
+                                  BufferObjectType& recv_buf, \
+                                  size_t count, \
+                                  reduction reduction, \
+                                  int root, \
+                                  const communicator& comm, \
+                                  const stream& op_stream, \
+                                  const reduce_attr& attr, \
+                                  const vector_class<event>& deps); \
+\
+    template event CCL_API reduce(const BufferObjectType& send_buf, \
+                                  BufferObjectType& recv_buf, \
+                                  size_t count, \
+                                  reduction reduction, \
+                                  int root, \
+                                  const communicator& comm, \
+                                  const reduce_attr& attr, \
+                                  const vector_class<event>& deps); \
+\
+    template event CCL_API reduce_scatter(const BufferObjectType& send_buf, \
+                                          BufferObjectType& recv_buf, \
+                                          size_t recv_count, \
+                                          reduction reduction, \
+                                          const communicator& comm, \
+                                          const stream& op_stream, \
+                                          const reduce_scatter_attr& attr, \
+                                          const vector_class<event>& deps); \
+\
+    template event CCL_API reduce_scatter(const BufferObjectType& send_buf, \
+                                          BufferObjectType& recv_buf, \
+                                          size_t recv_count, \
+                                          reduction reduction, \
+                                          const communicator& comm, \
+                                          const reduce_scatter_attr& attr, \
+                                          const vector_class<event>& deps);
+
+} // namespace v1
+
+namespace preview {
+
+#define API_COMM_SPARSE_OP_PTR_EXPLICIT_INSTANTIATION(index_type, value_type) \
+\
+    template ccl::event CCL_API sparse_allreduce(const index_type* send_ind_buf, \
+                                                 size_t send_ind_count, \
+                                                 const value_type* send_val_buf, \
+                                                 size_t send_val_count, \
+                                                 index_type* recv_ind_buf, \
+                                                 size_t recv_ind_count, \
+                                                 value_type* recv_val_buf, \
+                                                 size_t recv_val_count, \
+                                                 ccl::reduction reduction, \
+                                                 const ccl::communicator& comm, \
+                                                 const ccl::stream& op_stream, \
+                                                 const ccl::sparse_allreduce_attr& attr, \
+                                                 const ccl::vector_class<ccl::event>& deps); \
+\
+    template ccl::event CCL_API sparse_allreduce(const index_type* send_ind_buf, \
+                                                 size_t send_ind_count, \
+                                                 const value_type* send_val_buf, \
+                                                 size_t send_val_count, \
+                                                 index_type* recv_ind_buf, \
+                                                 size_t recv_ind_count, \
+                                                 value_type* recv_val_buf, \
+                                                 size_t recv_val_count, \
+                                                 ccl::reduction reduction, \
+                                                 const ccl::communicator& comm, \
+                                                 const ccl::sparse_allreduce_attr& attr, \
+                                                 const ccl::vector_class<ccl::event>& deps);
+
+/*
+#define API_COMM_SPARSE_OP_REF_EXPLICIT_INSTANTIATION(index_object_type, value_object_type) \
+\
+template ccl::event CCL_API \
+sparse_allreduce(const index_object_type& send_ind_buf, \
+                 size_t send_ind_count, \
+                 const value_object_type& send_val_buf, \
+                 size_t send_val_count, \
+                 index_object_type& recv_ind_buf, \
+                 size_t recv_ind_count, \
+                 value_object_type& recv_val_buf, \
+                 size_t recv_val_count, \
+                 ccl::reduction reduction, \
+                 const ccl::communicator& comm, \
+                 const ccl::stream& op_stream, \
+                 const ccl::sparse_allreduce_attr& attr, \
+                 const ccl::vector_class<event>& deps); \
+\
+template ccl::event CCL_API \
+sparse_allreduce(const index_object_type& send_ind_buf, \
+                 size_t send_ind_count, \
+                 const value_object_type& send_val_buf, \
+                 size_t send_val_count, \
+                 index_object_type& recv_ind_buf, \
+                 size_t recv_ind_count, \
+                 value_object_type& recv_val_buf, \
+                 size_t recv_val_count, \
+                 ccl::reduction reduction, \
+                 const ccl::communicator& comm, \
+                 const ccl::sparse_allreduce_attr& attr, \
+                 const ccl::vector_class<event>& deps);
+*/
+
+} // namespace preview
+
+} // namespace ccl
diff --git a/src/ccl_app_api_coll_attr.cpp b/src/ccl_app_api_coll_attr.cpp
index 63b6829f0..1c519531b 100644
--- a/src/ccl_app_api_coll_attr.cpp
+++ b/src/ccl_app_api_coll_attr.cpp
@@ -13,28 +13,31 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_aliases.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 
 // Core file with PIMPL implementation
 #include "coll_attr_impl.hpp"
 #include "coll/coll_attributes.hpp"
 
 namespace ccl {
+
+namespace v1 {
+
 #define COMMA ,
 
 #define API_FORCE_INSTANTIATION_SET(class_name, IN_attrType, IN_attrId, IN_Value) \
     template CCL_API \
-        typename details::ccl_api_type_attr_traits<IN_attrType, IN_attrId>::return_type \
+        typename detail::ccl_api_type_attr_traits<IN_attrType, IN_attrId>::return_type \
         class_name::set<IN_attrId, IN_Value>(const IN_Value& v);
 
 #define API_FORCE_INSTANTIATION_GET(class_name, IN_attrType, IN_attrId) \
-    template CCL_API const typename details::ccl_api_type_attr_traits<IN_attrType, \
-                                                                      IN_attrId>::return_type& \
+    template CCL_API const typename detail::ccl_api_type_attr_traits<IN_attrType, \
+                                                                     IN_attrId>::return_type& \
     class_name::get<IN_attrId>() const;
 
 #define API_FORCE_INSTANTIATION(class_name, IN_attrType, IN_attrId, IN_Value) \
@@ -44,10 +47,10 @@ namespace ccl {
 #define COMMON_API_FORCE_INSTANTIATION(class_name) \
     API_FORCE_INSTANTIATION( \
         class_name, operation_attr_id, operation_attr_id::version, ccl::library_version) \
-    API_FORCE_INSTANTIATION( \
-        class_name, operation_attr_id, operation_attr_id::prologue_fn, ccl::prologue_fn) \
-    API_FORCE_INSTANTIATION( \
-        class_name, operation_attr_id, operation_attr_id::epilogue_fn, ccl::epilogue_fn) \
+    /*API_FORCE_INSTANTIATION(*/ \
+    /*class_name, operation_attr_id, operation_attr_id::prologue_fn, ccl::prologue_fn)*/ \
+    /*API_FORCE_INSTANTIATION(*/ \
+    /*class_name, operation_attr_id, operation_attr_id::epilogue_fn, ccl::epilogue_fn)*/ \
 \
     API_FORCE_INSTANTIATION_SET( \
         class_name, operation_attr_id, operation_attr_id::priority, size_t) \
@@ -69,9 +72,9 @@ CCL_API allgatherv_attr::allgatherv_attr(allgatherv_attr&& src) : base_t(std::mo
 CCL_API allgatherv_attr::allgatherv_attr(const allgatherv_attr& src) : base_t(src) {}
 
 CCL_API allgatherv_attr::allgatherv_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API allgatherv_attr::~allgatherv_attr() {}
 
@@ -83,9 +86,9 @@ CCL_API allreduce_attr::allreduce_attr(allreduce_attr&& src) : base_t(std::move(
 CCL_API allreduce_attr::allreduce_attr(const allreduce_attr& src) : base_t(src) {}
 
 CCL_API allreduce_attr::allreduce_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API allreduce_attr::~allreduce_attr() {}
 
@@ -97,9 +100,9 @@ CCL_API alltoall_attr::alltoall_attr(alltoall_attr&& src) : base_t(std::move(src
 CCL_API alltoall_attr::alltoall_attr(const alltoall_attr& src) : base_t(src) {}
 
 CCL_API alltoall_attr::alltoall_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API alltoall_attr::~alltoall_attr() {}
 
@@ -111,9 +114,9 @@ CCL_API alltoallv_attr::alltoallv_attr(alltoallv_attr&& src) : base_t(std::move(
 CCL_API alltoallv_attr::alltoallv_attr(const alltoallv_attr& src) : base_t(src) {}
 
 CCL_API alltoallv_attr::alltoallv_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API alltoallv_attr::~alltoallv_attr() {}
 
@@ -125,9 +128,9 @@ CCL_API barrier_attr::barrier_attr(barrier_attr&& src) : base_t(std::move(src))
 CCL_API barrier_attr::barrier_attr(const barrier_attr& src) : base_t(src) {}
 
 CCL_API barrier_attr::barrier_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API barrier_attr::~barrier_attr() {}
 
@@ -139,9 +142,9 @@ CCL_API broadcast_attr::broadcast_attr(broadcast_attr&& src) : base_t(std::move(
 CCL_API broadcast_attr::broadcast_attr(const broadcast_attr& src) : base_t(src) {}
 
 CCL_API broadcast_attr::broadcast_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API broadcast_attr::~broadcast_attr() {}
 
@@ -153,9 +156,9 @@ CCL_API reduce_attr::reduce_attr(reduce_attr&& src) : base_t(std::move(src)) {}
 CCL_API reduce_attr::reduce_attr(const reduce_attr& src) : base_t(src) {}
 
 CCL_API reduce_attr::reduce_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API reduce_attr::~reduce_attr() {}
 
@@ -168,9 +171,9 @@ CCL_API reduce_scatter_attr::reduce_scatter_attr(reduce_scatter_attr&& src)
 CCL_API reduce_scatter_attr::reduce_scatter_attr(const reduce_scatter_attr& src) : base_t(src) {}
 
 CCL_API reduce_scatter_attr::reduce_scatter_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API reduce_scatter_attr::~reduce_scatter_attr() {}
 
@@ -184,9 +187,9 @@ CCL_API sparse_allreduce_attr::sparse_allreduce_attr(const sparse_allreduce_attr
         : base_t(src) {}
 
 CCL_API sparse_allreduce_attr::sparse_allreduce_attr(
-    const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                     operation_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                    operation_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API sparse_allreduce_attr::~sparse_allreduce_attr() {}
 
@@ -195,14 +198,14 @@ CCL_API const void* sparse_allreduce_attr::set<sparse_allreduce_attr_id::fn_ctx,
     const void* const& v) {
     return get_impl()->set_attribute_value(
         v,
-        details::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
-                                          sparse_allreduce_attr_id::fn_ctx>{});
+        detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
+                                         sparse_allreduce_attr_id::fn_ctx>{});
 }
 template <>
 CCL_API const void* const& sparse_allreduce_attr::get<sparse_allreduce_attr_id::fn_ctx>() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
-                                          sparse_allreduce_attr_id::fn_ctx>{});
+        detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
+                                         sparse_allreduce_attr_id::fn_ctx>{});
 }
 
 /**
@@ -248,4 +251,7 @@ API_FORCE_INSTANTIATION(sparse_allreduce_attr,
 #undef API_FORCE_INSTANTIATION
 #undef COMMON_API_FORCE_INSTANTIATION
 #undef COMMA
+
+} // namespace v1
+
 } // namespace ccl
diff --git a/src/ccl_app_api_comm_attr.cpp b/src/ccl_app_api_comm_attr.cpp
new file mode 100644
index 000000000..5468c6bd5
--- /dev/null
+++ b/src/ccl_app_api_comm_attr.cpp
@@ -0,0 +1,77 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/comm_attr_ids.hpp"
+#include "oneapi/ccl/comm_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_attr.hpp"
+
+// Core file with PIMPL implementation
+#include "common/comm/comm_common_attr.hpp"
+#include "comm_attr_impl.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// Explicitly instantiates set<>(), get<>() and is_valid<>() of class_name for one comm_attr_id.
+#define API_FORCE_INSTANTIATION(class_name, IN_attrId, IN_Value, OUT_Traits_Value) \
+    template CCL_API IN_Value class_name::set<IN_attrId, IN_Value>(const IN_Value& v); \
+\
+    template CCL_API const typename OUT_Traits_Value<comm_attr_id, IN_attrId>::type& \
+    class_name::get<IN_attrId>() const; \
+\
+    template CCL_API bool class_name::is_valid<IN_attrId>() const noexcept;
+
+/**
+ * comm_attr: out-of-line constructors, destructor, assignment operators and instantiations.
+ */
+CCL_API comm_attr::comm_attr(ccl_empty_attr)
+        : base_t(impl_value_t(new impl_t(ccl_empty_attr::version))) {}
+CCL_API comm_attr::comm_attr(comm_attr&& src) : base_t(std::move(src)) {}
+
+CCL_API comm_attr::comm_attr(const comm_attr& src) : base_t(src) {}
+
+CCL_API comm_attr::comm_attr(
+    const typename detail::ccl_api_type_attr_traits<comm_attr_id,
+                                                    comm_attr_id::version>::return_type& version)
+        : base_t(impl_value_t(new impl_t(version))) {} // impl built from the given version
+
+CCL_API comm_attr::~comm_attr() noexcept {}
+
+CCL_API comm_attr& comm_attr::operator=(const comm_attr& src) {
+    this->get_impl() = src.get_impl(); // copy-assign src's impl handle into ours
+    return *this;
+}
+
+CCL_API comm_attr& comm_attr::operator=(comm_attr&& src) {
+    if (src.get_impl() != this->get_impl()) { // nothing to do if both hold the same impl
+        src.get_impl().swap(this->get_impl()); // we take src's impl, src gets our old one
+        src.get_impl().reset(); // src then drops what used to be our impl
+    }
+    return *this;
+}
+
+API_FORCE_INSTANTIATION(comm_attr,
+                        comm_attr_id::version,
+                        ccl::library_version,
+                        detail::ccl_api_type_attr_traits) // set/get/is_valid for version
+
+#undef API_FORCE_INSTANTIATION
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/ccl_app_api_comm_split_attr.cpp b/src/ccl_app_api_comm_split_attr.cpp
index 3a7c2dba8..71f4b70dd 100644
--- a/src/ccl_app_api_comm_split_attr.cpp
+++ b/src/ccl_app_api_comm_split_attr.cpp
@@ -13,24 +13,25 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_aliases.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
 
 // Core file with PIMPL implementation
 #include "common/comm/comm_split_common_attr.hpp"
 #include "comm_split_attr_impl.hpp"
 
 namespace ccl {
-#define COMMA ,
+
+namespace v1 {
+
 #define API_FORCE_INSTANTIATION(class_name, IN_attrId, IN_Value, OUT_Traits_Value) \
     template CCL_API IN_Value class_name::set<IN_attrId, IN_Value>(const IN_Value& v); \
 \
-    template CCL_API const typename details::OUT_Traits_Value<comm_split_attr_id, \
-                                                              IN_attrId>::type& \
+    template CCL_API const typename OUT_Traits_Value<comm_split_attr_id, IN_attrId>::type& \
     class_name::get<IN_attrId>() const; \
 \
     template CCL_API bool class_name::is_valid<IN_attrId>() const noexcept;
@@ -39,17 +40,15 @@ namespace ccl {
  * comm_split_attr attributes definition
  */
 CCL_API comm_split_attr::comm_split_attr(ccl_empty_attr)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(ccl_empty_attr::version))) {}
-CCL_API comm_split_attr::comm_split_attr(comm_split_attr&& src)
-        : base_t(std::move(src)) {}
+        : base_t(impl_value_t(new impl_t(ccl_empty_attr::version))) {}
+CCL_API comm_split_attr::comm_split_attr(comm_split_attr&& src) : base_t(std::move(src)) {}
 
-CCL_API comm_split_attr::comm_split_attr(const comm_split_attr& src)
-        : base_t(src) {}
+CCL_API comm_split_attr::comm_split_attr(const comm_split_attr& src) : base_t(src) {}
 
 CCL_API comm_split_attr::comm_split_attr(
-    const typename details::ccl_api_type_attr_traits<comm_split_attr_id,
-                                                  comm_split_attr_id::version>::type& version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
+    const typename detail::ccl_api_type_attr_traits<comm_split_attr_id,
+                                                    comm_split_attr_id::version>::type& version)
+        : base_t(impl_value_t(new impl_t(version))) {}
 
 CCL_API comm_split_attr::~comm_split_attr() noexcept {}
 
@@ -65,19 +64,22 @@ CCL_API comm_split_attr& comm_split_attr::operator=(comm_split_attr&& src) {
     }
     return *this;
 }
+
 API_FORCE_INSTANTIATION(comm_split_attr,
                         comm_split_attr_id::color,
                         int,
-                        ccl_api_type_attr_traits)
+                        detail::ccl_api_type_attr_traits)
 API_FORCE_INSTANTIATION(comm_split_attr,
                         comm_split_attr_id::group,
-                        group_split_type,
-                        ccl_api_type_attr_traits)
+                        split_group,
+                        detail::ccl_api_type_attr_traits)
 API_FORCE_INSTANTIATION(comm_split_attr,
                         comm_split_attr_id::version,
                         ccl::library_version,
-                        ccl_api_type_attr_traits)
+                        detail::ccl_api_type_attr_traits)
 
 #undef API_FORCE_INSTANTIATION
-#undef COMMA
+
+} // namespace v1
+
 } // namespace ccl
diff --git a/src/ccl_app_api_datatype_attr.cpp b/src/ccl_app_api_datatype_attr.cpp
index ed7ca3bc5..0f1dcfacc 100644
--- a/src/ccl_app_api_datatype_attr.cpp
+++ b/src/ccl_app_api_datatype_attr.cpp
@@ -13,75 +13,74 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_aliases.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_datatype_attr_ids.hpp"
-#include "oneapi/ccl/ccl_datatype_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_datatype_attr.hpp"
-
-// Core file with PIMPL implementation
-#include "common/datatype/datatype_attr.hpp"
-#include "datatype_attr_impl.hpp"
-
-namespace ccl {
-
-#define COMMA ,
-#define API_FORCE_SETTER_INSTANTIATION(class_name, IN_attrId, IN_Value, OUT_Traits_Value) \
-    template CCL_API IN_Value class_name::set<IN_attrId, IN_Value>(const IN_Value& v);
-
-#define API_FORCE_GETTER_INSTANTIATION(class_name, IN_attrId, IN_Value, OUT_Traits_Value) \
-    template CCL_API const typename details::OUT_Traits_Value<datatype_attr_id, \
-                                                              IN_attrId>::return_type& \
-    class_name::get<IN_attrId>() const;
-
-/**
- * datatype_attr attributes definition
- */
-CCL_API datatype_attr::datatype_attr(datatype_attr&& src) : base_t(std::move(src)) {}
-
-CCL_API datatype_attr::datatype_attr(const datatype_attr& src) : base_t(src) {}
-
-CCL_API datatype_attr::datatype_attr(
-    const typename details::ccl_api_type_attr_traits<datatype_attr_id,
-                                                     datatype_attr_id::version>::return_type&
-        version)
-        : base_t(std::shared_ptr<impl_t>(new impl_t(version))) {}
-
-CCL_API datatype_attr::~datatype_attr() noexcept {}
-
-CCL_API datatype_attr& datatype_attr::operator=(const datatype_attr& src) {
-    this->get_impl() = src.get_impl();
-    return *this;
-}
-
-CCL_API datatype_attr& datatype_attr::operator=(datatype_attr&& src) {
-    if (src.get_impl() != this->get_impl()) {
-        src.get_impl().swap(this->get_impl());
-        src.get_impl().reset();
-    }
-    return *this;
-}
-
-API_FORCE_SETTER_INSTANTIATION(datatype_attr,
-                               datatype_attr_id::size,
-                               int,
-                               ccl_api_type_attr_traits);
-API_FORCE_SETTER_INSTANTIATION(datatype_attr,
-                               datatype_attr_id::size,
-                               size_t,
-                               ccl_api_type_attr_traits);
-API_FORCE_GETTER_INSTANTIATION(datatype_attr,
-                               datatype_attr_id::size,
-                               size_t,
-                               ccl_api_type_attr_traits);
-API_FORCE_GETTER_INSTANTIATION(datatype_attr,
-                               datatype_attr_id::version,
-                               ccl::library_version,
-                               ccl_api_type_attr_traits);
-
-#undef API_FORCE_SETTER_INSTANTIATION
-#undef API_FORCE_GETTER_INSTANTIATION
-#undef COMMA
-
-} // namespace ccl
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/datatype_attr_ids.hpp"
+#include "oneapi/ccl/datatype_attr_ids_traits.hpp"
+#include "oneapi/ccl/datatype_attr.hpp"
+
+// Core file with PIMPL implementation
+#include "common/datatype/datatype_attr.hpp"
+#include "datatype_attr_impl.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// Explicit instantiation of class_name::set<>(); OUT_Traits_Value is accepted but unused.
+#define API_FORCE_SETTER_INSTANTIATION(class_name, IN_attrId, IN_Value, OUT_Traits_Value) \
+    template CCL_API IN_Value class_name::set<IN_attrId, IN_Value>(const IN_Value& v);
+// Explicit instantiation of class_name::get<>(); the result type comes from OUT_Traits_Value.
+#define API_FORCE_GETTER_INSTANTIATION(class_name, IN_attrId, OUT_Traits_Value) \
+    template CCL_API const typename OUT_Traits_Value<datatype_attr_id, IN_attrId>::return_type& \
+    class_name::get<IN_attrId>() const;
+
+/**
+ * datatype_attr: out-of-line constructors, destructor, assignment operators and instantiations.
+ */
+CCL_API datatype_attr::datatype_attr(datatype_attr&& src) : base_t(std::move(src)) {}
+
+CCL_API datatype_attr::datatype_attr(const datatype_attr& src) : base_t(src) {}
+
+CCL_API datatype_attr::datatype_attr(
+    const typename detail::ccl_api_type_attr_traits<datatype_attr_id,
+                                                    datatype_attr_id::version>::return_type&
+        version)
+        : base_t(impl_value_t(new impl_t(version))) {} // impl built from the given version
+
+CCL_API datatype_attr::~datatype_attr() noexcept {}
+
+CCL_API datatype_attr& datatype_attr::operator=(const datatype_attr& src) {
+    this->get_impl() = src.get_impl(); // copy-assign src's impl handle into ours
+    return *this;
+}
+
+CCL_API datatype_attr& datatype_attr::operator=(datatype_attr&& src) {
+    if (src.get_impl() != this->get_impl()) { // nothing to do if both hold the same impl
+        src.get_impl().swap(this->get_impl()); // we take src's impl, src gets our old one
+        src.get_impl().reset(); // src then drops what used to be our impl
+    }
+    return *this;
+}
+
+API_FORCE_SETTER_INSTANTIATION(datatype_attr,
+                               datatype_attr_id::size,
+                               int,
+                               detail::ccl_api_type_attr_traits); // size settable from int
+API_FORCE_SETTER_INSTANTIATION(datatype_attr,
+                               datatype_attr_id::size,
+                               size_t,
+                               detail::ccl_api_type_attr_traits); // ... and from size_t
+API_FORCE_GETTER_INSTANTIATION(datatype_attr,
+                               datatype_attr_id::size,
+                               detail::ccl_api_type_attr_traits);
+API_FORCE_GETTER_INSTANTIATION(datatype_attr,
+                               datatype_attr_id::version,
+                               detail::ccl_api_type_attr_traits); // version: getter only
+
+#undef API_FORCE_SETTER_INSTANTIATION
+#undef API_FORCE_GETTER_INSTANTIATION
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/ccl_app_api_event.cpp b/src/ccl_app_api_event.cpp
index 143cbbdbb..c2ca83a55 100644
--- a/src/ccl_app_api_event.cpp
+++ b/src/ccl_app_api_event.cpp
@@ -13,70 +13,78 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "common/event/impls/event_impl.hpp"
-#include "common/event/impls/empty_event.hpp"
-#include "common/event/impls/native_event.hpp"
-
-namespace ccl {
-
-CCL_API event::event() noexcept : base_t(impl_value_t(new empty_event_impl())) {}
-CCL_API event::event(event&& src) noexcept : base_t(std::move(src)) {}
-CCL_API event::event(impl_value_t&& impl) noexcept : base_t(std::move(impl)) {}
-CCL_API event::~event() noexcept {}
-
-CCL_API event& event::operator=(event&& src) noexcept {
-    if (this->get_impl() != src.get_impl()) {
-        this->get_impl() = std::move(src.get_impl());
-    }
-    return *this;
-}
-
-bool CCL_API event::operator==(const event& rhs) const noexcept {
-    return this->get_impl() == rhs.get_impl();
-}
-
-bool CCL_API event::operator!=(const event& rhs) const noexcept {
-    return this->get_impl() != rhs.get_impl();
-}
-
-CCL_API event::operator bool() {
-    return this->test();
-}
-
-void CCL_API event::wait() {
-    get_impl()->wait();
-}
-
-bool CCL_API event::test() {
-    return get_impl()->test();
-}
-
-bool CCL_API event::cancel() {
-    return get_impl()->cancel();
-}
-
-CCL_API event::native_t& event::get_native() {
-    return const_cast<event::native_t&>(get_impl()->get_native());
-}
-
-CCL_API const event::native_t& event::get_native() const {
-    return get_impl()->get_native();
-}
-
-event CCL_API event::create_from_native(native_t& native_event) {
-    library_version version;
-    version.major = CCL_MAJOR_VERSION;
-    version.minor = CCL_MINOR_VERSION;
-    version.update = CCL_UPDATE_VERSION;
-    version.product_status = CCL_PRODUCT_STATUS;
-    version.build_date = CCL_PRODUCT_BUILD_DATE;
-    version.full = CCL_PRODUCT_FULL;
-
-    return impl_value_t(
-        new native_event_impl(native_event, version)
-    );
-}
-
-} // namespace ccl
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "common/event/impls/event_impl.hpp"
+#include "common/event/impls/empty_event.hpp"
+#include "common/event/impls/native_event.hpp"
+#include "common/utils/version.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// NOTE(review): the noexcept default ctor allocates; a throwing new would end in std::terminate.
+CCL_API event::event() noexcept : base_t(impl_value_t(new empty_event_impl())) {}
+CCL_API event::event(event&& src) noexcept : base_t(std::move(src)) {}
+CCL_API event::event(impl_value_t&& impl) noexcept : base_t(std::move(impl)) {}
+CCL_API event::~event() noexcept {}
+
+CCL_API event& event::operator=(event&& src) noexcept {
+    if (this->get_impl() != src.get_impl()) { // skip when both already hold the same impl
+        this->get_impl() = std::move(src.get_impl()); // src's impl is left moved-from
+    }
+    return *this;
+}
+
+bool CCL_API event::operator==(const event& rhs) const noexcept {
+    return this->get_impl() == rhs.get_impl(); // compares the impl handles themselves
+}
+
+bool CCL_API event::operator!=(const event& rhs) const noexcept {
+    return this->get_impl() != rhs.get_impl(); // negation of operator== above
+}
+
+CCL_API event::operator bool() {
+    return this->test(); // conversion to bool delegates to test()
+}
+
+void CCL_API event::wait() {
+    get_impl()->wait(); // forwarded to the impl
+}
+
+bool CCL_API event::test() {
+    return get_impl()->test(); // forwarded to the impl
+}
+
+bool CCL_API event::cancel() {
+    return get_impl()->cancel(); // forwarded to the impl
+}
+
+CCL_API event::native_t& event::get_native() {
+    return const_cast<event::native_t&>(get_impl()->get_native()); // drops the impl's const
+}
+
+CCL_API const event::native_t& event::get_native() const {
+    return get_impl()->get_native();
+}
+
+event CCL_API event::create_from_native(native_t& native_event) {
+    auto version = utils::get_library_version();
+    // ccl_event wraps the native event; native_event_impl then takes ownership of it.
+    auto ev = std::unique_ptr<ccl_event>(new ccl_event(native_event, version));
+
+    return impl_value_t(new native_event_impl(std::move(ev))); // event is built from this impl
+}
+
+event CCL_API event::create_from_native(native_handle_t native_event_handle, context_t context) {
+    auto version = utils::get_library_version();
+    // Same flow as the native_t& overload, but starting from a handle plus its context.
+    auto ev = std::unique_ptr<ccl_event>(new ccl_event(native_event_handle, context, version));
+    ev->build_from_params(); // extra step for this overload; defined by ccl_event
+
+    return impl_value_t(new native_event_impl(std::move(ev))); // event is built from this impl
+}
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/ccl_app_api_init_attr.cpp b/src/ccl_app_api_init_attr.cpp
new file mode 100644
index 000000000..2e2b72d8b
--- /dev/null
+++ b/src/ccl_app_api_init_attr.cpp
@@ -0,0 +1,71 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/init_attr_ids.hpp"
+#include "oneapi/ccl/init_attr_ids_traits.hpp"
+#include "oneapi/ccl/init_attr.hpp"
+
+// Core file with PIMPL implementation
+#include "init_attr_impl.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// Explicit instantiation of class_name::set<>(); OUT_Traits_Value is accepted but unused.
+#define API_FORCE_SETTER_INSTANTIATION(class_name, IN_attrId, IN_Value, OUT_Traits_Value) \
+    template CCL_API IN_Value class_name::set<IN_attrId, IN_Value>(const IN_Value& v);
+// Explicit instantiation of class_name::get<>(); the result type comes from OUT_Traits_Value.
+#define API_FORCE_GETTER_INSTANTIATION(class_name, IN_attrId, OUT_Traits_Value) \
+    template CCL_API const typename OUT_Traits_Value<init_attr_id, IN_attrId>::return_type& \
+    class_name::get<IN_attrId>() const;
+
+/**
+ * init_attr: out-of-line constructors, destructor, assignment operators and instantiations.
+ */
+CCL_API init_attr::init_attr(init_attr&& src) : base_t(std::move(src)) {}
+
+CCL_API init_attr::init_attr(const init_attr& src) : base_t(src) {}
+
+CCL_API init_attr::init_attr(
+    const typename detail::ccl_api_type_attr_traits<init_attr_id,
+                                                    init_attr_id::version>::return_type& version)
+        : base_t(impl_value_t(new impl_t(version))) {} // impl built from the given version
+
+CCL_API init_attr::~init_attr() noexcept {}
+
+CCL_API init_attr& init_attr::operator=(const init_attr& src) {
+    this->get_impl() = src.get_impl(); // copy-assign src's impl handle into ours
+    return *this;
+}
+
+CCL_API init_attr& init_attr::operator=(init_attr&& src) {
+    if (src.get_impl() != this->get_impl()) { // nothing to do if both hold the same impl
+        src.get_impl().swap(this->get_impl()); // we take src's impl, src gets our old one
+        src.get_impl().reset(); // src then drops what used to be our impl
+    }
+    return *this;
+}
+// Only the version getter is instantiated; the setter macro above is not invoked in this file.
+API_FORCE_GETTER_INSTANTIATION(init_attr, init_attr_id::version, detail::ccl_api_type_attr_traits);
+
+#undef API_FORCE_SETTER_INSTANTIATION
+#undef API_FORCE_GETTER_INSTANTIATION
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/ccl_app_api_kvs_attr.cpp b/src/ccl_app_api_kvs_attr.cpp
new file mode 100644
index 000000000..a21bb5381
--- /dev/null
+++ b/src/ccl_app_api_kvs_attr.cpp
@@ -0,0 +1,77 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/kvs_attr_ids.hpp"
+#include "oneapi/ccl/kvs_attr_ids_traits.hpp"
+#include "oneapi/ccl/kvs_attr.hpp"
+
+// Core file with PIMPL implementation
+#include "atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/kvs_common_attr.hpp"
+#include "kvs_attr_impl.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// Explicitly instantiates set<>(), get<>() and is_valid<>() of class_name for one kvs_attr_id.
+#define API_FORCE_INSTANTIATION(class_name, IN_attrId, IN_Value, OUT_Traits_Value) \
+    template CCL_API IN_Value class_name::set<IN_attrId, IN_Value>(const IN_Value& v); \
+\
+    template CCL_API const typename OUT_Traits_Value<kvs_attr_id, IN_attrId>::type& \
+    class_name::get<IN_attrId>() const; \
+\
+    template CCL_API bool class_name::is_valid<IN_attrId>() const noexcept;
+
+/**
+ * kvs_attr: out-of-line constructors, destructor, assignment operators and instantiations.
+ */
+CCL_API kvs_attr::kvs_attr(ccl_empty_attr)
+        : base_t(impl_value_t(new impl_t(ccl_empty_attr::version))) {}
+CCL_API kvs_attr::kvs_attr(kvs_attr&& src) : base_t(std::move(src)) {}
+
+CCL_API kvs_attr::kvs_attr(const kvs_attr& src) : base_t(src) {}
+
+CCL_API kvs_attr::kvs_attr(
+    const typename detail::ccl_api_type_attr_traits<kvs_attr_id, kvs_attr_id::version>::return_type&
+        version)
+        : base_t(impl_value_t(new impl_t(version))) {} // impl built from the given version
+
+CCL_API kvs_attr::~kvs_attr() noexcept {}
+
+CCL_API kvs_attr& kvs_attr::operator=(const kvs_attr& src) {
+    this->get_impl() = src.get_impl(); // copy-assign src's impl handle into ours
+    return *this;
+}
+
+CCL_API kvs_attr& kvs_attr::operator=(kvs_attr&& src) {
+    if (src.get_impl() != this->get_impl()) { // nothing to do if both hold the same impl
+        src.get_impl().swap(this->get_impl()); // we take src's impl, src gets our old one
+        src.get_impl().reset(); // src then drops what used to be our impl
+    }
+    return *this;
+}
+
+API_FORCE_INSTANTIATION(kvs_attr,
+                        kvs_attr_id::version,
+                        ccl::library_version,
+                        detail::ccl_api_type_attr_traits) // set/get/is_valid for version
+
+#undef API_FORCE_INSTANTIATION
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/ccl_cpp_api.cpp b/src/ccl_cpp_api.cpp
deleted file mode 100644
index 813d69471..000000000
--- a/src/ccl_cpp_api.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#if 0
-#include "oneapi/ccl.hpp"
-
-#include "coll/coll_attributes.hpp"
-
-#include "common/comm/comm_split_common_attr.hpp"
-#include "comm_split_attr_impl.hpp"
-
-#include "common/comm/l0/comm_context_storage.hpp"
-
-#include "common/event/event_internal/event_internal_impl.hpp"
-#include "stream_impl.hpp"
-
-#include "common/global/global.hpp"
-#include "common/comm/comm.hpp"
-
-#include "common/comm/l0/comm_context.hpp"
-#include "oneapi/ccl/ccl_communicator.hpp"
-
-#include "common/global/global.hpp"
-#include "exec/exec.hpp"
-
-#include "common/comm/comm_interface.hpp"
-
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-
-#ifdef CCL_ENABLE_SYCL
-#include <CL/sycl.hpp>
-#endif
-
-#define CCL_CHECK_AND_THROW(result, diagnostic) \
-    do { \
-        if (result != ccl_status_success) { \
-            throw ccl::exception(diagnostic); \
-        } \
-    } while (0);
-
-
-namespace ccl
-{
-
-CCL_API ccl::environment::environment()
-{
-    static auto result = global_data::get().init();
-    CCL_CHECK_AND_THROW(result, "failed to initialize CCL");
-}
-
-CCL_API ccl::environment::~environment()
-{}
-
-CCL_API ccl::environment& ccl::environment::instance()
-{
-    static ccl::environment env;
-    return env;
-}
-
-void CCL_API ccl::environment::set_resize_fn(ccl_resize_fn_t callback)
-{
-    ccl_status_t result = ccl_set_resize_fn(callback);
-    CCL_CHECK_AND_THROW(result, "failed to set resize callback");
-    return;
-}
-
-ccl::library_version CCL_API ccl::environment::get_version() const
-{
-    ccl::library_version ret;
-    ccl_status_t result = ccl_get_version(&ret);
-    CCL_CHECK_AND_THROW(result, "failed to get version");
-    return ret;
-}
-/*
-static ccl::stream& get_empty_stream()
-{
-    static ccl::stream_t empty_stream  = ccl::environment::instance().create_stream();
-    return empty_stream;
-}
-*/
-
-/**
- * Factory methods
- */
-// KVS
-kvs_t CCL_API environment::create_main_kvs() const
-{
-    return std::shared_ptr<kvs>(new kvs);
-}
-
-kvs_t CCL_API environment::create_kvs(const kvs::addr_t& addr) const
-{
-    return std::shared_ptr<kvs>(new kvs(addr));
-}
-
-//Communicator
-communicator CCL_API environment::create_communicator() const
-{
-    return communicator::create_communicator();
-}
-
-communicator CCL_API environment::create_communicator(const size_t size,
-                                       shared_ptr_class<kvs_interface> kvs) const
-{
-    return communicator::create_communicator(size, kvs);
-}
-
-communicator CCL_API environment::create_communicator(const size_t size,
-                                     const size_t rank,
-                                     shared_ptr_class<kvs_interface> kvs) const
-{
-    return communicator::create_communicator(size, rank, kvs);
-}
-
-//Device communicator
-#ifdef MULTI_GPU_SUPPORT
-
-template <class ...attr_value_pair_t>
-comm_split_attr environment::create_comm_split_attr(attr_value_pair_t&&...avps) const
-{
-    return comm_split_attr::create_comm_split_attr(std::forward<attr_value_pair_t>(avps)...);
-}
-
-template<class DeviceType,
-             class ContextType>
-vector_class<communicator> CCL_API environment::create_communicators(
-        const size_t devices_size,
-        const vector_class<DeviceType>& local_devices,
-        ContextType& context,
-        shared_ptr_class<kvs_interface> kvs) const
-{
-    return communicator::create_communicators(devices_size, local_devices, context, kvs);
-}
-
-template<class DeviceType,
-         class ContextType>
-vector_class<communicator> CCL_API environment::create_communicators(
-        const size_t cluster_devices_size, /*global devics count*/
-        const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-        ContextType& context,
-        shared_ptr_class<kvs_interface> kvs)
-{
-    return communicator::create_communicators(cluster_devices_size, local_rank_device_map, context, kvs);
-}
-
-
-template<class DeviceType,
-         class ContextType>
-vector_class<communicator> CCL_API environment::create_communicators(
-        const size_t cluster_devices_size, /*global devics count*/
-        const map_class<rank_t, DeviceType>& local_rank_device_map,
-        ContextType& context,
-        shared_ptr_class<kvs_interface> kvs)
-{
-    return communicator::create_communicators(cluster_devices_size, local_rank_device_map, context, kvs);
-}
-
-
-//Stream
-template <class native_stream_type,
-          typename T>
-stream CCL_API environment::create_stream(native_stream_type& native_stream)
-{
-    return stream::create_stream(native_stream);
-}
-
-template <class native_stream_type, class native_context_type,
-          typename T>
-stream CCL_API environment::create_stream(native_stream_type& native_stream, native_context_type& native_ctx)
-{
-    return stream::create_stream(native_stream, native_ctx);
-}
-
-template <class ...attr_value_pair_t>
-stream CCL_API environment::create_stream_from_attr(typename unified_device_type::ccl_native_t device, attr_value_pair_t&&...avps)
-{
-    return stream::create_stream_from_attr(device, std::forward<attr_value_pair_t>(avps)...);
-}
-
-template <class ...attr_value_pair_t>
-stream CCL_API environment::create_stream_from_attr(typename unified_device_type::ccl_native_t device,
-                               typename unified_device_context_type::ccl_native_t context,
-                               attr_value_pair_t&&...avps)
-{
-    return stream::create_stream_from_attr(device, context, std::forward<attr_value_pair_t>(avps)...);
-}
-
-
-//Event
-template <class event_type,
-          typename T>
-event CCL_API environment::create_event(event_type& native_event)
-{
-    return event::create_event(native_event);
-}
-
-template <class event_type,
-          class ...attr_value_pair_t>
-event CCL_API environment::create_event_from_attr(event_type& native_event_handle,
-                             typename unified_device_context_type::ccl_native_t context,
-                             attr_value_pair_t&&...avps)
-{
-    return event::create_event_from_attr(native_event_handle, context,  std::forward<attr_value_pair_t>(avps)...);
-}
-/*
-#define STREAM_CREATOR_INSTANTIATION(type)                                                                                                           \
-template ccl::stream_t CCL_API ccl::environment::create_stream(type& stream);
-
-#ifdef CCL_ENABLE_SYCL
-STREAM_CREATOR_INSTANTIATION(cl::sycl::queue)
-#endif
-*/
-#endif //MULTI_GPU_SUPPORT
-}
-#include "types_generator_defines.hpp"
-#include "oneapi/ccl/ccl_cpp_api_explicit_in.hpp"
-#endif //0
diff --git a/src/ccl_cpp_api_explicit_in.hpp b/src/ccl_cpp_api_explicit_in.hpp
deleted file mode 100644
index 9a36f6393..000000000
--- a/src/ccl_cpp_api_explicit_in.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#ifndef COMMA
-#define COMMA ,
-#endif
-
-//TODO
-#if 0
-/**
- * Attributes
- */
-HOST_ATTRIBUTE_INSTANTIATION(ccl_host_color,
-                               typename ccl::comm_split_attr_id_traits<ccl_host_color>::type);
-HOST_ATTRIBUTE_INSTANTIATION(ccl_host_version,
-                               typename ccl::comm_split_attr_id_traits<ccl_host_version>::type);
-
-API_COLL_EXPLICIT_INSTANTIATION(char);
-API_COLL_EXPLICIT_INSTANTIATION(int);
-API_COLL_EXPLICIT_INSTANTIATION(int64_t);
-API_COLL_EXPLICIT_INSTANTIATION(uint64_t);
-API_COLL_EXPLICIT_INSTANTIATION(float);
-API_COLL_EXPLICIT_INSTANTIATION(double);
-
-#ifdef CCL_ENABLE_SYCL
-    API_COLL_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<char COMMA 1>);
-    API_COLL_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<int COMMA 1>);
-    API_COLL_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>);
-    API_COLL_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<uint64_t COMMA 1>);
-    API_COLL_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<float COMMA 1>);
-    API_COLL_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(char, char);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(char, int);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(char, ccl::bf16);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(char, float);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(char, double);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(char, int64_t);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(char, uint64_t);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int, char);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int, int);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int, ccl::bf16);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int, float);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int, double);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int, int64_t);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int, uint64_t);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int64_t, char);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int64_t, int);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int64_t, ccl::bf16);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int64_t, float);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int64_t, double);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int64_t, int64_t);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(int64_t, uint64_t);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(uint64_t, char);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(uint64_t, int);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(uint64_t, ccl::bf16);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(uint64_t, float);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(uint64_t, double);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(uint64_t, int64_t);
-API_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(uint64_t, uint64_t);
-
-#ifdef CCL_ENABLE_SYCL
-    API_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<int COMMA 1>,
-                                                      cl::sycl::buffer<float COMMA 1>);
-    API_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<int COMMA 1>,
-                                                      cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-    API_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                      cl::sycl::buffer<float COMMA 1>);
-    API_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                      cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-#undef COMMA
-
-#endif //TODO
diff --git a/src/ccl_cpp_communicator.cpp b/src/ccl_cpp_communicator.cpp
index 5a1743776..7e608b59b 100644
--- a/src/ccl_cpp_communicator.cpp
+++ b/src/ccl_cpp_communicator.cpp
@@ -13,31 +13,39 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_aliases.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
 
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
 
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
+#include "oneapi/ccl/comm_attr_ids.hpp"
+#include "oneapi/ccl/comm_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_attr.hpp"
 
-#include "common/event/event_internal/event_internal_attr_ids.hpp"
-#include "common/event/event_internal/event_internal_attr_ids_traits.hpp"
-#include "common/event/event_internal/event_internal.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
 
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_stream.hpp"
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/stream.hpp"
 
-#include "oneapi/ccl/ccl_event.hpp"
+#include "oneapi/ccl/device_attr_ids.hpp"
+#include "oneapi/ccl/device_attr_ids_traits.hpp"
+#include "oneapi/ccl/device.hpp"
 
-#include "oneapi/ccl/ccl_communicator.hpp"
+#include "oneapi/ccl/context_attr_ids.hpp"
+#include "oneapi/ccl/context_attr_ids_traits.hpp"
+#include "oneapi/ccl/context.hpp"
+
+#include "oneapi/ccl/event.hpp"
+
+#include "oneapi/ccl/communicator.hpp"
 #include "common/comm/l0/comm_context_storage.hpp"
 
 #include "common/global/global.hpp"
@@ -50,10 +58,11 @@
 
 namespace ccl {
 
+namespace v1 {
+
 CCL_API communicator::communicator(impl_value_t&& impl) : base_t(std::move(impl)) {}
 
-CCL_API communicator::communicator(communicator&& src)
-        : base_t(std::move(src)) {}
+CCL_API communicator::communicator(communicator&& src) : base_t(std::move(src)) {}
 
 CCL_API communicator& communicator::operator=(communicator&& src) {
     if (src.get_impl() != this->get_impl()) {
@@ -65,11 +74,11 @@ CCL_API communicator& communicator::operator=(communicator&& src) {
 
 CCL_API communicator::~communicator() {}
 
-CCL_API size_t communicator::rank() const {
+CCL_API int communicator::rank() const {
     return get_impl()->rank();
 }
 
-CCL_API size_t communicator::size() const {
+CCL_API int communicator::size() const {
     return get_impl()->size();
 }
 
@@ -82,32 +91,37 @@ CCL_API communicator communicator::split(const comm_split_attr& attr) {
     return communicator(get_impl()->split(attr));
 }
 
-CCL_API communicator::ccl_device_t communicator::get_device() {
-    return get_impl()->get_device();
+CCL_API device communicator::get_device() const {
+    return device::create_device(get_impl()->get_device());
 }
 
-CCL_API communicator::ccl_context_t communicator::get_context() {
-    return get_impl()->get_context();
+CCL_API context communicator::get_context() const {
+    return context::create_context(get_impl()->get_context());
 }
 
+} // namespace v1
+
 } // namespace ccl
 
 /****API force instantiations for factory methods******/
-API_DEVICE_COMM_CREATE_WO_RANK_EXPLICIT_INSTANTIATION(ccl::device, ccl::context)
-API_DEVICE_COMM_CREATE_WITH_RANK_IN_VECTOR_EXPLICIT_INSTANTIATION(ccl::device,
-                                                                  ccl::context)
-API_DEVICE_COMM_CREATE_WITH_RANK_IN_MAP_EXPLICIT_INSTANTIATION(ccl::device, ccl::context)
-
-API_DEVICE_COMM_CREATE_WO_RANK_EXPLICIT_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t, typename ccl::unified_device_context_type::ccl_native_t)
-API_DEVICE_COMM_CREATE_WITH_RANK_IN_VECTOR_EXPLICIT_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t, typename ccl::unified_device_context_type::ccl_native_t)
-API_DEVICE_COMM_CREATE_WITH_RANK_IN_MAP_EXPLICIT_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t, typename ccl::unified_device_context_type::ccl_native_t)
-
-API_DEVICE_COMM_CREATE_WO_RANK_EXPLICIT_INSTANTIATION(
-    ccl::device_index_type,
-    typename ccl::unified_device_context_type::ccl_native_t)
-API_DEVICE_COMM_CREATE_WITH_RANK_IN_VECTOR_EXPLICIT_INSTANTIATION(
+API_COMM_CREATE_WO_RANK_EXPLICIT_INSTANTIATION(ccl::device, ccl::context)
+API_COMM_CREATE_WITH_RANK_IN_VECTOR_EXPLICIT_INSTANTIATION(ccl::device, ccl::context)
+API_COMM_CREATE_WITH_RANK_IN_MAP_EXPLICIT_INSTANTIATION(ccl::device, ccl::context)
+
+API_COMM_CREATE_WO_RANK_EXPLICIT_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t,
+                                               typename ccl::unified_context_type::ccl_native_t)
+API_COMM_CREATE_WITH_RANK_IN_VECTOR_EXPLICIT_INSTANTIATION(
+    typename ccl::unified_device_type::ccl_native_t,
+    typename ccl::unified_context_type::ccl_native_t)
+API_COMM_CREATE_WITH_RANK_IN_MAP_EXPLICIT_INSTANTIATION(
+    typename ccl::unified_device_type::ccl_native_t,
+    typename ccl::unified_context_type::ccl_native_t)
+
+API_COMM_CREATE_WO_RANK_EXPLICIT_INSTANTIATION(ccl::device_index_type,
+                                               typename ccl::unified_context_type::ccl_native_t)
+API_COMM_CREATE_WITH_RANK_IN_VECTOR_EXPLICIT_INSTANTIATION(
     ccl::device_index_type,
-    typename ccl::unified_device_context_type::ccl_native_t)
-API_DEVICE_COMM_CREATE_WITH_RANK_IN_MAP_EXPLICIT_INSTANTIATION(
+    typename ccl::unified_context_type::ccl_native_t)
+API_COMM_CREATE_WITH_RANK_IN_MAP_EXPLICIT_INSTANTIATION(
     ccl::device_index_type,
-    typename ccl::unified_device_context_type::ccl_native_t)
+    typename ccl::unified_context_type::ccl_native_t)
diff --git a/src/ccl_cpp_context.cpp b/src/ccl_cpp_context.cpp
index ff7e409d3..7db9a742b 100644
--- a/src/ccl_cpp_context.cpp
+++ b/src/ccl_cpp_context.cpp
@@ -13,11 +13,13 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "context_impl.hpp"
 
 namespace ccl {
 
+namespace v1 {
+
 CCL_API context::context(context&& src) : base_t(std::move(src)) {}
 CCL_API context::context(const context& src) : base_t(src) {}
 
@@ -40,25 +42,37 @@ CCL_API context& context::operator=(const context& src) {
     return *this;
 }
 
+bool CCL_API context::operator==(const context& rhs) const noexcept {
+    return this->get_impl() == rhs.get_impl(); // equality is delegated to the impl handles
+}
+// Inequality is delegated to the impl handles returned by get_impl().
+bool CCL_API context::operator!=(const context& rhs) const noexcept {
+    return this->get_impl() != rhs.get_impl();
+}
+// Orders two contexts by comparing the impl handles returned by get_impl().
+bool CCL_API context::operator<(const context& rhs) const noexcept {
+    return this->get_impl() < rhs.get_impl();
+}
+
 CCL_API void context::build_from_params() {
     get_impl()->build_from_params();
 }
 
-CCL_API context::native_t& context::get_native()
-{
+CCL_API context::native_t& context::get_native() {
     return const_cast<context::native_t&>(static_cast<const context*>(this)->get_native());
 }
 
-CCL_API const context::native_t& context::get_native() const
-{
+CCL_API const context::native_t& context::get_native() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<ccl::context_attr_id, ccl::context_attr_id::native_handle>{});
+        detail::ccl_api_type_attr_traits<context_attr_id, context_attr_id::native_handle>{});
 }
-} // namespace ccl
 
+} // namespace v1
+
+} // namespace ccl
 
-API_DEVICE_CONTEXT_CREATION_FORCE_INSTANTIATION(typename ccl::unified_device_context_type::ccl_native_t)
+API_CONTEXT_CREATION_FORCE_INSTANTIATION(typename ccl::unified_context_type::ccl_native_t)
 
-API_DEVICE_CONTEXT_FORCE_INSTANTIATION(ccl::context_attr_id::version, ccl::library_version);
-API_DEVICE_CONTEXT_FORCE_INSTANTIATION_GET(ccl::context_attr_id::cl_backend);
-API_DEVICE_CONTEXT_FORCE_INSTANTIATION_GET(ccl::context_attr_id::native_handle);
+API_CONTEXT_FORCE_INSTANTIATION(ccl::context_attr_id::version, ccl::library_version);
+API_CONTEXT_FORCE_INSTANTIATION_GET(ccl::context_attr_id::cl_backend);
+API_CONTEXT_FORCE_INSTANTIATION_GET(ccl::context_attr_id::native_handle);
diff --git a/src/ccl_cpp_device.cpp b/src/ccl_cpp_device.cpp
index 68e9c4003..826fcabcc 100644
--- a/src/ccl_cpp_device.cpp
+++ b/src/ccl_cpp_device.cpp
@@ -13,11 +13,13 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "device_impl.hpp"
 
 namespace ccl {
 
+namespace v1 {
+
 CCL_API device::device(device&& src) : base_t(std::move(src)) {}
 
 CCL_API device::device(const device& src) : base_t(src) {}
@@ -41,20 +43,33 @@ CCL_API device& device::operator=(const device& src) {
     return *this;
 }
 
+bool CCL_API device::operator==(const device& rhs) const noexcept {
+    return this->get_impl() == rhs.get_impl(); // equality is delegated to the impl handles
+}
+// Inequality is delegated to the impl handles returned by get_impl().
+bool CCL_API device::operator!=(const device& rhs) const noexcept {
+    return this->get_impl() != rhs.get_impl();
+}
+// Orders two devices by comparing the impl handles returned by get_impl().
+bool CCL_API device::operator<(const device& rhs) const noexcept {
+    return this->get_impl() < rhs.get_impl();
+}
+
 CCL_API void device::build_from_params() {
     get_impl()->build_from_params();
 }
 
-CCL_API device::native_t& device::get_native()
-{
+CCL_API device::native_t& device::get_native() {
     return const_cast<device::native_t&>(static_cast<const device*>(this)->get_native());
 }
 
-CCL_API const device::native_t& device::get_native() const
-{
+CCL_API const device::native_t& device::get_native() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<ccl::device_attr_id, ccl::device_attr_id::native_handle>{});
+        detail::ccl_api_type_attr_traits<device_attr_id, device_attr_id::native_handle>{});
 }
+
+} // namespace v1
+
 } // namespace ccl
 
 API_DEVICE_CREATION_FORCE_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t)
diff --git a/src/ccl_cpp_environment.cpp b/src/ccl_cpp_environment.cpp
index 1fc00ccd0..a31010b4a 100644
--- a/src/ccl_cpp_environment.cpp
+++ b/src/ccl_cpp_environment.cpp
@@ -16,85 +16,65 @@
 #include "environment_impl.hpp"
 #include "common/global/global.hpp"
 #include "exec/exec.hpp"
+#include "common/utils/version.hpp"
 
 #if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
 #include "common/comm/l0/comm_context.hpp"
 #include "common/comm/comm_interface.hpp"
 #endif //#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
 
-//#include "ccl.h"    //TODO datatypes
-
 #include <memory>
 
 #include "common/comm/single_device_communicator/single_device_communicator.hpp"
 
 namespace ccl {
-CCL_API ccl::environment::environment() {
+
+namespace detail {
+
+CCL_API environment::environment() {
     static auto result = global_data::get().init();
     CCL_CHECK_AND_THROW(result, "failed to initialize CCL");
 }
 
-CCL_API ccl::environment::~environment() {}
+CCL_API environment::~environment() {}
 
-CCL_API ccl::environment& ccl::environment::instance() {
-    static ccl::environment env;
+CCL_API environment& environment::instance() {
+    static environment env;
     return env;
 }
 
-// void CCL_API ccl::environment::set_resize_fn(ccl_resize_fn_t callback)
-// {
-//     ccl_status_t result = ccl_set_resize_fn(callback);
-//     CCL_CHECK_AND_THROW(result, "failed to set resize callback");
-//     return;
-// }
-
-ccl::library_version CCL_API ccl::environment::get_library_version() const {
-    ccl::library_version ret;
+ccl::library_version CCL_API environment::get_library_version() {
+    return utils::get_library_version();
+}
 
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
+/******************** KVS ********************/
 
-    return ret;
-}
-/*
-static ccl::stream& get_empty_stream()
-{
-    static ccl::stream_t empty_stream  = ccl::environment::instance().create_stream();
-    return empty_stream;
+shared_ptr_class<kvs> environment::create_main_kvs(const kvs_attr& attr) const {
+    return std::shared_ptr<kvs>(new kvs(attr));
 }
-*/
 
-/**
- * Factory methods
- */
-// KVS
-shared_ptr_class<kvs> CCL_API environment::create_main_kvs() const {
-    return std::shared_ptr<kvs>(new kvs);
+shared_ptr_class<kvs> environment::create_kvs(const kvs::address_type& addr,
+                                              const kvs_attr& attr) const {
+    return std::shared_ptr<kvs>(new kvs(addr, attr));
 }
 
-shared_ptr_class<kvs> CCL_API environment::create_kvs(const kvs::address_type& addr) const {
-    return std::shared_ptr<kvs>(new kvs(addr));
-}
+/******************** DEVICE ********************/
 
-// device
-device CCL_API environment::create_device(empty_t empty) const
-{
+device environment::create_device(empty_t empty) const {
     static typename ccl::unified_device_type::ccl_native_t default_native_device;
     return device::create_device(default_native_device);
 }
 
-// context
-context CCL_API environment::create_context(empty_t empty) const
-{
-    static typename ccl::unified_device_context_type::ccl_native_t default_native_context;
+/******************** CONTEXT ********************/
+
+context environment::create_context(empty_t empty) const {
+    static typename ccl::unified_context_type::ccl_native_t default_native_context;
     return context::create_context(default_native_context);
 }
 
-ccl::datatype CCL_API environment::register_datatype(const ccl::datatype_attr& attr) {
+/******************** DATATYPE ********************/
+
+ccl::datatype environment::register_datatype(const datatype_attr& attr) {
     while (unlikely(ccl::global_data::get().executor->is_locked)) {
         std::this_thread::yield();
     }
@@ -104,7 +84,7 @@ ccl::datatype CCL_API environment::register_datatype(const ccl::datatype_attr& a
     return ccl::global_data::get().dtypes->create(attr);
 }
 
-void CCL_API environment::deregister_datatype(ccl::datatype dtype) {
+void environment::deregister_datatype(ccl::datatype dtype) {
     while (unlikely(ccl::global_data::get().executor->is_locked)) {
         std::this_thread::yield();
     }
@@ -114,7 +94,7 @@ void CCL_API environment::deregister_datatype(ccl::datatype dtype) {
     ccl::global_data::get().dtypes->free(dtype);
 }
 
-size_t CCL_API environment::get_datatype_size(ccl::datatype dtype) const {
+size_t environment::get_datatype_size(ccl::datatype dtype) const {
     while (unlikely(ccl::global_data::get().executor->is_locked)) {
         std::this_thread::yield();
     }
@@ -122,109 +102,78 @@ size_t CCL_API environment::get_datatype_size(ccl::datatype dtype) const {
     return ccl::global_data::get().dtypes->get(dtype).size();
 }
 
-} // namespace ccl
+/******************** STREAM ********************/
+// Creates a stream from a native device, passing the current library version to the provider.
+stream CCL_API environment::create_stream(typename unified_device_type::ccl_native_t device) {
+    auto version = utils::get_library_version();
+    return stream{ stream_provider_dispatcher::create(device, version) };
+}
+// Creates a stream from a native device and native context plus the current library version.
+stream CCL_API environment::create_stream(typename unified_device_type::ccl_native_t device,
+                                          typename unified_context_type::ccl_native_t context) {
+    auto version = utils::get_library_version();
+    return stream{ stream_provider_dispatcher::create(device, context, version) };
+}
+
+/******************** COMMUNICATOR ********************/
 
 #ifdef CCL_ENABLE_SYCL
-ccl::communicator CCL_API ccl::environment::create_single_device_communicator(
-    const size_t comm_size,
-    const size_t rank,
+communicator environment::create_single_device_communicator(
+    const int comm_size,
+    const int rank,
     const cl::sycl::device& device,
     const cl::sycl::context& context,
-    ccl::shared_ptr_class<ccl::kvs_interface> kvs) const {
+    ccl::shared_ptr_class<kvs_interface> kvs) const {
     LOG_TRACE("Create single device communicator from SYCL device");
 
     std::shared_ptr<ikvs_wrapper> kvs_wrapper(new users_kvs(kvs));
     std::shared_ptr<atl_wrapper> atl =
         std::shared_ptr<atl_wrapper>(new atl_wrapper(comm_size, { rank }, kvs_wrapper));
 
-    ccl::comm_split_attr attr = create_comm_split_attr(
-        ccl::attr_val<ccl::comm_split_attr_id::group>(ccl::group_split_type::undetermined));
-    ccl::communicator_interface_ptr impl =
-        ccl::communicator_interface::create_communicator_impl(device, context, rank, comm_size, attr, atl);
+    comm_split_attr attr = create_comm_split_attr(attr_val<comm_split_attr_id::group>(
+        split_group::cluster /*group_split_type::undetermined*/));
+    ccl::communicator_interface_ptr impl = ccl::communicator_interface::create_communicator_impl(
+        device, context, rank, comm_size, attr, atl);
 
     //TODO use gpu_comm_attr to automatically visit()
     auto single_dev_comm = std::dynamic_pointer_cast<single_device_communicator>(impl);
     //single_dev_comm->set_context(context);
-    return ccl::communicator(std::move(impl));
+    return communicator(std::move(impl));
 }
-
 #endif
 
-//Communicator
-ccl::communicator CCL_API ccl::environment::create_communicator() const {
-    return ccl::communicator::create_communicator();
+communicator environment::create_communicator(const comm_attr& attr) const {
+    return communicator::create_communicator(attr);
 }
 
-ccl::communicator CCL_API ccl::environment::create_communicator(const size_t size,
-                                                      ccl::shared_ptr_class<ccl::kvs_interface> kvs) const {
-    return ccl::communicator::create_communicator(size, kvs);
+communicator environment::create_communicator(const size_t size,
+                                              ccl::shared_ptr_class<kvs_interface> kvs,
+                                              const comm_attr& attr) const {
+    return communicator::create_communicator(size, kvs, attr);
 }
 
-ccl::communicator CCL_API ccl::environment::create_communicator(const size_t size,
-                                                      const size_t rank,
-                                                      ccl::shared_ptr_class<ccl::kvs_interface> kvs) const {
-    return ccl::communicator::create_communicator(size, rank, kvs);
+communicator environment::create_communicator(const size_t size,
+                                              const int rank,
+                                              ccl::shared_ptr_class<kvs_interface> kvs,
+                                              const comm_attr& attr) const {
+    return communicator::create_communicator(size, rank, kvs, attr);
 }
 
-/***************************TypeGenerations*********************************************************/
-namespace ccl {
-template <>
-stream CCL_API environment::create_postponed_api_type<
-    stream,
-    typename unified_device_type::ccl_native_t,
-    typename unified_device_context_type::ccl_native_t>(
-    typename unified_device_type::ccl_native_t device,
-    typename unified_device_context_type::ccl_native_t context) const {
-    library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    return stream{ stream_provider_dispatcher::create(device, context, ret) };
-}
-template <>
-stream CCL_API
-environment::create_postponed_api_type<stream,
-                                            typename unified_device_type::ccl_native_t>(
-    typename unified_device_type::ccl_native_t device) const {
-    library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    return stream{ stream_provider_dispatcher::create(device, ret) };
-}
-}
-CREATE_OP_ATTR_INSTANTIATION(ccl::allgatherv_attr)
-CREATE_OP_ATTR_INSTANTIATION(ccl::allreduce_attr)
-CREATE_OP_ATTR_INSTANTIATION(ccl::alltoall_attr)
-CREATE_OP_ATTR_INSTANTIATION(ccl::alltoallv_attr)
-CREATE_OP_ATTR_INSTANTIATION(ccl::broadcast_attr)
-CREATE_OP_ATTR_INSTANTIATION(ccl::reduce_attr)
-CREATE_OP_ATTR_INSTANTIATION(ccl::reduce_scatter_attr)
-CREATE_OP_ATTR_INSTANTIATION(ccl::sparse_allreduce_attr)
-
-CREATE_OP_ATTR_INSTANTIATION(ccl::comm_split_attr)
-
-CREATE_OP_ATTR_INSTANTIATION(ccl::datatype_attr)
+} // namespace detail
+
+} // namespace ccl
+
+/******************** TypeGenerations ********************/
 
 CREATE_DEV_COMM_INSTANTIATION(ccl::device, ccl::context)
-CREATE_DEV_COMM_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t, typename ccl::unified_device_context_type::ccl_native_t)
-CREATE_DEV_COMM_INSTANTIATION(ccl::device_index_type, typename ccl::unified_device_context_type::ccl_native_t)
+CREATE_DEV_COMM_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t,
+                              typename ccl::unified_context_type::ccl_native_t)
+CREATE_DEV_COMM_INSTANTIATION(ccl::device_index_type,
+                              typename ccl::unified_context_type::ccl_native_t)
 
 CREATE_STREAM_INSTANTIATION(typename ccl::unified_stream_type::ccl_native_t)
-CREATE_STREAM_EXT_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t, typename ccl::unified_device_context_type::ccl_native_t)
+CREATE_STREAM_EXT_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t,
+                                typename ccl::unified_context_type::ccl_native_t)
 
-CREATE_CONTEXT_INSTANTIATION(typename ccl::unified_device_context_type::ccl_native_t)
+CREATE_CONTEXT_INSTANTIATION(typename ccl::unified_context_type::ccl_native_t)
 CREATE_DEVICE_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t)
-
-/*
-CREATE_EVENT_INSTANTIATION(cl::sycl::event)
-CREATE_EVENT_EXT_INSTANTIATION(cl_event)
-*/
diff --git a/src/ccl_cpp_gpu_api.cpp b/src/ccl_cpp_gpu_api.cpp
deleted file mode 100644
index ffc7f2829..000000000
--- a/src/ccl_cpp_gpu_api.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-//TODO
-#if 0
-#include <stdexcept>
-
-#include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "common/global/global.hpp"
-#include "exec/exec.hpp"
-
-#include "common/comm/comm_interface.hpp"
-#include "common/comm/host_communicator/host_communicator.hpp"
-#include "common/comm/l0/gpu_comm_attr.hpp"
-#include "common/comm/l0/device_community.hpp"
-
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "oneapi/ccl/native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
-
-#ifdef CCL_ENABLE_SYCL
-#include <CL/sycl.hpp>
-#endif
-
-std::ostream& operator<<(std::ostream& out, const ccl::device_index_type& index)
-{
-    out << ccl::to_string(index);
-    return out;
-}
-
-namespace ccl
-{
-
-
-/* GPU communicator attributes
- */
-CCL_API ccl::ccl_device_attr::ccl_device_attr(const ccl::ccl_comm_split_attr& src) :
- base_t(src),
- pimpl(new ccl::device_attr_impl())
-{
-}
-
-CCL_API ccl::ccl_device_attr::~ccl_device_attr() noexcept
-{
-}
-
-template<device_comm_split_attr_id attrId,
-             class Value,
-             typename T>
-CCL_API Value ccl::ccl_device_attr::set_value(Value&& v)
-{
-    return pimpl->set_attribute_value(std::forward<Value>(v));
-}
-
-template<device_comm_split_attr_id attrId>
-CCL_API const typename ccl::ccl_device_attributes_traits<attrId>::type& ccl::ccl_device_attr::get_value() const
-{
-    return pimpl->get_attribute_value(
-            std::integral_constant<device_comm_split_attr_id, attrId> {});
-}
-
-/* Global Environment*/
-template<class stream_native_type, typename T>
-CCL_API ccl::stream_t ccl::environment::create_stream(stream_native_type& s)
-{
-    return ccl::stream_t(new ccl::stream(stream_provider_dispatcher::create(s)));
-}
-
-CCL_API ccl::comm_group_t ccl::environment::create_comm_group(size_t current_device_group_size, size_t process_device_group_size,
-                                                              ccl::shared_communicator_t parent_comm /* = ccl::shared_communicator_t()*/)
-{
-    if (!parent_comm)
-    {
-        //use global communicator by default
-        ccl::shared_communicator_t(ccl::environment::instance().create_communicator()).swap(parent_comm);
-    }
-
-    ccl::comm_group_t group;
-    {
-        // register group slot in global context table, based on communicator id
-        auto host_comm_impl = std::dynamic_pointer_cast<host_communicator>(parent_comm->pimpl);
-        if (!host_comm_impl)
-        {
-            throw ccl::exception(std::string(__FUNCTION__) + " - failed, invalid host communicator type");
-        }
-
-        group_context::group_unique_key unique_id =
-                    host_comm_impl->get_host_attr()->get_value<ccl_host_color>();
-
-        std::unique_lock<ccl_spinlock> lock(global_ctx.mutex);
-        auto ctx_it = global_ctx.communicator_group_map.find(unique_id);
-        if(ctx_it == global_ctx.communicator_group_map.end())
-        {
-            group.reset(new ccl::comm_group(parent_comm,
-                                            current_device_group_size,
-                                            process_device_group_size));
-            global_ctx.communicator_group_map.insert({
-                                                        unique_id,
-                                                        group
-                                                     });
-        }
-        else
-        {
-            group = ctx_it->second;
-        }
-    }
-
-    // sync existing group: blocking operation - wait for all groups
-    group->pimpl->sync_group_size(current_device_group_size);
-    return group;
-}
-
-CCL_API ccl::comm_group::comm_group(ccl::shared_communicator_t parent_comm,
-                                    size_t current_device_group_size, size_t process_device_group_size):
-    pimpl(new ccl::gpu_comm_attr(parent_comm, current_device_group_size, process_device_group_size))
-{
-};
-
-/**
- *  Create communicator API:
- */
-CCL_API ccl::comm_split_attr ccl::comm_group::create_comm_split_attr()
-{
-    // TODO
-    const auto& host_comm = pimpl->get_host_communicator();
-    return ccl::comm_split_attr{new ccl::ccl_device_attr(*(host_comm->get_comm_split_attr()))};
-}
-/*
- *  Single device communicator creation
- */
-template <class DeviceType,
-          typename std::enable_if<std::is_class<typename std::remove_cv<DeviceType>::type>::value,
-                                      int>::type>
-CCL_API ccl::communicator_t ccl::comm_group::create_communicator(const DeviceType& device,
-                                                                     ccl::comm_split_attr attr/* = comm_device_attr_t()*/)
-{
-    LOG_TRACE("Create communicator from device");
-    ccl::communicator_interface_ptr impl =
-            ccl::communicator_interface::create_communicator_impl(device,
-                                                                  pimpl->thread_id,
-                                                                  pimpl->ccl_communicator->rank(),
-                                                                  attr,
-                                                                  pimpl->ccl_communicator->comm_impl.atl);
-    // registering device in group - is non blocking operation, until it is not the last device
-    pimpl->sync_register_communicator(impl);
-    return ccl::communicator_t(new ccl::communicator(impl));
-}
-
-template <class DeviceType,
-          typename std::enable_if<not std::is_class<typename std::remove_cv<DeviceType>::type>::value,
-                                      int>::type>
-CCL_API ccl::communicator_t ccl::comm_group::create_communicator(DeviceType device_id,
-                                                                    ccl::comm_split_attr attr/* = nullptr*/)
-{
-    LOG_TRACE("Create communicator from id: ", device_id);
-
-    ccl::communicator_interface_ptr impl = ccl::communicator_interface::create_communicator_impl(device_id,
-                                                                                                 pimpl->thread_id,
-                                                                                                 pimpl->ccl_communicator->rank(),
-                                                                                                 attr);
-    // registering device in group - is non blocking operation, until it is not the last device
-    pimpl->sync_register_communicator(impl);
-    return ccl::communicator_t(new ccl::communicator(impl));
-}
-
-/**
- *  Multiple device communicators creation vectorized API implementation
- */
-template<class InputIt>
-CCL_API std::vector<ccl::communicator_t> ccl::comm_group::create_communicators(InputIt first, InputIt last,
-                                                                                   ccl::comm_split_attr attr/* = nullptr*/)
-{
-
-    using iterator_value_type = typename std::iterator_traits<InputIt>::value_type;
-/*
-    using expected_value_type = typename unified_device_type::device_t;
-    static_assert(std::is_same<iterator_value_type, expected_value_type>::value,
-                  "Not valid InputIt in create_communicators");
-*/
-    size_t indices_count = std::distance(first, last);
-    LOG_TRACE("Create device communicators from index iterators type, count: ", indices_count);
-
-    std::vector<ccl::communicator_t> comms;
-    comms.reserve(indices_count);
-    std::transform(first, last, std::back_inserter(comms), [this, attr](const iterator_value_type& device_id)
-    {
-        return create_communicator(device_id, attr);
-    });
-    return comms;
-}
-
-template<template<class...> class Container, class Type>
-CCL_API std::vector<ccl::communicator_t> ccl::comm_group::create_communicators(const Container<Type>& device_ids,
-                                                                                   ccl::comm_split_attr attr/* = nullptr*/)
-{
-    //static_assert(std::is_same<Type, ccl::device_index_type>::value, "Invalid Type in create_communicators");
-    LOG_TRACE("Create device communicators from index type, count: ", device_ids.size(),
-              ". Redirect to iterators version");
-    return create_communicators(device_ids.begin(), device_ids.end(), attr);
-}
-
-CCL_API ccl::comm_group::device_context_native_const_reference_t ccl::comm_group::get_context() const
-{
-    //TODO use PIMPL as context provider
-    static unified_device_context_type context;
-    return context.get();
-}
-
-
-/***********************************************************************/
-#define DEVICE_ATTRIBUTE_INSTANTIATION(ATTR_ID, VALUE_TYPE) \
-    template VALUE_TYPE CCL_API ccl::ccl_device_attr::set_value<ATTR_ID, VALUE_TYPE>(VALUE_TYPE && \
-                                                                                     v); \
-    template CCL_API const VALUE_TYPE& ccl::ccl_device_attr::get_value<ATTR_ID>() const;
-
-#define STREAM_CREATOR_INSTANTIATION(type) \
-    template ccl::stream_t CCL_API ccl::environment::create_stream(type& stream);
-
-#define COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(type) \
-    template std::vector<ccl::communicator_t> CCL_API ccl::comm_group::create_communicators( \
-        const type& device_ids, ccl::comm_split_attr attr);
-
-// device attribute instantiations
-DEVICE_ATTRIBUTE_INSTANTIATION(ccl_device_preferred_topology_class,
-                               typename ccl::ccl_device_attributes_traits<ccl_device_preferred_topology_class>::type);
-DEVICE_ATTRIBUTE_INSTANTIATION(ccl_device_preferred_group,
-                               typename ccl::ccl_device_attributes_traits<ccl_device_preferred_group>::type);
-
-
-// stream instantiations
-STREAM_CREATOR_INSTANTIATION(ze_command_queue_handle_t)
-#ifdef CCL_ENABLE_SYCL
-    STREAM_CREATOR_INSTANTIATION(cl::sycl::queue)
-#endif
-
-// container-based method force-instantiation will trigger ALL other methods instantiations
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(std::vector<ccl::device_index_type>);
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(std::list<ccl::device_index_type>);
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::device_indices_t);
-#ifdef CCL_ENABLE_SYCL
-    COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(cl::sycl::vector_class<cl::sycl::device>);
-#endif
-
-#endif //TODO
diff --git a/src/ccl_cpp_kvs.cpp b/src/ccl_cpp_kvs.cpp
index d84d7a3ec..f18b130b8 100644
--- a/src/ccl_cpp_kvs.cpp
+++ b/src/ccl_cpp_kvs.cpp
@@ -13,4 +13,14 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+
+#include "oneapi/ccl/kvs_attr_ids.hpp"
+#include "oneapi/ccl/kvs_attr_ids_traits.hpp"
+#include "oneapi/ccl/kvs_attr.hpp"
+
 #include "kvs_impl.hpp"
diff --git a/src/ccl_cpp_stream.cpp b/src/ccl_cpp_stream.cpp
index 1d36237a5..2fc96e2be 100644
--- a/src/ccl_cpp_stream.cpp
+++ b/src/ccl_cpp_stream.cpp
@@ -13,16 +13,16 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "stream_impl.hpp"
 #include "oneapi/ccl/native_device_api/export_api.hpp"
 
-#ifndef COMMA
-#define COMMA ,
-#endif
 namespace ccl {
+
+namespace v1 {
+// Constructor taking a version value: `version` is unused; base_t gets a value-initialized impl_value_t.
 CCL_API stream::stream(
-    const typename details::ccl_api_type_attr_traits<stream_attr_id, stream_attr_id::version>::type&
+    const typename detail::ccl_api_type_attr_traits<stream_attr_id, stream_attr_id::version>::type&
         version)
         : base_t(impl_value_t()) {}
 
@@ -45,30 +45,25 @@ CCL_API void stream::build_from_params() {
     get_impl()->build_from_params();
 }
 
-CCL_API stream::native_t& stream::get_native()
-{
+CCL_API stream::native_t& stream::get_native() { // mutable access: calls the const overload, then drops const
     return const_cast<stream::native_t&>(static_cast<const stream*>(this)->get_native());
 }
 
-CCL_API const stream::native_t& stream::get_native() const
-{
+CCL_API const stream::native_t& stream::get_native() const { // returns the impl's native_handle attribute
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::native_handle>{});
+        detail::ccl_api_type_attr_traits<stream_attr_id, stream_attr_id::native_handle>{});
 }
+
+} // namespace v1
+
 } // namespace ccl
 
+API_STREAM_CREATION_FORCE_INSTANTIATION(typename ccl::unified_stream_type::ccl_native_t)
+API_STREAM_CREATION_EXT_FORCE_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t,
+                                            typename ccl::unified_context_type::ccl_native_t)
 #ifdef CCL_ENABLE_SYCL
-API_STREAM_CREATION_FORCE_INSTANTIATION(cl::sycl::queue)
 API_STREAM_CREATION_FORCE_INSTANTIATION(cl_command_queue)
-API_STREAM_CREATION_EXT_FORCE_INSTANTIATION(cl::sycl::device, cl::sycl::context)
 #else
-#ifdef MULTI_GPU_SUPPORT
-API_STREAM_CREATION_FORCE_INSTANTIATION(
-    native::cl_base<ze_command_queue_handle_t COMMA native::ccl_device>)
-API_STREAM_CREATION_FORCE_INSTANTIATION(
-    ccl::shared_ptr_class<native::cl_base<ze_command_queue_handle_t COMMA native::ccl_device>>)
-API_STREAM_CREATION_FORCE_INSTANTIATION(ccl::shared_ptr_class<native::ccl_device>)
-#endif
 //API_STREAM_CREATION_FORCE_INSTANTIATION(ccl::empty_t)
 #endif
 
@@ -78,11 +73,9 @@ API_STREAM_FORCE_INSTANTIATION_GET(
 API_STREAM_FORCE_INSTANTIATION_GET(
     ccl::stream_attr_id::device); //, typename ccl::unified_device_type::ccl_native_t);
 API_STREAM_FORCE_INSTANTIATION(ccl::stream_attr_id::context,
-                               typename ccl::unified_device_context_type::ccl_native_t);
+                               typename ccl::unified_context_type::ccl_native_t);
 API_STREAM_FORCE_INSTANTIATION(ccl::stream_attr_id::ordinal, uint32_t);
 API_STREAM_FORCE_INSTANTIATION(ccl::stream_attr_id::index, uint32_t);
 API_STREAM_FORCE_INSTANTIATION(ccl::stream_attr_id::flags, size_t);
 API_STREAM_FORCE_INSTANTIATION(ccl::stream_attr_id::mode, size_t);
 API_STREAM_FORCE_INSTANTIATION(ccl::stream_attr_id::priority, size_t);
-
-#undef COMMA
diff --git a/src/ccl_cpp_utils.cpp b/src/ccl_cpp_utils.cpp
index 86fc1a450..a9cd45349 100644
--- a/src/ccl_cpp_utils.cpp
+++ b/src/ccl_cpp_utils.cpp
@@ -15,33 +15,28 @@
 */
 #include <sstream>
 
-#include "oneapi/ccl/ccl_config.h"
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/config.h"
+#include "oneapi/ccl/lp_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/utils/enums.hpp"
 
 std::ostream& operator<<(std::ostream& out, const ccl::device_index_type& index);
 
 namespace ccl {
 
-using datatype_str_enum =
-    utils::enum_to_str<utils::enum_to_underlying(datatype::last_predefined) + 1>;
-CCL_API string_class to_string(const ccl::datatype& dt) {
-    return datatype_str_enum({ "INT8",
-                               "UINT8",
-                               "INT16",
-                               "UINT16",
-                               "INT32",
-                               "UINT32",
-                               "INT64",
-                               "UINT64",
-                               "FLOAT16",
-                               "FLOAT32",
-                               "FLOAT64",
-                               "BFLOAT16" })
-        .choose(dt, "CUSTOM_TYPE");
+std::string to_string(const bfloat16& v) { // returns "bf16::data " followed by the streamed v.data
+    std::stringstream ss;
+    ss << "bf16::data " << v.data;
+    return ss.str();
 }
 
-CCL_API
+// std::string to_string(const float16& v) {
+//     std::stringstream ss;
+//     ss << "fp16::data " << v.data;
+//     return ss.str();
+// }
+
+/* CCL_API */
 std::string to_string(const device_index_type& device_id) {
     std::stringstream ss;
     ss << "[" << std::get<ccl::device_index_enum::driver_index_id>(device_id) << ":"
diff --git a/src/ccl_empty_attr.cpp b/src/ccl_empty_attr.cpp
index 1d8f6ad86..a535a7439 100644
--- a/src/ccl_empty_attr.cpp
+++ b/src/ccl_empty_attr.cpp
@@ -13,16 +13,20 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "common/utils/version.hpp"
 
 namespace ccl {
-ccl::library_version ccl_empty_attr::version{
-    CCL_MAJOR_VERSION,  CCL_MINOR_VERSION,      CCL_UPDATE_VERSION,
-    CCL_PRODUCT_STATUS, CCL_PRODUCT_BUILD_DATE, CCL_PRODUCT_FULL,
-};
+
+namespace v1 {
+// Version value used by create_empty(). NOTE(review): now set by a call - confirm static-init order.
+library_version ccl_empty_attr::version = utils::get_library_version();
 
 template <class attr>
 attr ccl_empty_attr::create_empty() {
     return attr{ ccl_empty_attr::version };
 }
+
+} // namespace v1
+
 } // namespace ccl
diff --git a/src/ccl_empty_coll_attr.cpp b/src/ccl_empty_coll_attr.cpp
index 07ffdde88..c0ad30ce2 100644
--- a/src/ccl_empty_coll_attr.cpp
+++ b/src/ccl_empty_coll_attr.cpp
@@ -13,17 +13,19 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_aliases.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 
 namespace ccl {
 
+namespace v1 {
+
 template <class attr>
 CCL_API attr ccl_empty_attr::create_empty() {
     return attr{ ccl_empty_attr::version };
@@ -41,4 +43,6 @@ CCL_API reduce_scatter_attr default_reduce_scatter_attr =
 CCL_API sparse_allreduce_attr default_sparse_allreduce_attr =
     ccl_empty_attr::create_empty<sparse_allreduce_attr>();
 
+} // namespace v1
+
 } // namespace ccl
diff --git a/src/ccl_empty_comm_attr.cpp b/src/ccl_empty_comm_attr.cpp
new file mode 100644
index 000000000..93418e9f4
--- /dev/null
+++ b/src/ccl_empty_comm_attr.cpp
@@ -0,0 +1,38 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+
+#include "oneapi/ccl/comm_attr_ids.hpp"
+#include "oneapi/ccl/comm_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_attr.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// Builds an `attr` by brace-initializing it from the shared ccl_empty_attr::version value.
+template <class attr>
+CCL_API attr ccl_empty_attr::create_empty() {
+    return attr{ ccl_empty_attr::version };
+}
+// Namespace-scope default comm_attr, initialized via create_empty<comm_attr>().
+CCL_API comm_attr default_comm_attr = ccl_empty_attr::create_empty<comm_attr>();
+// NOTE(review): version is set via utils::get_library_version() in another TU - confirm init order.
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/ccl_empty_comm_split_attr.cpp b/src/ccl_empty_comm_split_attr.cpp
new file mode 100644
index 000000000..a113c7ea8
--- /dev/null
+++ b/src/ccl_empty_comm_split_attr.cpp
@@ -0,0 +1,38 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// Builds an `attr` by brace-initializing it from the shared ccl_empty_attr::version value.
+template <class attr>
+CCL_API attr ccl_empty_attr::create_empty() {
+    return attr{ ccl_empty_attr::version };
+}
+// Namespace-scope default comm_split_attr, initialized via create_empty<comm_split_attr>().
+CCL_API comm_split_attr default_comm_split_attr = ccl_empty_attr::create_empty<comm_split_attr>();
+// NOTE(review): version is set via utils::get_library_version() in another TU - confirm init order.
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/ccl_empty_init_attr.cpp b/src/ccl_empty_init_attr.cpp
new file mode 100644
index 000000000..af5363dc5
--- /dev/null
+++ b/src/ccl_empty_init_attr.cpp
@@ -0,0 +1,38 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+
+#include "oneapi/ccl/init_attr_ids.hpp"
+#include "oneapi/ccl/init_attr_ids_traits.hpp"
+#include "oneapi/ccl/init_attr.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// Builds an `attr` by brace-initializing it from the shared ccl_empty_attr::version value.
+template <class attr>
+CCL_API attr ccl_empty_attr::create_empty() {
+    return attr{ ccl_empty_attr::version };
+}
+// Namespace-scope default init_attr, initialized via create_empty<init_attr>().
+CCL_API init_attr default_init_attr = ccl_empty_attr::create_empty<init_attr>();
+// NOTE(review): version is set via utils::get_library_version() in another TU - confirm init order.
+} // namespace v1
+
+} // namespace ccl
diff --git a/include/oneapi/ccl/ccl_comm_split_attr_ids.hpp b/src/ccl_empty_kvs_attr.cpp
similarity index 56%
rename from include/oneapi/ccl/ccl_comm_split_attr_ids.hpp
rename to src/ccl_empty_kvs_attr.cpp
index 3dd55e857..cb1fed3ea 100644
--- a/include/oneapi/ccl/ccl_comm_split_attr_ids.hpp
+++ b/src/ccl_empty_kvs_attr.cpp
@@ -13,34 +13,26 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#ifndef CCL_PRODUCT_FULL
-#error "Do not include this file directly. Please include 'ccl.hpp'"
-#endif
-
-namespace ccl {
-
-enum class comm_split_attr_id : int {
-    version,
-
-    color,
-    group,
-
-    last_value
-};
-
-enum class
-    group_split_type : int { // TODO fill in this enum with the actual values
-        undetermined = -1,
-        //device,
-        thread,
-        process,
-        //socket,
-        //node,
-        cluster,
-
-        last_value
-    };
-
-} // namespace ccl
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+
+#include "oneapi/ccl/kvs_attr_ids.hpp"
+#include "oneapi/ccl/kvs_attr_ids_traits.hpp"
+#include "oneapi/ccl/kvs_attr.hpp"
+
+namespace ccl {
+
+namespace v1 {
+// Builds an `attr` by brace-initializing it from the shared ccl_empty_attr::version value.
+template <class attr>
+CCL_API attr ccl_empty_attr::create_empty() {
+    return attr{ ccl_empty_attr::version };
+}
+// Namespace-scope default kvs_attr, initialized via create_empty<kvs_attr>().
+CCL_API kvs_attr default_kvs_attr = ccl_empty_attr::create_empty<kvs_attr>();
+// NOTE(review): version is set via utils::get_library_version() in another TU - confirm init order.
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/ccl_empty_stream.cpp b/src/ccl_empty_stream.cpp
index 041a8e779..eb1d993b7 100644
--- a/src/ccl_empty_stream.cpp
+++ b/src/ccl_empty_stream.cpp
@@ -13,20 +13,22 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_aliases.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_stream.hpp"
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/stream.hpp"
 
 // Core file with PIMPL implementation
 //#include "stream_impl.hpp"
 
 namespace ccl {
 
+namespace v1 {
+
 template <class attr>
 CCL_API attr ccl_empty_attr::create_empty() {
     return attr{ ccl_empty_attr::version };
@@ -34,4 +36,6 @@ CCL_API attr ccl_empty_attr::create_empty() {
 
 CCL_API stream default_stream = ccl_empty_attr::create_empty<stream>();
 
+} // namespace v1
+
 } // namespace ccl
diff --git a/src/ccl_gpu_module.hpp b/src/ccl_gpu_module.hpp
index 5e7ce6a12..6a49ecfe1 100644
--- a/src/ccl_gpu_module.hpp
+++ b/src/ccl_gpu_module.hpp
@@ -15,11 +15,12 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "coll/algorithms/algorithms_enum.hpp"
+#include "internal_types.hpp"
 
 #ifdef MULTI_GPU_SUPPORT
-ccl_status_t CCL_API register_gpu_module_source(const char* source,
-                                                ccl::device_topology_type topology_class,
-                                                ccl_coll_type type);
+ccl::status register_gpu_module_source(const char* source, // kernel source path (named `path` in the .cpp)
+                                       ccl::device_topology_type topology_class, // passed to load_kernel_source
+                                       ccl_coll_type type); // collective whose kernel source is loaded
 #endif //MULTI_GPU_SUPPORT
diff --git a/src/ccl_gpu_modules.cpp b/src/ccl_gpu_modules.cpp
index 07c977a7b..88e2376b8 100644
--- a/src/ccl_gpu_modules.cpp
+++ b/src/ccl_gpu_modules.cpp
@@ -23,9 +23,9 @@
 #include "common/comm/l0/device_group_routing_schema.hpp"
 #include "coll/algorithms/algorithms_enum.hpp"
 
-ccl_status_t CCL_API register_gpu_module_source(const char* path,
-                                                ccl::device_topology_type topology_class,
-                                                ccl_coll_type type) {
+ccl::status register_gpu_module_source(const char* path,
+                                       ccl::device_topology_type topology_class,
+                                       ccl_coll_type type) {
     ccl::device_topology_type t_class = static_cast<ccl::device_topology_type>(topology_class);
     char pwd[PATH_MAX];
     char* ret = getcwd(pwd, sizeof(pwd));
@@ -70,17 +70,17 @@ ccl_status_t CCL_API register_gpu_module_source(const char* path,
                 native::specific_modules_source_data_storage::instance()
                     .load_kernel_source<ccl_coll_reduce_scatter>(path, t_class);
                 break;
-            default: 
-                throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                                    " - get unexpected ccl collective type: " +
-                                                    std::to_string(type));
+            default:
+                throw std::runtime_error(
+                    std::string(__PRETTY_FUNCTION__) +
+                    " - get unexpected ccl collective type: " + std::to_string(type));
                 break;
         }
     }
     catch (const std::exception& ex) {
         LOG_ERROR("Cannot preload kernel source by path: ", path, ", error: ", ex.what());
         CCL_ASSERT(false);
-        return ccl_status_runtime_error;
+        return ccl::status::runtime_error;
     }
 
     LOG_INFO("gpu kernel source by type \"",
@@ -88,7 +88,7 @@ ccl_status_t CCL_API register_gpu_module_source(const char* path,
              "\", topology class: \"",
              to_string(t_class),
              "\" loaded succesfully");
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 #endif //MULTI_GPU_SUPPORT
diff --git a/src/ccl_utils.cpp b/src/ccl_utils.cpp
index 2f723e6b8..411dc9bea 100644
--- a/src/ccl_utils.cpp
+++ b/src/ccl_utils.cpp
@@ -16,8 +16,7 @@
 #include <sstream>
 #include <stdexcept>
 
-#include "ccl.hpp"
-#include "ccl_type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
 
 std::ostream& operator<<(std::ostream& out, const ccl::device_index_type& index) {
     out << ccl::to_string(index);
@@ -26,7 +25,6 @@ std::ostream& operator<<(std::ostream& out, const ccl::device_index_type& index)
 
 namespace ccl {
 
-CCL_API
 std::string to_string(const device_index_type& device_id) {
     std::stringstream ss;
     ss << "[" << std::get<ccl::device_index_enum::driver_index_id>(device_id) << ":"
@@ -44,7 +42,6 @@ std::string to_string(const device_index_type& device_id) {
     return ss.str();
 }
 
-CCL_API
 device_index_type from_string(const std::string& device_id_str) {
     std::string::size_type from_pos = device_id_str.find('[');
     if (from_pos == std::string::npos) {
diff --git a/src/coll/algorithms/algorithms.hpp b/src/coll/algorithms/algorithms.hpp
index 5270a2103..57b5e8456 100644
--- a/src/coll/algorithms/algorithms.hpp
+++ b/src/coll/algorithms/algorithms.hpp
@@ -17,167 +17,88 @@
 
 #include "sched/master_sched.hpp"
 #include "sched/sched.hpp"
+#include "internal_types.hpp"
 
 #include <map>
 #include <type_traits>
 
 #define CCL_UNDEFINED_ALGO_ID (-1)
 
-ccl_status_t ccl_coll_build_naive_bcast(ccl_sched* sched,
-                                        ccl_buffer buf,
-                                        size_t count,
-                                        const ccl_datatype& dtype,
-                                        size_t root,
-                                        ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched,
-                                                         ccl_buffer buf,
-                                                         size_t count,
-                                                         const ccl_datatype& dtype,
-                                                         size_t root,
-                                                         ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
-                                                ccl_buffer send_buf,
-                                                ccl_buffer recv_buf,
-                                                size_t count,
-                                                const ccl_datatype& dtype,
-                                                ccl::reduction reduction,
-                                                size_t root,
-                                                ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
-                                                   ccl_buffer send_buf,
-                                                   ccl_buffer recv_buf,
-                                                   size_t count,
-                                                   const ccl_datatype& dtype,
-                                                   ccl::reduction reduction,
-                                                   ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_binomial_reduce(ccl_sched* sched,
-                                            ccl_buffer send_buf,
-                                            ccl_buffer recv_buf,
-                                            size_t count,
-                                            const ccl_datatype& dtype,
-                                            ccl::reduction reduction,
-                                            size_t root,
-                                            ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_ring_allreduce(ccl_sched* sched,
-                                           ccl_buffer send_buf,
-                                           ccl_buffer recv_buf,
-                                           size_t count,
-                                           const ccl_datatype& dtype,
-                                           ccl::reduction reduction,
-                                           ccl_comm* comm);
+ccl::status ccl_coll_build_naive_bcast(ccl_sched* sched,
+                                       ccl_buffer buf,
+                                       size_t count,
+                                       const ccl_datatype& dtype,
+                                       int root,
+                                       ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
-                                               ccl_buffer send_buf,
-                                               ccl_buffer recv_buf,
-                                               size_t count,
-                                               const ccl_datatype& dtype,
-                                               ccl::reduction reduction,
-                                               ccl_comm* comm);
+ccl::status ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched,
+                                                        ccl_buffer buf,
+                                                        size_t count,
+                                                        const ccl_datatype& dtype,
+                                                        int root,
+                                                        ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
-                                                         ccl_buffer send_buf,
-                                                         ccl_buffer recv_buf,
-                                                         size_t count,
-                                                         const ccl_datatype& dtype,
-                                                         ccl::reduction reduction,
-                                                         ccl_comm* comm);
+ccl::status ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_starlike_allreduce(ccl_sched* sched,
+ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
                                                ccl_buffer send_buf,
                                                ccl_buffer recv_buf,
                                                size_t count,
                                                const ccl_datatype& dtype,
                                                ccl::reduction reduction,
+                                               int root,
                                                ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_naive_allgatherv(ccl_sched* sched,
-                                             ccl_buffer send_buf,
-                                             size_t send_count,
-                                             ccl_buffer recv_buf,
-                                             const size_t* recv_counts,
-                                             const ccl_datatype& dtype,
-                                             ccl_comm* comm);
-
-template <typename i_type, typename v_type>
-ccl_status_t ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
-                                                  ccl_buffer send_ind_buf,
-                                                  size_t send_ind_count,
-                                                  ccl_buffer send_val_buf,
-                                                  size_t send_val_count,
-                                                  void** recv_ind_buf,
-                                                  size_t* recv_ind_count,
-                                                  void** recv_val_buf,
-                                                  size_t* recv_val_count,
-                                                  const ccl_datatype& index_dtype,
-                                                  const ccl_datatype& value_dtype,
-                                                  ccl::reduction reduction,
-                                                  ccl_comm* comm);
-
-template <typename i_type, typename v_type>
-ccl_status_t ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
-                                                  ccl_buffer send_ind_buf,
-                                                  size_t send_ind_count,
-                                                  ccl_buffer send_val_buf,
-                                                  size_t send_val_count,
-                                                  void** recv_ind_buf,
-                                                  size_t* recv_ind_count,
-                                                  void** recv_val_buf,
-                                                  size_t* recv_val_count,
-                                                  const ccl_datatype& index_dtype,
-                                                  const ccl_datatype& value_dtype,
+ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
+                                                  ccl_buffer send_buf,
+                                                  ccl_buffer recv_buf,
+                                                  size_t count,
+                                                  const ccl_datatype& dtype,
                                                   ccl::reduction reduction,
                                                   ccl_comm* comm);
 
-template <typename i_type, typename v_type>
-ccl_status_t ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
-                                                          ccl_buffer send_ind_buf,
-                                                          size_t send_ind_count,
-                                                          ccl_buffer send_val_buf,
-                                                          size_t send_val_count,
-                                                          void** recv_ind_buf,
-                                                          size_t* recv_ind_count,
-                                                          void** recv_val_buf,
-                                                          size_t* recv_val_count,
-                                                          const ccl_datatype& index_dtype,
-                                                          const ccl_datatype& value_dtype,
-                                                          ccl::reduction reduction,
-                                                          ccl_comm* comm);
-
-class ccl_double_tree;
-ccl_status_t ccl_coll_build_double_tree_op(ccl_sched* sched,
-                                           ccl_coll_type coll_type,
+ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched,
                                            ccl_buffer send_buf,
                                            ccl_buffer recv_buf,
                                            size_t count,
                                            const ccl_datatype& dtype,
                                            ccl::reduction reduction,
-                                           const ccl_double_tree& dtree,
+                                           int root,
                                            ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
-                                                ccl_buffer send_buf,
-                                                ccl_buffer recv_buf,
-                                                size_t send_count,
-                                                const ccl_datatype& dtype,
-                                                ccl::reduction reduction,
-                                                ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
-                                                      ccl_buffer send_buf,
-                                                      ccl_buffer recv_buf,
-                                                      size_t recv_count,
-                                                      const ccl_datatype& dtype,
-                                                      ccl::reduction reduction,
-                                                      ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_ring_allgatherv(ccl_sched* sched,
+ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched,
+                                          ccl_buffer send_buf,
+                                          ccl_buffer recv_buf,
+                                          size_t count,
+                                          const ccl_datatype& dtype,
+                                          ccl::reduction reduction,
+                                          ccl_comm* comm);
+
+ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
+                                              ccl_buffer send_buf,
+                                              ccl_buffer recv_buf,
+                                              size_t count,
+                                              const ccl_datatype& dtype,
+                                              ccl::reduction reduction,
+                                              ccl_comm* comm);
+
+ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
+                                                        ccl_buffer send_buf,
+                                                        ccl_buffer recv_buf,
+                                                        size_t count,
+                                                        const ccl_datatype& dtype,
+                                                        ccl::reduction reduction,
+                                                        ccl_comm* comm);
+
+ccl::status ccl_coll_build_starlike_allreduce(ccl_sched* sched,
+                                              ccl_buffer send_buf,
+                                              ccl_buffer recv_buf,
+                                              size_t count,
+                                              const ccl_datatype& dtype,
+                                              ccl::reduction reduction,
+                                              ccl_comm* comm);
+
+ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched,
                                             ccl_buffer send_buf,
                                             size_t send_count,
                                             ccl_buffer recv_buf,
@@ -185,70 +106,150 @@ ccl_status_t ccl_coll_build_ring_allgatherv(ccl_sched* sched,
                                             const ccl_datatype& dtype,
                                             ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
-                                            std::vector<ccl_sched*>& scheds,
-                                            const ccl_coll_param& coll_param);
-
-ccl_status_t ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
-                                              std::vector<ccl_sched*>& scheds,
-                                              const ccl_coll_param& coll_param);
-
-ccl_status_t ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sched,
-                                                      std::vector<ccl_sched*>& scheds,
-                                                      const ccl_coll_param& coll_param);
+template <typename i_type, typename v_type>
+ccl::status ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
+                                                 ccl_buffer send_ind_buf,
+                                                 size_t send_ind_count,
+                                                 ccl_buffer send_val_buf,
+                                                 size_t send_val_count,
+                                                 void** recv_ind_buf,
+                                                 size_t* recv_ind_count,
+                                                 void** recv_val_buf,
+                                                 size_t* recv_val_count,
+                                                 const ccl_datatype& index_dtype,
+                                                 const ccl_datatype& value_dtype,
+                                                 ccl::reduction reduction,
+                                                 ccl_comm* comm);
 
-/* direct algorithms - i.e. direct mapping on collective API from transport level */
+template <typename i_type, typename v_type>
+ccl::status ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
+                                                 ccl_buffer send_ind_buf,
+                                                 size_t send_ind_count,
+                                                 ccl_buffer send_val_buf,
+                                                 size_t send_val_count,
+                                                 void** recv_ind_buf,
+                                                 size_t* recv_ind_count,
+                                                 void** recv_val_buf,
+                                                 size_t* recv_val_count,
+                                                 const ccl_datatype& index_dtype,
+                                                 const ccl_datatype& value_dtype,
+                                                 ccl::reduction reduction,
+                                                 ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_direct_barrier(ccl_sched* sched, ccl_comm* comm);
+template <typename i_type, typename v_type>
+ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
+                                                         ccl_buffer send_ind_buf,
+                                                         size_t send_ind_count,
+                                                         ccl_buffer send_val_buf,
+                                                         size_t send_val_count,
+                                                         void** recv_ind_buf,
+                                                         size_t* recv_ind_count,
+                                                         void** recv_val_buf,
+                                                         size_t* recv_val_count,
+                                                         const ccl_datatype& index_dtype,
+                                                         const ccl_datatype& value_dtype,
+                                                         ccl::reduction reduction,
+                                                         ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_direct_reduce(ccl_sched* sched,
+class ccl_double_tree;
+ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
+                                          ccl_coll_type coll_type,
                                           ccl_buffer send_buf,
                                           ccl_buffer recv_buf,
                                           size_t count,
                                           const ccl_datatype& dtype,
                                           ccl::reduction reduction,
-                                          size_t root,
+                                          const ccl_double_tree& dtree,
                                           ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_direct_allgatherv(ccl_sched* sched,
-                                              ccl_buffer send_buf,
-                                              size_t send_count,
-                                              ccl_buffer recv_buf,
-                                              const size_t* recv_counts,
-                                              const ccl_datatype& dtype,
-                                              ccl_comm* comm);
+ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
+                                               ccl_buffer send_buf,
+                                               ccl_buffer recv_buf,
+                                               size_t send_count,
+                                               const ccl_datatype& dtype,
+                                               ccl::reduction reduction,
+                                               ccl_comm* comm);
+
+ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
+                                                     ccl_buffer send_buf,
+                                                     ccl_buffer recv_buf,
+                                                     size_t recv_count,
+                                                     const ccl_datatype& dtype,
+                                                     ccl::reduction reduction,
+                                                     ccl_comm* comm);
+
+ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* sched,
+                                           ccl_buffer send_buf,
+                                           size_t send_count,
+                                           ccl_buffer recv_buf,
+                                           const size_t* recv_counts,
+                                           const ccl_datatype& dtype,
+                                           ccl_comm* comm);
+
+ccl::status ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
+                                           std::vector<ccl_sched*>& scheds,
+                                           const ccl_coll_param& coll_param);
+
+ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
+                                             std::vector<ccl_sched*>& scheds,
+                                             const ccl_coll_param& coll_param);
+
+ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sched,
+                                                     std::vector<ccl_sched*>& scheds,
+                                                     const ccl_coll_param& coll_param);
+
+/* direct algorithms - i.e. direct mapping on collective API from transport level */
 
-ccl_status_t ccl_coll_build_direct_allreduce(ccl_sched* sched,
+ccl::status ccl_coll_build_direct_barrier(ccl_sched* sched, ccl_comm* comm);
+
+ccl::status ccl_coll_build_direct_reduce(ccl_sched* sched,
+                                         ccl_buffer send_buf,
+                                         ccl_buffer recv_buf,
+                                         size_t count,
+                                         const ccl_datatype& dtype,
+                                         ccl::reduction reduction,
+                                         int root,
+                                         ccl_comm* comm);
+
+ccl::status ccl_coll_build_direct_allgatherv(ccl_sched* sched,
                                              ccl_buffer send_buf,
+                                             size_t send_count,
                                              ccl_buffer recv_buf,
-                                             size_t count,
+                                             const size_t* recv_counts,
                                              const ccl_datatype& dtype,
-                                             ccl::reduction reduction,
                                              ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_direct_alltoall(ccl_sched* sched,
+ccl::status ccl_coll_build_direct_allreduce(ccl_sched* sched,
                                             ccl_buffer send_buf,
                                             ccl_buffer recv_buf,
                                             size_t count,
                                             const ccl_datatype& dtype,
+                                            ccl::reduction reduction,
                                             ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_direct_alltoallv(ccl_sched* sched,
-                                             ccl_buffer send_buf,
-                                             const size_t* send_counts,
-                                             ccl_buffer recv_buf,
-                                             const size_t* recv_counts,
-                                             const ccl_datatype& dtype,
-                                             ccl_comm* comm);
+ccl::status ccl_coll_build_direct_alltoall(ccl_sched* sched,
+                                           ccl_buffer send_buf,
+                                           ccl_buffer recv_buf,
+                                           size_t count,
+                                           const ccl_datatype& dtype,
+                                           ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_direct_bcast(ccl_sched* sched,
-                                         ccl_buffer buf,
-                                         size_t count,
-                                         const ccl_datatype& dtype,
-                                         size_t root,
-                                         ccl_comm* comm);
+ccl::status ccl_coll_build_direct_alltoallv(ccl_sched* sched,
+                                            ccl_buffer send_buf,
+                                            const size_t* send_counts,
+                                            ccl_buffer recv_buf,
+                                            const size_t* recv_counts,
+                                            const ccl_datatype& dtype,
+                                            ccl_comm* comm);
+
+ccl::status ccl_coll_build_direct_bcast(ccl_sched* sched,
+                                        ccl_buffer buf,
+                                        size_t count,
+                                        const ccl_datatype& dtype,
+                                        int root,
+                                        ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_direct_reduce_scatter(ccl_sched* sched,
+ccl::status ccl_coll_build_direct_reduce_scatter(ccl_sched* sched,
                                                  ccl_buffer send_buf,
                                                  ccl_buffer recv_buf,
                                                  size_t send_count,
diff --git a/src/coll/algorithms/allgatherv.cpp b/src/coll/algorithms/allgatherv.cpp
index 17f25f3e3..329c50752 100644
--- a/src/coll/algorithms/allgatherv.cpp
+++ b/src/coll/algorithms/allgatherv.cpp
@@ -17,37 +17,37 @@
 #include "sched/entry/factory/chunked_entry_factory.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
-ccl_status_t ccl_coll_build_direct_allgatherv(ccl_sched* sched,
-                                              ccl_buffer send_buf,
-                                              size_t send_count,
-                                              ccl_buffer recv_buf,
-                                              const size_t* recv_counts,
-                                              const ccl_datatype& dtype,
-                                              ccl_comm* comm) {
-    LOG_DEBUG("build direct allgatherv");
-
-    entry_factory::make_entry<allgatherv_entry>(
-        sched, send_buf, send_count, recv_buf, recv_counts, dtype, comm);
-    return ccl_status_success;
-}
-
-ccl_status_t ccl_coll_build_naive_allgatherv(ccl_sched* sched,
+ccl::status ccl_coll_build_direct_allgatherv(ccl_sched* sched,
                                              ccl_buffer send_buf,
                                              size_t send_count,
                                              ccl_buffer recv_buf,
                                              const size_t* recv_counts,
                                              const ccl_datatype& dtype,
                                              ccl_comm* comm) {
+    LOG_DEBUG("build direct allgatherv");
+
+    entry_factory::make_entry<allgatherv_entry>(
+        sched, send_buf, send_count, recv_buf, recv_counts, dtype, comm);
+    return ccl::status::success;
+}
+
+ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched,
+                                            ccl_buffer send_buf,
+                                            size_t send_count,
+                                            ccl_buffer recv_buf,
+                                            const size_t* recv_counts,
+                                            const ccl_datatype& dtype,
+                                            ccl_comm* comm) {
     LOG_DEBUG("build naive allgatherv");
 
-    size_t comm_size = comm->size();
-    size_t this_rank = comm->rank();
+    int comm_size = comm->size();
+    int this_rank = comm->rank();
     size_t dtype_size = dtype.size();
     size_t* offsets = static_cast<size_t*>(CCL_MALLOC(comm_size * sizeof(size_t), "offsets"));
-    ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
 
     offsets[0] = 0;
-    for (size_t rank_idx = 1; rank_idx < comm_size; ++rank_idx) {
+    for (int rank_idx = 1; rank_idx < comm_size; ++rank_idx) {
         offsets[rank_idx] = offsets[rank_idx - 1] + recv_counts[rank_idx - 1] * dtype_size;
     }
 
@@ -57,7 +57,7 @@ ccl_status_t ccl_coll_build_naive_allgatherv(ccl_sched* sched,
             sched, send_buf, recv_buf + offsets[this_rank], send_count, dtype);
     }
 
-    for (size_t rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
+    for (int rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
         if (rank_idx != this_rank) {
             // send own buffer to other ranks
             entry_factory::make_chunked_send_entry(
@@ -72,27 +72,26 @@ ccl_status_t ccl_coll_build_naive_allgatherv(ccl_sched* sched,
     return status;
 }
 
-ccl_status_t ccl_coll_build_ring_allgatherv(ccl_sched* sched,
-                                            ccl_buffer send_buf,
-                                            size_t send_count,
-                                            ccl_buffer recv_buf,
-                                            const size_t* recv_counts,
-                                            const ccl_datatype& dtype,
-                                            ccl_comm* comm) {
+ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* sched,
+                                           ccl_buffer send_buf,
+                                           size_t send_count,
+                                           ccl_buffer recv_buf,
+                                           const size_t* recv_counts,
+                                           const ccl_datatype& dtype,
+                                           ccl_comm* comm) {
     LOG_DEBUG("build ring allgatherv, send_count ", send_count);
 
-    ccl_status_t status = ccl_status_success;
-    size_t comm_size, rank;
+    ccl::status status = ccl::status::success;
+    int comm_size, rank;
     size_t dtype_size = dtype.size();
-    size_t idx = 0;
-    size_t src, dst;
+    int src, dst;
 
     comm_size = comm->size();
     rank = comm->rank();
 
     size_t* offsets = static_cast<size_t*>(CCL_MALLOC(comm_size * sizeof(size_t), "offsets"));
     offsets[0] = 0;
-    for (size_t rank_idx = 1; rank_idx < comm_size; ++rank_idx) {
+    for (int rank_idx = 1; rank_idx < comm_size; ++rank_idx) {
         offsets[rank_idx] = offsets[rank_idx - 1] + recv_counts[rank_idx - 1] * dtype_size;
     }
 
@@ -112,7 +111,7 @@ ccl_status_t ccl_coll_build_ring_allgatherv(ccl_sched* sched,
     size_t send_block_count, recv_block_count;
     size_t send_block_offset, recv_block_offset;
 
-    for (idx = 0; idx < (comm_size - 1); idx++) {
+    for (int idx = 0; idx < (comm_size - 1); idx++) {
         send_block_idx = block_idx;
         recv_block_idx = (comm_size + block_idx - 1) % comm_size;
         send_block_count = recv_counts[send_block_idx];
diff --git a/src/coll/algorithms/allreduce/allreduce.cpp b/src/coll/algorithms/allreduce/allreduce.cpp
index e620e2390..ab512a617 100644
--- a/src/coll/algorithms/allreduce/allreduce.cpp
+++ b/src/coll/algorithms/allreduce/allreduce.cpp
@@ -24,30 +24,30 @@
 #include "sched/entry/factory/chunked_entry_factory.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
-ccl_status_t ccl_coll_build_direct_allreduce(ccl_sched* sched,
-                                             ccl_buffer send_buf,
-                                             ccl_buffer recv_buf,
-                                             size_t count,
-                                             const ccl_datatype& dtype,
-                                             ccl::reduction op,
-                                             ccl_comm* comm) {
+ccl::status ccl_coll_build_direct_allreduce(ccl_sched* sched,
+                                            ccl_buffer send_buf,
+                                            ccl_buffer recv_buf,
+                                            size_t count,
+                                            const ccl_datatype& dtype,
+                                            ccl::reduction op,
+                                            ccl_comm* comm) {
     LOG_DEBUG("build direct allreduce");
 
     entry_factory::make_entry<allreduce_entry>(sched, send_buf, recv_buf, count, dtype, op, comm);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
-                                                   ccl_buffer send_buf,
-                                                   ccl_buffer recv_buf,
-                                                   size_t count,
-                                                   const ccl_datatype& dtype,
-                                                   ccl::reduction op,
-                                                   ccl_comm* comm) {
+ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
+                                                  ccl_buffer send_buf,
+                                                  ccl_buffer recv_buf,
+                                                  size_t count,
+                                                  const ccl_datatype& dtype,
+                                                  ccl::reduction op,
+                                                  ccl_comm* comm) {
     LOG_DEBUG("build Rabenseifner's allreduce");
     CCL_ASSERT(sched != nullptr, "empty sched");
 
-    ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
     int comm_size, rank, newrank, pof2, rem;
     int i, send_idx, recv_idx, last_idx, mask, newdst, dst, send_cnt, recv_cnt;
     int *cnts = NULL, *disps = NULL;
@@ -269,16 +269,16 @@ ccl_status_t ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched,
     return status;
 }
 
-ccl_status_t ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
-                                                         ccl_buffer send_buf,
-                                                         ccl_buffer recv_buf,
-                                                         size_t count,
-                                                         const ccl_datatype& dtype,
-                                                         ccl::reduction op,
-                                                         ccl_comm* comm) {
+ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
+                                                        ccl_buffer send_buf,
+                                                        ccl_buffer recv_buf,
+                                                        size_t count,
+                                                        const ccl_datatype& dtype,
+                                                        ccl::reduction op,
+                                                        ccl_comm* comm) {
     LOG_DEBUG("build recursive_doubling allreduce");
 
-    ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
 
     int pof2, rem, comm_size, rank;
     int newrank, mask, newdst, dst;
@@ -378,18 +378,18 @@ ccl_status_t ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched,
     return status;
 }
 
-ccl_status_t ccl_coll_build_starlike_allreduce(ccl_sched* sched,
-                                               ccl_buffer send_buf,
-                                               ccl_buffer recv_buf,
-                                               size_t count,
-                                               const ccl_datatype& dtype,
-                                               ccl::reduction op,
-                                               ccl_comm* comm) {
+ccl::status ccl_coll_build_starlike_allreduce(ccl_sched* sched,
+                                              ccl_buffer send_buf,
+                                              ccl_buffer recv_buf,
+                                              size_t count,
+                                              const ccl_datatype& dtype,
+                                              ccl::reduction op,
+                                              ccl_comm* comm) {
     LOG_DEBUG("build starlike allreduce");
 
-    ccl_status_t status = ccl_status_success;
-    size_t comm_size = comm->size();
-    size_t this_rank = comm->rank();
+    ccl::status status = ccl::status::success;
+    int comm_size = comm->size();
+    int this_rank = comm->rank();
     size_t* buffer_counts =
         static_cast<size_t*>(CCL_MALLOC(comm_size * sizeof(size_t), "buffer_count"));
     size_t* buffer_offsets =
@@ -407,7 +407,7 @@ ccl_status_t ccl_coll_build_starlike_allreduce(ccl_sched* sched,
 
     // calculate counts and offsets for each rank
     size_t common_buffer_count = count / comm_size;
-    for (size_t rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
+    for (int rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
         buffer_counts[rank_idx] = common_buffer_count;
         buffer_offsets[rank_idx] = rank_idx * buffer_counts[rank_idx] * dtype_size;
     }
@@ -421,7 +421,7 @@ ccl_status_t ccl_coll_build_starlike_allreduce(ccl_sched* sched,
         tmp_buf = sched->alloc_buffer(this_rank_buf_size * (comm_size - 1));
 
     size_t tmp_buf_recv_idx = 0;
-    for (size_t rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
+    for (int rank_idx = 0; rank_idx < comm_size; ++rank_idx) {
         if (rank_idx != this_rank) {
             // send buffer to others
             entry_factory::make_chunked_send_entry(sched,
@@ -458,13 +458,13 @@ ccl_status_t ccl_coll_build_starlike_allreduce(ccl_sched* sched,
     return status;
 }
 
-ccl_status_t ccl_coll_build_ring_allreduce(ccl_sched* sched,
-                                           ccl_buffer send_buf,
-                                           ccl_buffer recv_buf,
-                                           size_t count,
-                                           const ccl_datatype& dtype,
-                                           ccl::reduction op,
-                                           ccl_comm* comm) {
+ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched,
+                                          ccl_buffer send_buf,
+                                          ccl_buffer recv_buf,
+                                          size_t count,
+                                          const ccl_datatype& dtype,
+                                          ccl::reduction op,
+                                          ccl_comm* comm) {
     int inplace = (send_buf == recv_buf) ? 1 : 0;
     LOG_DEBUG("build ring allreduce ", inplace ? "in-place" : "out-of-place");
 
@@ -476,13 +476,13 @@ ccl_status_t ccl_coll_build_ring_allreduce(ccl_sched* sched,
                      " recv ",
                      recv_buf);
 
-    ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
 
     ccl_coll_build_ring_reduce_scatter(sched, send_buf, recv_buf, count, dtype, op, comm);
 
     sched->add_barrier();
 
-    size_t comm_size = comm->size();
+    int comm_size = comm->size();
     size_t main_block_count = count / comm_size;
     size_t last_block_count = main_block_count + count % comm_size;
     std::vector<size_t> recv_counts(comm_size, main_block_count);
diff --git a/src/coll/algorithms/allreduce/allreduce_2d.cpp b/src/coll/algorithms/allreduce/allreduce_2d.cpp
index 76ad38245..487de6976 100644
--- a/src/coll/algorithms/allreduce/allreduce_2d.cpp
+++ b/src/coll/algorithms/allreduce/allreduce_2d.cpp
@@ -18,8 +18,9 @@
 #include "common/global/global.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
-ccl_allreduce_2d_builder::ccl_allreduce_2d_builder(size_t base_size, bool switch_dims, ccl_comm* comm) {
-
+ccl_allreduce_2d_builder::ccl_allreduce_2d_builder(size_t base_size,
+                                                   bool switch_dims,
+                                                   ccl_comm* comm) {
     parent_comm = comm;
 
     size_t vector_size = comm->size();
@@ -36,23 +37,19 @@ ccl_allreduce_2d_builder::ccl_allreduce_2d_builder(size_t base_size, bool switch
         }
     }
 
-    first_dim_comm = std::shared_ptr<ccl_comm>(
-        ccl_comm::create_with_colors(first_dim_colors,
-                                     ccl::global_data::get().comm_ids.get(),
-                                     comm, true /*share_resources*/));
+    first_dim_comm = std::shared_ptr<ccl_comm>(ccl_comm::create_with_colors(
+        first_dim_colors, ccl::global_data::get().comm_ids.get(), comm, true /*share_resources*/));
 
-    second_dim_comm = std::shared_ptr<ccl_comm>(
-        ccl_comm::create_with_colors(second_dim_colors,
-                                     ccl::global_data::get().comm_ids.get(),
-                                     comm, true /*share_resources*/));
+    second_dim_comm = std::shared_ptr<ccl_comm>(ccl_comm::create_with_colors(
+        second_dim_colors, ccl::global_data::get().comm_ids.get(), comm, true /*share_resources*/));
 
     if (comm->rank() == 0) {
         std::string first_dim_ranks, second_dim_ranks;
-        for (size_t idx = 0; idx < first_dim_comm->size(); idx++) {
+        for (int idx = 0; idx < first_dim_comm->size(); idx++) {
             first_dim_ranks +=
                 ((idx) ? " " : "") + std::to_string(first_dim_comm->get_global_rank(idx));
         }
-        for (size_t idx = 0; idx < second_dim_comm->size(); idx++) {
+        for (int idx = 0; idx < second_dim_comm->size(); idx++) {
             second_dim_ranks +=
                 ((idx) ? " " : "") + std::to_string(second_dim_comm->get_global_rank(idx));
         }
@@ -74,121 +71,120 @@ ccl_allreduce_2d_builder::ccl_allreduce_2d_builder(size_t base_size, bool switch
 }
 
 ccl_allreduce_2d_builder::~ccl_allreduce_2d_builder() {
-   first_dim_comm.reset();
-   second_dim_comm.reset();
+    first_dim_comm.reset();
+    second_dim_comm.reset();
 }
 
 static void ccl_allreduce_2d_add_allreduce_allgather(ccl_sched* sched,
-                                                    ccl_buffer send_buf,
-                                                    ccl_buffer recv_buf,
-                                                    size_t count,
-                                                    const ccl_datatype& dtype,
-                                                    ccl::reduction op,
-                                                    ccl_comm* comm,
-                                                    size_t chunk_idx,
-                                                    size_t chunk_count) {
-
-   ccl_comm* first_dim_comm = comm->allreduce_2d_builder->get_first_dim_comm();
-   ccl_comm* second_dim_comm = comm->allreduce_2d_builder->get_second_dim_comm();
-
-   size_t dtype_size = dtype.size();
-   size_t main_chunk_size = count / chunk_count;
-   size_t last_chunk_size = main_chunk_size + count % chunk_count;
-   size_t cnt = (chunk_idx == (chunk_count - 1)) ? last_chunk_size : main_chunk_size;
-   ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size;
-
-   size_t main_block_count = cnt / first_dim_comm->size();
-   size_t last_block_count = main_block_count + cnt % first_dim_comm->size();
-   size_t ar_count = (first_dim_comm->rank() == (first_dim_comm->size() - 1)) ? last_block_count
-                                                                              : main_block_count;
-
-   if (ar_count) {
-       /* TODO: add second level selection to distinguish high and low level algorithms */
-       ccl_buffer ar_buf = rbuf + first_dim_comm->rank() * main_block_count * dtype_size;
-       ccl_coll_build_starlike_allreduce(
-           sched, ar_buf, ar_buf, ar_count, dtype, op, second_dim_comm);
-       sched->add_barrier();
-   }
-
-   std::vector<size_t> ag_recv_counts(first_dim_comm->size(), main_block_count);
-   ag_recv_counts[first_dim_comm->size() - 1] = last_block_count;
-   ccl_coll_build_allgatherv(
-       sched, rbuf, ar_count, rbuf, ag_recv_counts.data(), dtype, first_dim_comm);
+                                                     ccl_buffer send_buf,
+                                                     ccl_buffer recv_buf,
+                                                     size_t count,
+                                                     const ccl_datatype& dtype,
+                                                     ccl::reduction op,
+                                                     ccl_comm* comm,
+                                                     size_t chunk_idx,
+                                                     size_t chunk_count) {
+    ccl_comm* first_dim_comm = comm->allreduce_2d_builder->get_first_dim_comm();
+    ccl_comm* second_dim_comm = comm->allreduce_2d_builder->get_second_dim_comm();
+
+    size_t dtype_size = dtype.size();
+    size_t main_chunk_size = count / chunk_count;
+    size_t last_chunk_size = main_chunk_size + count % chunk_count;
+    size_t cnt = (chunk_idx == (chunk_count - 1)) ? last_chunk_size : main_chunk_size;
+    ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size;
+
+    size_t main_block_count = cnt / first_dim_comm->size();
+    size_t last_block_count = main_block_count + cnt % first_dim_comm->size();
+    size_t ar_count = (first_dim_comm->rank() == (first_dim_comm->size() - 1)) ? last_block_count
+                                                                               : main_block_count;
+
+    if (ar_count) {
+        /* TODO: add second level selection to distinguish high and low level algorithms */
+        ccl_buffer ar_buf = rbuf + first_dim_comm->rank() * main_block_count * dtype_size;
+        ccl_coll_build_starlike_allreduce(
+            sched, ar_buf, ar_buf, ar_count, dtype, op, second_dim_comm);
+        sched->add_barrier();
+    }
+
+    std::vector<size_t> ag_recv_counts(first_dim_comm->size(), main_block_count);
+    ag_recv_counts[first_dim_comm->size() - 1] = last_block_count;
+    ccl_coll_build_allgatherv(
+        sched, rbuf, ar_count, rbuf, ag_recv_counts.data(), dtype, first_dim_comm);
 }
 
 static void ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(ccl_sched* sched,
-                                                                   ccl_buffer send_buf,
-                                                                   ccl_buffer recv_buf,
-                                                                   size_t count,
-                                                                   const ccl_datatype& dtype,
-                                                                   ccl::reduction op,
-                                                                   ccl_comm* comm,
-                                                                   size_t chunk_idx,
-                                                                   size_t chunk_count) {
-   ccl_comm* first_dim_comm = comm->allreduce_2d_builder->get_first_dim_comm();
-
-   size_t dtype_size = dtype.size();
-   size_t main_chunk_size = count / chunk_count;
-   size_t last_chunk_size = main_chunk_size + count % chunk_count;
-   size_t cnt = (chunk_idx == (chunk_count - 1)) ? last_chunk_size : main_chunk_size;
-   ccl_buffer sbuf = send_buf + chunk_idx * main_chunk_size * dtype_size;
-   ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size;
-
-   ccl_coll_build_reduce_scatter(sched, sbuf, rbuf, cnt, dtype, op, first_dim_comm, true);
-   sched->add_barrier();
-
-   if (chunk_idx == (chunk_count - 1) || (chunk_count == 1)) {
-       ccl_allreduce_2d_add_allreduce_allgather(
-           sched, send_buf, recv_buf, count, dtype, op, comm, chunk_idx, chunk_count);
-   }
-   else {
-       entry_factory::make_entry<subsched_entry>(
-           sched,
-           chunk_idx,
-           [send_buf, recv_buf, count, &dtype, op, comm, chunk_idx, chunk_count](ccl_sched* s) {
-               ccl_allreduce_2d_add_allreduce_allgather(
-                   s, send_buf, recv_buf, count, dtype, op, comm, chunk_idx, chunk_count);
-           },
-           "AR_AG");
-
-       entry_factory::make_entry<subsched_entry>(
-           sched,
-           chunk_idx + 1,
-           [send_buf, recv_buf, count, &dtype, op, comm, chunk_idx, chunk_count](ccl_sched* s) {
-               ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(
-                   s, send_buf, recv_buf, count, dtype, op, comm, chunk_idx + 1, chunk_count);
-           },
-           "RS_AR_AG");
-   }
+                                                                    ccl_buffer send_buf,
+                                                                    ccl_buffer recv_buf,
+                                                                    size_t count,
+                                                                    const ccl_datatype& dtype,
+                                                                    ccl::reduction op,
+                                                                    ccl_comm* comm,
+                                                                    size_t chunk_idx,
+                                                                    size_t chunk_count) {
+    ccl_comm* first_dim_comm = comm->allreduce_2d_builder->get_first_dim_comm();
+
+    size_t dtype_size = dtype.size();
+    size_t main_chunk_size = count / chunk_count;
+    size_t last_chunk_size = main_chunk_size + count % chunk_count;
+    size_t cnt = (chunk_idx == (chunk_count - 1)) ? last_chunk_size : main_chunk_size;
+    ccl_buffer sbuf = send_buf + chunk_idx * main_chunk_size * dtype_size;
+    ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size;
+
+    ccl_coll_build_reduce_scatter(sched, sbuf, rbuf, cnt, dtype, op, first_dim_comm, true);
+    sched->add_barrier();
+
+    if (chunk_idx == (chunk_count - 1) || (chunk_count == 1)) {
+        ccl_allreduce_2d_add_allreduce_allgather(
+            sched, send_buf, recv_buf, count, dtype, op, comm, chunk_idx, chunk_count);
+    }
+    else {
+        entry_factory::make_entry<subsched_entry>(
+            sched,
+            chunk_idx,
+            [send_buf, recv_buf, count, &dtype, op, comm, chunk_idx, chunk_count](ccl_sched* s) {
+                ccl_allreduce_2d_add_allreduce_allgather(
+                    s, send_buf, recv_buf, count, dtype, op, comm, chunk_idx, chunk_count);
+            },
+            "AR_AG");
+
+        entry_factory::make_entry<subsched_entry>(
+            sched,
+            chunk_idx + 1,
+            [send_buf, recv_buf, count, &dtype, op, comm, chunk_idx, chunk_count](ccl_sched* s) {
+                ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(
+                    s, send_buf, recv_buf, count, dtype, op, comm, chunk_idx + 1, chunk_count);
+            },
+            "RS_AR_AG");
+    }
 }
 
-ccl_status_t ccl_allreduce_2d_builder::build(ccl_sched* sched,
+ccl::status ccl_allreduce_2d_builder::build(ccl_sched* sched,
                                             ccl_buffer send_buf,
                                             ccl_buffer recv_buf,
                                             size_t count,
                                             const ccl_datatype& dtype,
                                             ccl::reduction op) {
-   CCL_THROW_IF_NOT(sched && send_buf && recv_buf && count,
-                    "incorrect values, sched ",
-                    sched,
-                    ", send ",
-                    send_buf,
-                    " recv ",
-                    recv_buf);
+    CCL_THROW_IF_NOT(sched && send_buf && recv_buf && count,
+                     "incorrect values, sched ",
+                     sched,
+                     ", send ",
+                     send_buf,
+                     " recv ",
+                     recv_buf);
 
-   ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
 
-   size_t chunk_count = ccl::global_data::env().ar2d_chunk_count;
+    size_t chunk_count = ccl::global_data::env().ar2d_chunk_count;
 
-   if (chunk_count == 0) {
-       LOG_ERROR("unexpected chunk_count");
-       chunk_count = 1;
-   }
+    if (chunk_count == 0) {
+        LOG_ERROR("unexpected chunk_count");
+        chunk_count = 1;
+    }
 
-   LOG_DEBUG("build 2d allreduce, chunk_count ", chunk_count);
+    LOG_DEBUG("build 2d allreduce, chunk_count ", chunk_count);
 
-   ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(
-       sched, send_buf, recv_buf, count, dtype, op, parent_comm, 0 /* chunk_idx */, chunk_count);
+    ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(
+        sched, send_buf, recv_buf, count, dtype, op, parent_comm, 0 /* chunk_idx */, chunk_count);
 
-   return status;
+    return status;
 }
diff --git a/src/coll/algorithms/allreduce/allreduce_2d.hpp b/src/coll/algorithms/allreduce/allreduce_2d.hpp
index 01c37531e..5c130c3c4 100644
--- a/src/coll/algorithms/allreduce/allreduce_2d.hpp
+++ b/src/coll/algorithms/allreduce/allreduce_2d.hpp
@@ -22,31 +22,31 @@ class comm;
 
 class ccl_allreduce_2d_builder {
 public:
-   ccl_allreduce_2d_builder(size_t base_size, bool switch_dims, ccl_comm* comm);
-   ~ccl_allreduce_2d_builder();
+    ccl_allreduce_2d_builder(size_t base_size, bool switch_dims, ccl_comm* comm);
+    ~ccl_allreduce_2d_builder();
 
-   ccl_allreduce_2d_builder(const ccl_allreduce_2d_builder&) = delete;
-   ccl_allreduce_2d_builder(ccl_allreduce_2d_builder&&) = delete;
+    ccl_allreduce_2d_builder(const ccl_allreduce_2d_builder&) = delete;
+    ccl_allreduce_2d_builder(ccl_allreduce_2d_builder&&) = delete;
 
-   ccl_allreduce_2d_builder& operator=(const ccl_allreduce_2d_builder&) = delete;
-   ccl_allreduce_2d_builder& operator=(ccl_allreduce_2d_builder&&) = delete;
+    ccl_allreduce_2d_builder& operator=(const ccl_allreduce_2d_builder&) = delete;
+    ccl_allreduce_2d_builder& operator=(ccl_allreduce_2d_builder&&) = delete;
 
-   ccl_status_t build(ccl_sched* sched,
+    ccl::status build(ccl_sched* sched,
                       ccl_buffer send_buf,
                       ccl_buffer recv_buf,
                       size_t count,
                       const ccl_datatype& dtype,
                       ccl::reduction op);
 
-   ccl_comm* get_first_dim_comm() const {
-       return first_dim_comm.get();
-   }
-   ccl_comm* get_second_dim_comm() const {
-       return second_dim_comm.get();
-   }
+    ccl_comm* get_first_dim_comm() const {
+        return first_dim_comm.get();
+    }
+    ccl_comm* get_second_dim_comm() const {
+        return second_dim_comm.get();
+    }
 
 private:
-   ccl_comm* parent_comm;
-   std::shared_ptr<ccl_comm> first_dim_comm;
-   std::shared_ptr<ccl_comm> second_dim_comm;
+    ccl_comm* parent_comm;
+    std::shared_ptr<ccl_comm> first_dim_comm;
+    std::shared_ptr<ccl_comm> second_dim_comm;
 };
diff --git a/src/coll/algorithms/allreduce/allreduce_rma.cpp b/src/coll/algorithms/allreduce/allreduce_rma.cpp
index 506a836b8..a74c95d92 100644
--- a/src/coll/algorithms/allreduce/allreduce_rma.cpp
+++ b/src/coll/algorithms/allreduce/allreduce_rma.cpp
@@ -18,113 +18,113 @@
 #include "sched/entry/factory/entry_factory.hpp"
 #include "exec/exec.hpp"
 
-ccl_status_t rma_ring_allreduce_reset_sync_flag(const void* ctx) {
+ccl::status rma_ring_allreduce_reset_sync_flag(const void* ctx) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     ar_handler->sync_flag = 0;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_reset_dst_ready_flag(const void* ctx) {
+ccl::status rma_ring_allreduce_reset_dst_ready_flag(const void* ctx) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     ar_handler->dst_ready_flag = 0;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_remote_sync_flag_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_remote_sync_flag_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = &(ar_handler->remote_sync_flag_mr);
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_sync_flag_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_sync_flag_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = ar_handler->sync_flag_mr;
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_sync_flags_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_sync_flags_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = ar_handler->sync_flags_mr;
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_send_buf_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_send_buf_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = ar_handler->send_buf_mr;
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_recv_buf_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_recv_buf_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = ar_handler->recv_buf_mr;
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_tmp_buf_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_tmp_buf_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = ar_handler->tmp_buf_mr;
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_dst_ready_flag_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_dst_ready_flag_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = ar_handler->dst_ready_flag_mr;
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_dst_ready_value_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_dst_ready_value_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = ar_handler->dst_ready_value_mr;
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_remote_dst_ready_flag_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_remote_dst_ready_flag_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = &(ar_handler->remote_dst_ready_flag_mr);
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_remote_rs_dst_buf_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_remote_rs_dst_buf_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = &(ar_handler->remote_rs_dst_buf_mr);
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t rma_ring_allreduce_get_remote_recv_buf_mr(const void* ctx, void* field_ptr) {
+ccl::status rma_ring_allreduce_get_remote_recv_buf_mr(const void* ctx, void* field_ptr) {
     ccl_rma_ring_allreduce_handler* ar_handler = (ccl_rma_ring_allreduce_handler*)ctx;
     atl_mr_t* mr = &(ar_handler->remote_recv_buf_mr);
     atl_mr_t** mr_ptr = (atl_mr_t**)field_ptr;
     *mr_ptr = mr;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
-                                               ccl_buffer send_buf,
-                                               ccl_buffer recv_buf,
-                                               size_t count,
-                                               const ccl_datatype& dtype,
-                                               ccl::reduction op,
-                                               ccl_comm* comm) {
+ccl::status ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
+                                              ccl_buffer send_buf,
+                                              ccl_buffer recv_buf,
+                                              size_t count,
+                                              const ccl_datatype& dtype,
+                                              ccl::reduction op,
+                                              ccl_comm* comm) {
     int inplace = (send_buf == recv_buf) ? 1 : 0;
     LOG_DEBUG("build ring rma allreduce (", (inplace) ? "in-place" : "out-of-place", ")");
 
@@ -136,10 +136,10 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
                      ", recv ",
                      recv_buf);
 
-    ccl_status_t status = ccl_status_success;
-    size_t comm_size, rank;
+    ccl::status status = ccl::status::success;
+    int comm_size, rank;
     size_t dtype_size = dtype.size();
-    size_t idx = 0;
+    int idx = 0;
     ccl_buffer tmp_buf;
     comm_size = comm->size();
     rank = comm->rank();
@@ -149,7 +149,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
             sched->add_barrier();
         }
-        return ccl_status_success;
+        return ccl::status::success;
     }
 
     ccl_rma_ring_allreduce_handler* ar_handler =
@@ -216,7 +216,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             sched,
             ccl_buffer(&ar_handler->tmp_buf_mr, sizeof(atl_mr_t)),
             sizeof(atl_mr_t),
-            ccl_datatype_char,
+            ccl_datatype_int8,
             ar_handler->src_peer,
             comm);
         e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_tmp_buf_mr, ar_handler);
@@ -226,7 +226,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             sched,
             ccl_buffer(&ar_handler->recv_buf_mr, sizeof(atl_mr_t)),
             sizeof(atl_mr_t),
-            ccl_datatype_char,
+            ccl_datatype_int8,
             ar_handler->src_peer,
             comm);
         e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_recv_buf_mr, ar_handler);
@@ -235,7 +235,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         sched,
         ccl_buffer(&ar_handler->recv_buf_mr, sizeof(atl_mr_t)),
         sizeof(atl_mr_t),
-        ccl_datatype_char,
+        ccl_datatype_int8,
         ar_handler->src_peer,
         comm);
     e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_recv_buf_mr, ar_handler);
@@ -244,7 +244,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         sched,
         ccl_buffer(&ar_handler->sync_flag_mr, sizeof(atl_mr_t)),
         sizeof(atl_mr_t),
-        ccl_datatype_char,
+        ccl_datatype_int8,
         ar_handler->src_peer,
         comm);
     e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_sync_flag_mr, ar_handler);
@@ -253,21 +253,21 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
         sched,
         ccl_buffer(&ar_handler->remote_rs_dst_buf_mr, sizeof(atl_mr_t)),
         sizeof(atl_mr_t),
-        ccl_datatype_char,
+        ccl_datatype_int8,
         ar_handler->dst_peer,
         comm);
     entry_factory::make_entry<recv_entry>(
         sched,
         ccl_buffer(&ar_handler->remote_recv_buf_mr, sizeof(atl_mr_t)),
         sizeof(atl_mr_t),
-        ccl_datatype_char,
+        ccl_datatype_int8,
         ar_handler->dst_peer,
         comm);
     entry_factory::make_entry<recv_entry>(
         sched,
         ccl_buffer(&ar_handler->remote_sync_flag_mr, sizeof(atl_mr_t)),
         sizeof(atl_mr_t),
-        ccl_datatype_char,
+        ccl_datatype_int8,
         ar_handler->dst_peer,
         comm);
 
@@ -276,7 +276,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             sched,
             ccl_buffer(ar_handler->dst_ready_flag_mr, sizeof(atl_mr_t)),
             sizeof(atl_mr_t),
-            ccl_datatype_char,
+            ccl_datatype_int8,
             ar_handler->dst_peer,
             comm);
         e->set_field_fn<ccl_sched_entry_field_buf>(rma_ring_allreduce_get_dst_ready_flag_mr,
@@ -285,7 +285,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             sched,
             ccl_buffer(&ar_handler->remote_dst_ready_flag_mr, sizeof(atl_mr_t)),
             sizeof(atl_mr_t),
-            ccl_datatype_char,
+            ccl_datatype_int8,
             ar_handler->src_peer,
             comm);
     }
@@ -301,7 +301,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             ccl_buffer(&ar_handler->dst_ready_value, sizeof(uint64_t)),
             (atl_mr_t*)nullptr, /* src_mr */
             sizeof(uint64_t),
-            ccl_datatype_char,
+            ccl_datatype_int8,
             ar_handler->src_peer,
             (atl_mr_t*)nullptr, /* dst_mr */
             0 /* dst_buf_offset */,
@@ -320,7 +320,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             sched, rma_ring_allreduce_reset_dst_ready_flag, ar_handler);
     }
 
-    size_t block_idx = rank;
+    int block_idx = rank;
     size_t main_block_count = count / comm_size;
     size_t buf_offset;
 
@@ -362,7 +362,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             ccl_buffer(&ar_handler->sync_flags[idx], sizeof(uint64_t)),
             (atl_mr_t*)nullptr, /* src_mr */
             sizeof(uint64_t),
-            ccl_datatype_char,
+            ccl_datatype_int8,
             ar_handler->dst_peer,
             (atl_mr_t*)nullptr, /* dst_mr */
             0 /* dst_buf_offset */,
@@ -423,7 +423,7 @@ ccl_status_t ccl_coll_build_ring_rma_allreduce(ccl_sched* sched,
             ccl_buffer(&ar_handler->sync_flags[flag_idx_offset + idx], sizeof(uint64_t)),
             (atl_mr_t*)nullptr, /* src_mr */
             sizeof(uint64_t),
-            ccl_datatype_char,
+            ccl_datatype_int8,
             ar_handler->dst_peer,
             (atl_mr_t*)nullptr, /* dst_mr */
             0 /* dst_buf_offset */,
diff --git a/src/coll/algorithms/allreduce/allreduce_rma.hpp b/src/coll/algorithms/allreduce/allreduce_rma.hpp
index 2c8013a83..76e2075c8 100644
--- a/src/coll/algorithms/allreduce/allreduce_rma.hpp
+++ b/src/coll/algorithms/allreduce/allreduce_rma.hpp
@@ -20,8 +20,8 @@
 typedef struct {
     int wait_dst;
 
-    size_t src_peer;
-    size_t dst_peer;
+    int src_peer;
+    int dst_peer;
 
     volatile uint64_t sync_flag; // src side will write here the index of iteration it completed
     atl_mr_t* sync_flag_mr;
diff --git a/src/coll/algorithms/alltoall.cpp b/src/coll/algorithms/alltoall.cpp
index 17db2b02e..2bd43f5cf 100644
--- a/src/coll/algorithms/alltoall.cpp
+++ b/src/coll/algorithms/alltoall.cpp
@@ -16,14 +16,14 @@
 #include "coll/algorithms/algorithms.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
-ccl_status_t ccl_coll_build_direct_alltoall(ccl_sched* sched,
-                                            ccl_buffer send_buf,
-                                            ccl_buffer recv_buf,
-                                            size_t count,
-                                            const ccl_datatype& dtype,
-                                            ccl_comm* comm) {
+ccl::status ccl_coll_build_direct_alltoall(ccl_sched* sched,
+                                           ccl_buffer send_buf,
+                                           ccl_buffer recv_buf,
+                                           size_t count,
+                                           const ccl_datatype& dtype,
+                                           ccl_comm* comm) {
     LOG_DEBUG("build direct alltoall");
 
     entry_factory::make_entry<alltoall_entry>(sched, send_buf, recv_buf, count, dtype, comm);
-    return ccl_status_success;
+    return ccl::status::success;
 }
diff --git a/src/coll/algorithms/alltoallv.cpp b/src/coll/algorithms/alltoallv.cpp
index 1e180ce3c..f9675483b 100644
--- a/src/coll/algorithms/alltoallv.cpp
+++ b/src/coll/algorithms/alltoallv.cpp
@@ -26,22 +26,22 @@
 #include "sched/entry/factory/chunked_entry_factory.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
-ccl_status_t ccl_coll_build_direct_alltoallv(ccl_sched* sched,
-                                             ccl_buffer send_buf,
-                                             const size_t* send_counts,
-                                             ccl_buffer recv_buf,
-                                             const size_t* recv_counts,
-                                             const ccl_datatype& dtype,
-                                             ccl_comm* comm) {
+ccl::status ccl_coll_build_direct_alltoallv(ccl_sched* sched,
+                                            ccl_buffer send_buf,
+                                            const size_t* send_counts,
+                                            ccl_buffer recv_buf,
+                                            const size_t* recv_counts,
+                                            const ccl_datatype& dtype,
+                                            ccl_comm* comm) {
     LOG_DEBUG("build direct alltoallv");
 
     entry_factory::make_entry<alltoallv_entry>(
         sched, send_buf, send_counts, recv_buf, recv_counts, dtype, comm);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_add_scatter_alltoallv_barriers(std::vector<ccl_sched*>& scheds,
-                                                     size_t sched_idx) {
+ccl::status ccl_coll_add_scatter_alltoallv_barriers(std::vector<ccl_sched*>& scheds,
+                                                    size_t sched_idx) {
     ssize_t max_ops = ccl::global_data::env().alltoall_scatter_max_ops;
 
     if (max_ops != CCL_ENV_SIZET_NOT_SPECIFIED) {
@@ -56,23 +56,23 @@ ccl_status_t ccl_coll_add_scatter_alltoallv_barriers(std::vector<ccl_sched*>& sc
         }
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_calculate_alltoallv_counts(const ccl_coll_param& coll_param,
-                                                 std::vector<size_t>& send_counts,
-                                                 std::vector<size_t>& recv_counts,
-                                                 std::vector<size_t>& send_offsets,
-                                                 std::vector<size_t>& recv_offsets,
-                                                 size_t& total_send_count,
-                                                 size_t& total_recv_count,
-                                                 size_t& total_send_bytes,
-                                                 size_t& total_recv_bytes) {
+ccl::status ccl_coll_calculate_alltoallv_counts(const ccl_coll_param& coll_param,
+                                                std::vector<size_t>& send_counts,
+                                                std::vector<size_t>& recv_counts,
+                                                std::vector<size_t>& send_offsets,
+                                                std::vector<size_t>& recv_offsets,
+                                                size_t& total_send_count,
+                                                size_t& total_recv_count,
+                                                size_t& total_send_bytes,
+                                                size_t& total_recv_bytes) {
     ccl_coll_type coll_type = coll_param.ctype;
     ccl_comm* comm = coll_param.comm;
     const ccl_datatype& dtype = coll_param.dtype;
 
-    size_t comm_size = comm->size();
+    int comm_size = comm->size();
     size_t dtype_size = dtype.size();
 
     if (coll_type == ccl_coll_alltoall) {
@@ -91,7 +91,7 @@ ccl_status_t ccl_coll_calculate_alltoallv_counts(const ccl_coll_param& coll_para
     send_offsets.resize(comm_size, 0);
     recv_offsets.resize(comm_size, 0);
 
-    for (size_t idx = 1; idx < comm_size; idx++) {
+    for (int idx = 1; idx < comm_size; idx++) {
         send_offsets[idx] = send_offsets[idx - 1] + send_counts[idx - 1] * dtype_size;
         recv_offsets[idx] = recv_offsets[idx - 1] + recv_counts[idx - 1] * dtype_size;
     }
@@ -111,19 +111,19 @@ ccl_status_t ccl_coll_calculate_alltoallv_counts(const ccl_coll_param& coll_para
               ", total_recv_bytes ",
               total_recv_bytes);
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
-                                            std::vector<ccl_sched*>& scheds,
-                                            const ccl_coll_param& coll_param) {
+ccl::status ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
+                                           std::vector<ccl_sched*>& scheds,
+                                           const ccl_coll_param& coll_param) {
     LOG_DEBUG("build naive alltoallv");
 
     ccl_comm* comm = coll_param.comm;
     const ccl_datatype& dtype = coll_param.dtype;
 
-    size_t comm_rank = comm->rank();
-    size_t comm_size = comm->size();
+    int comm_rank = comm->rank();
+    int comm_size = comm->size();
     size_t sched_count = scheds.size();
     size_t dtype_size = dtype.size();
 
@@ -159,7 +159,7 @@ ccl_status_t ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
                                               dtype);
     }
 
-    for (size_t idx = 0; idx < comm_size; idx++) {
+    for (int idx = 0; idx < comm_size; idx++) {
         if (idx == comm_rank)
             continue;
 
@@ -203,19 +203,19 @@ ccl_status_t ccl_coll_build_naive_alltoallv(ccl_master_sched* main_sched,
         }
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
-                                              std::vector<ccl_sched*>& scheds,
-                                              const ccl_coll_param& coll_param) {
+ccl::status ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
+                                             std::vector<ccl_sched*>& scheds,
+                                             const ccl_coll_param& coll_param) {
     LOG_DEBUG("build scatter alltoall");
 
     ccl_comm* comm = coll_param.comm;
     const ccl_datatype& dtype = coll_param.dtype;
 
-    size_t comm_rank = comm->rank();
-    size_t comm_size = comm->size();
+    int comm_rank = comm->rank();
+    int comm_size = comm->size();
     size_t sched_count = scheds.size();
     size_t dtype_size = dtype.size();
 
@@ -255,8 +255,8 @@ ccl_status_t ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
                                               dtype);
     }
 
-    for (size_t idx = 0; idx < comm_size; idx++) {
-        size_t src = (comm_rank + idx) % comm_size;
+    for (int idx = 0; idx < comm_size; idx++) {
+        int src = (comm_rank + idx) % comm_size;
 
         if (src == comm_rank)
             continue;
@@ -281,8 +281,8 @@ ccl_status_t ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
         ccl_coll_add_scatter_alltoallv_barriers(scheds, sched_idx);
     }
 
-    for (size_t idx = 0; idx < comm_size; idx++) {
-        size_t dst = (comm_rank - idx + comm_size) % comm_size;
+    for (int idx = 0; idx < comm_size; idx++) {
+        int dst = (comm_rank - idx + comm_size) % comm_size;
 
         if (dst == comm_rank)
             continue;
@@ -305,11 +305,11 @@ ccl_status_t ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
     }
 
     if (!inplace)
-        return ccl_status_success;
+        return ccl::status::success;
 
     main_sched->sync_partial_scheds();
 
-    for (size_t idx = 0; idx < comm_size; idx++) {
+    for (int idx = 0; idx < comm_size; idx++) {
         if (idx == comm_rank)
             continue;
 
@@ -325,19 +325,19 @@ ccl_status_t ccl_coll_build_scatter_alltoallv(ccl_master_sched* main_sched,
                                               dtype);
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sched,
-                                                      std::vector<ccl_sched*>& scheds,
-                                                      const ccl_coll_param& coll_param) {
+ccl::status ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sched,
+                                                     std::vector<ccl_sched*>& scheds,
+                                                     const ccl_coll_param& coll_param) {
     LOG_DEBUG("build scatter_barrier alltoallv");
 
     ccl_comm* comm = coll_param.comm;
     const ccl_datatype& dtype = coll_param.dtype;
 
-    size_t comm_rank = comm->rank();
-    size_t comm_size = comm->size();
+    int comm_rank = comm->rank();
+    int comm_size = comm->size();
     size_t sched_count = scheds.size();
     size_t dtype_size = dtype.size();
 
@@ -394,8 +394,8 @@ ccl_status_t ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sch
                                               dtype);
     }
 
-    for (size_t idx = 0; idx < comm_size; idx++) {
-        size_t src = (comm_rank + idx) % comm_size;
+    for (int idx = 0; idx < comm_size; idx++) {
+        int src = (comm_rank + idx) % comm_size;
 
         if (src == comm_rank)
             continue;
@@ -423,8 +423,8 @@ ccl_status_t ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sch
         ccl_coll_add_scatter_alltoallv_barriers(recv_scheds, sched_idx);
     }
 
-    for (size_t idx = 0; idx < comm_size; idx++) {
-        size_t dst = (comm_rank - idx + comm_size) % comm_size;
+    for (int idx = 0; idx < comm_size; idx++) {
+        int dst = (comm_rank - idx + comm_size) % comm_size;
 
         if (dst == comm_rank)
             continue;
@@ -447,11 +447,11 @@ ccl_status_t ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sch
     }
 
     if (!inplace)
-        return ccl_status_success;
+        return ccl::status::success;
 
     main_sched->sync_partial_scheds();
 
-    for (size_t idx = 0; idx < comm_size; idx++) {
+    for (int idx = 0; idx < comm_size; idx++) {
         if (idx == comm_rank)
             continue;
 
@@ -467,5 +467,5 @@ ccl_status_t ccl_coll_build_scatter_barrier_alltoallv(ccl_master_sched* main_sch
                                               dtype);
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
diff --git a/src/coll/algorithms/barrier.cpp b/src/coll/algorithms/barrier.cpp
index f34184512..5aa05e094 100644
--- a/src/coll/algorithms/barrier.cpp
+++ b/src/coll/algorithms/barrier.cpp
@@ -22,17 +22,17 @@
 #include "coll/algorithms/algorithms.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
-ccl_status_t ccl_coll_build_direct_barrier(ccl_sched* sched, ccl_comm* comm) {
+ccl::status ccl_coll_build_direct_barrier(ccl_sched* sched, ccl_comm* comm) {
     LOG_DEBUG("build direct barrier");
 
     entry_factory::make_entry<barrier_entry>(sched, comm);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* comm) {
+ccl::status ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* comm) {
     LOG_DEBUG("build dissemination barrier");
 
-    ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
     int size, rank, src, dst, mask;
     size = comm->size();
     rank = comm->rank();
@@ -44,8 +44,8 @@ ccl_status_t ccl_coll_build_dissemination_barrier(ccl_sched* sched, ccl_comm* co
     while (mask < size) {
         dst = (rank + mask) % size;
         src = (rank - mask + size) % size;
-        entry_factory::make_entry<send_entry>(sched, ccl_buffer(), 0, ccl_datatype_char, dst, comm);
-        entry_factory::make_entry<recv_entry>(sched, ccl_buffer(), 0, ccl_datatype_char, src, comm);
+        entry_factory::make_entry<send_entry>(sched, ccl_buffer(), 0, ccl_datatype_int8, dst, comm);
+        entry_factory::make_entry<recv_entry>(sched, ccl_buffer(), 0, ccl_datatype_int8, src, comm);
         sched->add_barrier();
         mask <<= 1;
     }
diff --git a/src/coll/algorithms/bcast.cpp b/src/coll/algorithms/bcast.cpp
index ec5de1a97..59dcf76e0 100644
--- a/src/coll/algorithms/bcast.cpp
+++ b/src/coll/algorithms/bcast.cpp
@@ -13,222 +13,222 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-
-/*
-*
-*  (C) 2001 by Argonne National Laboratory.
-*      See COPYRIGHT in top-level directory.
-*/
-
-#include "coll/algorithms/algorithms.hpp"
-#include "sched/entry/factory/entry_factory.hpp"
-
-#define MIN(a, b) std::min(a, b)
-
-ccl_status_t ccl_coll_build_direct_bcast(ccl_sched* sched,
-                                         ccl_buffer buf,
-                                         size_t count,
-                                         const ccl_datatype& dtype,
-                                         size_t root,
-                                         ccl_comm* comm) {
-    LOG_DEBUG("build direct bcast");
-
-    entry_factory::make_entry<bcast_entry>(sched, buf, count, dtype, root, comm);
-    return ccl_status_success;
-}
-
-ccl_status_t ccl_coll_build_naive_bcast(ccl_sched* sched,
-                                        ccl_buffer buf,
-                                        size_t count,
-                                        const ccl_datatype& dtype,
-                                        size_t root,
-                                        ccl_comm* comm) {
-    LOG_DEBUG("build naive bcast");
-
-    ccl_status_t status = ccl_status_success;
-
-    size_t rank = comm->rank();
-    size_t comm_size = comm->size();
-    size_t idx;
-
-    if (comm_size == 1)
-        goto fn_exit;
-
-    if (rank == root) {
-        for (idx = 0; idx < comm_size; idx++) {
-            if (idx != rank) {
-                entry_factory::make_entry<send_entry>(sched, buf, count, dtype, idx, comm);
-            }
-        }
-    }
-    else {
-        entry_factory::make_entry<recv_entry>(sched, buf, count, dtype, root, comm);
-    }
-
-fn_exit:
-    return status;
-}
-
-ccl_status_t ccl_coll_build_scatter_for_bcast(ccl_sched* sched,
-                                              ccl_buffer tmp_buf,
-                                              size_t root,
-                                              size_t nbytes,
-                                              ccl_comm* comm) {
-    LOG_DEBUG("build scatter_for_bcast");
-
-    ccl_status_t status = ccl_status_success;
-    int rank, local_root, comm_size, src, dst;
-    int relative_rank, mask;
-    int scatter_size, curr_size, recv_size, send_size;
-
-    comm_size = comm->size();
-    rank = comm->rank();
-    local_root = static_cast<int>(root);
-    relative_rank = (rank >= local_root) ? rank - local_root : rank - local_root + comm_size;
-
-    /* The scatter algorithm divides the buffer into nprocs pieces and
-     * scatters them among the processes. Root gets the first piece,
-     * root+1 gets the second piece, and so forth. Uses the same
-     * binomial tree algorithm as above. Ceiling division is used to
-     * compute the size of each piece. This means some processes may
-     * not get any data. For example if bufsize = 97 and nprocs = 16,
-     * ranks 15 and 16 will get 0 data. On each process, the scattered
-     * data is stored at the same offset in the buffer as it is on the
-     * root process. */
-
-    scatter_size = (nbytes + comm_size - 1) / comm_size; /* ceiling division */
-    curr_size = (rank == local_root) ? nbytes : 0; /* root starts with all the data */
-
-    mask = 0x1;
-    while (mask < comm_size) {
-        if (relative_rank & mask) {
-            src = rank - mask;
-            if (src < 0)
-                src += comm_size;
-
-            /* compute the exact recv_size to avoid writing this NBC
-             * in callback style */
-            recv_size = nbytes - (relative_rank * scatter_size);
-            if (recv_size < 0)
-                recv_size = 0;
-
-            curr_size = recv_size;
-
-            if (recv_size > 0) {
-                entry_factory::make_entry<recv_entry>(sched,
-                                                      tmp_buf + relative_rank * scatter_size,
-                                                      recv_size,
-                                                      ccl_datatype_char,
-                                                      src,
-                                                      comm);
-                sched->add_barrier();
-            }
-            break;
-        }
-        mask <<= 1;
-    }
-
-    /* This process is responsible for all processes that have bits
-     * set from the LSB upto (but not including) mask.  Because of the
-     * "not including", we start by shifting mask back down one. */
-
-    mask >>= 1;
-    while (mask > 0) {
-        if (relative_rank + mask < comm_size) {
-            send_size = curr_size - scatter_size * mask;
-
-            /* mask is also the size of this process's subtree */
-
-            if (send_size > 0) {
-                dst = rank + mask;
-                if (dst >= comm_size)
-                    dst -= comm_size;
-
-                entry_factory::make_entry<send_entry>(
-                    sched,
-                    tmp_buf + scatter_size * (relative_rank + mask),
-                    send_size,
-                    ccl_datatype_char,
-                    dst,
-                    comm);
-                sched->add_barrier();
-                curr_size -= send_size;
-            }
-        }
-        mask >>= 1;
-    }
-
-    return status;
-}
-
-ccl_status_t ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched,
-                                                         ccl_buffer buf,
-                                                         size_t count,
-                                                         const ccl_datatype& dtype,
-                                                         size_t root,
-                                                         ccl_comm* comm) {
-    LOG_DEBUG("build scatter_ring_allgather bcast");
-
-    ccl_status_t status = ccl_status_success;
-
-    int comm_size, rank, nbytes;
-    int scatter_size, curr_size;
-    int i, j, jnext, left, right;
-    size_t dtype_size = dtype.size();
-
-    comm_size = comm->size();
-    rank = comm->rank();
-
-    ccl_buffer tmp_buf(buf);
-
-    /* If there is only one process, return */
-    if (comm_size == 1)
-        goto fn_exit;
-
-    nbytes = dtype_size * count;
-
-    CCL_CALL(ccl_coll_build_scatter_for_bcast(sched, tmp_buf, root, nbytes, comm));
-
-    /* this is the block size used for the scatter operation */
-    scatter_size = (nbytes + comm_size - 1) / comm_size; /* ceiling division */
-
-    /* curr_size is the amount of data that this process now has stored in
-     * buffer at byte offset (rank*scatter_size) */
-    curr_size = MIN(scatter_size, (nbytes - (rank * scatter_size)));
-    if (curr_size < 0)
-        curr_size = 0;
-
-    /* long-message allgather or medium-size but non-power-of-two. use ring algorithm. */
-
-    left = (comm_size + rank - 1) % comm_size;
-    right = (rank + 1) % comm_size;
-
-    j = rank;
-    jnext = left;
-    for (i = 1; i < comm_size; i++) {
-        int left_count, right_count, left_disp, right_disp, rel_j, rel_jnext;
-
-        rel_j = (j - root + comm_size) % comm_size;
-        rel_jnext = (jnext - root + comm_size) % comm_size;
-        left_count = MIN(scatter_size, (nbytes - rel_jnext * scatter_size));
-        if (left_count < 0)
-            left_count = 0;
-        left_disp = rel_jnext * scatter_size;
-        right_count = MIN(scatter_size, (nbytes - rel_j * scatter_size));
-        if (right_count < 0)
-            right_count = 0;
-        right_disp = rel_j * scatter_size;
-        entry_factory::make_entry<send_entry>(
-            sched, tmp_buf + right_disp, right_count, ccl_datatype_char, right, comm);
-        /* sendrecv, no barrier here */
-        entry_factory::make_entry<recv_entry>(
-            sched, tmp_buf + left_disp, left_count, ccl_datatype_char, left, comm);
-        sched->add_barrier();
-
-        j = jnext;
-        jnext = (comm_size + jnext - 1) % comm_size;
-    }
-
-fn_exit:
-    return status;
-}
+
+/*
+*
+*  (C) 2001 by Argonne National Laboratory.
+*      See COPYRIGHT in top-level directory.
+*/
+
+#include "coll/algorithms/algorithms.hpp"
+#include "sched/entry/factory/entry_factory.hpp"
+
+#define MIN(a, b) std::min(a, b)
+
+ccl::status ccl_coll_build_direct_bcast(ccl_sched* sched,
+                                        ccl_buffer buf,
+                                        size_t count,
+                                        const ccl_datatype& dtype,
+                                        int root,
+                                        ccl_comm* comm) {
+    LOG_DEBUG("build direct bcast");
+
+    entry_factory::make_entry<bcast_entry>(sched, buf, count, dtype, root, comm);
+    return ccl::status::success;
+}
+
+ccl::status ccl_coll_build_naive_bcast(ccl_sched* sched,
+                                       ccl_buffer buf,
+                                       size_t count,
+                                       const ccl_datatype& dtype,
+                                       int root,
+                                       ccl_comm* comm) {
+    LOG_DEBUG("build naive bcast");
+
+    ccl::status status = ccl::status::success;
+
+    int rank = comm->rank();
+    int comm_size = comm->size();
+    int idx;
+
+    if (comm_size == 1)
+        goto fn_exit;
+
+    if (rank == root) {
+        for (idx = 0; idx < comm_size; idx++) {
+            if (idx != rank) {
+                entry_factory::make_entry<send_entry>(sched, buf, count, dtype, idx, comm);
+            }
+        }
+    }
+    else {
+        entry_factory::make_entry<recv_entry>(sched, buf, count, dtype, root, comm);
+    }
+
+fn_exit:
+    return status;
+}
+
+ccl::status ccl_coll_build_scatter_for_bcast(ccl_sched* sched,
+                                             ccl_buffer tmp_buf,
+                                             int root,
+                                             size_t nbytes,
+                                             ccl_comm* comm) {
+    LOG_DEBUG("build scatter_for_bcast");
+
+    ccl::status status = ccl::status::success;
+    int rank, local_root, comm_size, src, dst;
+    int relative_rank, mask;
+    int scatter_size, curr_size, recv_size, send_size;
+
+    comm_size = comm->size();
+    rank = comm->rank();
+    local_root = static_cast<int>(root);
+    relative_rank = (rank >= local_root) ? rank - local_root : rank - local_root + comm_size;
+
+    /* The scatter algorithm divides the buffer into nprocs pieces and
+     * scatters them among the processes. Root gets the first piece,
+     * root+1 gets the second piece, and so forth. Uses the same
+     * binomial tree algorithm as above. Ceiling division is used to
+     * compute the size of each piece. This means some processes may
+     * not get any data. For example if bufsize = 97 and nprocs = 16,
+     * relative ranks 14 and 15 will get 0 data. On each process, the scattered
+     * data is stored at the same offset in the buffer as it is on the
+     * root process. */
+
+    scatter_size = (nbytes + comm_size - 1) / comm_size; /* ceiling division */
+    curr_size = (rank == local_root) ? nbytes : 0; /* root starts with all the data */
+
+    mask = 0x1;
+    while (mask < comm_size) {
+        if (relative_rank & mask) {
+            src = rank - mask;
+            if (src < 0)
+                src += comm_size;
+
+            /* compute the exact recv_size to avoid writing this NBC
+             * in callback style */
+            recv_size = nbytes - (relative_rank * scatter_size);
+            if (recv_size < 0)
+                recv_size = 0;
+
+            curr_size = recv_size;
+
+            if (recv_size > 0) {
+                entry_factory::make_entry<recv_entry>(sched,
+                                                      tmp_buf + relative_rank * scatter_size,
+                                                      recv_size,
+                                                      ccl_datatype_int8,
+                                                      src,
+                                                      comm);
+                sched->add_barrier();
+            }
+            break;
+        }
+        mask <<= 1;
+    }
+
+    /* This process is responsible for all processes that have bits
+     * set from the LSB up to (but not including) mask.  Because of the
+     * "not including", we start by shifting mask back down one. */
+
+    mask >>= 1;
+    while (mask > 0) {
+        if (relative_rank + mask < comm_size) {
+            send_size = curr_size - scatter_size * mask;
+
+            /* mask is also the size of this process's subtree */
+
+            if (send_size > 0) {
+                dst = rank + mask;
+                if (dst >= comm_size)
+                    dst -= comm_size;
+
+                entry_factory::make_entry<send_entry>(
+                    sched,
+                    tmp_buf + scatter_size * (relative_rank + mask),
+                    send_size,
+                    ccl_datatype_int8,
+                    dst,
+                    comm);
+                sched->add_barrier();
+                curr_size -= send_size;
+            }
+        }
+        mask >>= 1;
+    }
+
+    return status;
+}
+
+ccl::status ccl_coll_build_scatter_ring_allgather_bcast(ccl_sched* sched,
+                                                        ccl_buffer buf,
+                                                        size_t count,
+                                                        const ccl_datatype& dtype,
+                                                        int root,
+                                                        ccl_comm* comm) {
+    LOG_DEBUG("build scatter_ring_allgather bcast");
+
+    ccl::status status = ccl::status::success;
+
+    int comm_size, rank, nbytes;
+    int scatter_size, curr_size;
+    int i, j, jnext, left, right;
+    size_t dtype_size = dtype.size();
+
+    comm_size = comm->size();
+    rank = comm->rank();
+
+    ccl_buffer tmp_buf(buf);
+
+    /* If there is only one process, return */
+    if (comm_size == 1)
+        goto fn_exit;
+
+    nbytes = dtype_size * count;
+
+    CCL_CALL(ccl_coll_build_scatter_for_bcast(sched, tmp_buf, root, nbytes, comm));
+
+    /* this is the block size used for the scatter operation */
+    scatter_size = (nbytes + comm_size - 1) / comm_size; /* ceiling division */
+
+    /* curr_size is the amount of data that this process now has stored in
+     * buffer at byte offset (rank*scatter_size) */
+    curr_size = MIN(scatter_size, (nbytes - (rank * scatter_size)));
+    if (curr_size < 0)
+        curr_size = 0;
+
+    /* long-message allgather or medium-size but non-power-of-two. use ring algorithm. */
+
+    left = (comm_size + rank - 1) % comm_size;
+    right = (rank + 1) % comm_size;
+
+    j = rank;
+    jnext = left;
+    for (i = 1; i < comm_size; i++) {
+        int left_count, right_count, left_disp, right_disp, rel_j, rel_jnext;
+
+        rel_j = (j - root + comm_size) % comm_size;
+        rel_jnext = (jnext - root + comm_size) % comm_size;
+        left_count = MIN(scatter_size, (nbytes - rel_jnext * scatter_size));
+        if (left_count < 0)
+            left_count = 0;
+        left_disp = rel_jnext * scatter_size;
+        right_count = MIN(scatter_size, (nbytes - rel_j * scatter_size));
+        if (right_count < 0)
+            right_count = 0;
+        right_disp = rel_j * scatter_size;
+        entry_factory::make_entry<send_entry>(
+            sched, tmp_buf + right_disp, right_count, ccl_datatype_int8, right, comm);
+        /* sendrecv, no barrier here */
+        entry_factory::make_entry<recv_entry>(
+            sched, tmp_buf + left_disp, left_count, ccl_datatype_int8, left, comm);
+        sched->add_barrier();
+
+        j = jnext;
+        jnext = (comm_size + jnext - 1) % comm_size;
+    }
+
+fn_exit:
+    return status;
+}
diff --git a/src/coll/algorithms/double_tree_ops.cpp b/src/coll/algorithms/double_tree_ops.cpp
index a981b867e..e124673f2 100644
--- a/src/coll/algorithms/double_tree_ops.cpp
+++ b/src/coll/algorithms/double_tree_ops.cpp
@@ -143,16 +143,16 @@ static void reduce_bcast_tree(const ccl_bin_tree& tree,
     }
 }
 
-ccl_status_t ccl_coll_build_double_tree_op(ccl_sched* sched,
-                                           ccl_coll_type coll_type,
-                                           ccl_buffer send_buf,
-                                           ccl_buffer recv_buf,
-                                           size_t count,
-                                           const ccl_datatype& dtype,
-                                           ccl::reduction op,
-                                           const ccl_double_tree& dtree,
-                                           ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched,
+                                          ccl_coll_type coll_type,
+                                          ccl_buffer send_buf,
+                                          ccl_buffer recv_buf,
+                                          size_t count,
+                                          const ccl_datatype& dtype,
+                                          ccl::reduction op,
+                                          const ccl_double_tree& dtree,
+                                          ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     LOG_DEBUG("build double tree ", ccl_coll_type_to_str(coll_type));
 
diff --git a/src/coll/algorithms/reduce.cpp b/src/coll/algorithms/reduce.cpp
index ee99e7f93..65707afed 100644
--- a/src/coll/algorithms/reduce.cpp
+++ b/src/coll/algorithms/reduce.cpp
@@ -51,32 +51,32 @@
            n.(1+(p-1)/p).gamma
 */
 
-ccl_status_t ccl_coll_build_direct_reduce(ccl_sched* sched,
-                                          ccl_buffer send_buf,
-                                          ccl_buffer recv_buf,
-                                          size_t count,
-                                          const ccl_datatype& dtype,
-                                          ccl::reduction reduction,
-                                          size_t root,
-                                          ccl_comm* comm) {
+ccl::status ccl_coll_build_direct_reduce(ccl_sched* sched,
+                                         ccl_buffer send_buf,
+                                         ccl_buffer recv_buf,
+                                         size_t count,
+                                         const ccl_datatype& dtype,
+                                         ccl::reduction reduction,
+                                         int root,
+                                         ccl_comm* comm) {
     LOG_DEBUG("build direct reduce");
 
     entry_factory::make_entry<reduce_entry>(
         sched, send_buf, recv_buf, count, dtype, reduction, root, comm);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
-                                                ccl_buffer send_buf,
-                                                ccl_buffer recv_buf,
-                                                size_t count,
-                                                const ccl_datatype& dtype,
-                                                ccl::reduction reduction,
-                                                size_t root,
-                                                ccl_comm* comm) {
+ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
+                                               ccl_buffer send_buf,
+                                               ccl_buffer recv_buf,
+                                               size_t count,
+                                               const ccl_datatype& dtype,
+                                               ccl::reduction reduction,
+                                               int root,
+                                               ccl_comm* comm) {
     LOG_DEBUG("build Rabenseifner's reduce");
 
-    ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
 
     int i, j, comm_size, rank, local_root, pof2;
     int rem, dst, new_rank, new_dst, mask, send_idx, recv_idx, last_idx;
@@ -347,17 +347,17 @@ ccl_status_t ccl_coll_build_rabenseifner_reduce(ccl_sched* sched,
     return status;
 }
 
-ccl_status_t ccl_coll_build_binomial_reduce(ccl_sched* sched,
-                                            ccl_buffer send_buf,
-                                            ccl_buffer recv_buf,
-                                            size_t count,
-                                            const ccl_datatype& dtype,
-                                            ccl::reduction reduction,
-                                            size_t root,
-                                            ccl_comm* comm) {
+ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched,
+                                           ccl_buffer send_buf,
+                                           ccl_buffer recv_buf,
+                                           size_t count,
+                                           const ccl_datatype& dtype,
+                                           ccl::reduction reduction,
+                                           int root,
+                                           ccl_comm* comm) {
     LOG_DEBUG("build binomial reduce");
 
-    ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
 
     int comm_size, rank, local_root;
     int mask, relrank, source, lroot;
diff --git a/src/coll/algorithms/reduce_scatter.cpp b/src/coll/algorithms/reduce_scatter.cpp
index 659c9ec7b..4a6774ff7 100644
--- a/src/coll/algorithms/reduce_scatter.cpp
+++ b/src/coll/algorithms/reduce_scatter.cpp
@@ -23,29 +23,27 @@
 #include "coll/algorithms/algorithms.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
-ccl_status_t ccl_coll_build_direct_reduce_scatter(ccl_sched* sched,
+ccl::status ccl_coll_build_direct_reduce_scatter(ccl_sched* sched,
                                                  ccl_buffer send_buf,
                                                  ccl_buffer recv_buf,
                                                  size_t recv_count,
                                                  const ccl_datatype& dtype,
                                                  ccl::reduction reduction,
-                                                 ccl_comm* comm)
-{
+                                                 ccl_comm* comm) {
     LOG_DEBUG("build direct reduce_scatter");
 
     entry_factory::make_entry<reduce_scatter_entry>(
         sched, send_buf, recv_buf, recv_count, dtype, reduction, comm);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
-                                                      ccl_buffer send_buf,
-                                                      ccl_buffer recv_buf,
-                                                      size_t recv_count,
-                                                      const ccl_datatype& dtype,
-                                                      ccl::reduction op,
-                                                      ccl_comm* comm)
-{
+ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
+                                                     ccl_buffer send_buf,
+                                                     ccl_buffer recv_buf,
+                                                     size_t recv_count,
+                                                     const ccl_datatype& dtype,
+                                                     ccl::reduction op,
+                                                     ccl_comm* comm) {
     CCL_THROW_IF_NOT(sched && send_buf && recv_buf,
                      "incorrect values, sched ",
                      sched,
@@ -55,37 +53,31 @@ ccl_status_t ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
                      recv_buf);
 
     int inplace = (send_buf == recv_buf) ? 1 : 0;
-    LOG_DEBUG("build ring reduce_scatter_block: ",
-              inplace ? "in-place" : "out-of-place");
+    LOG_DEBUG("build ring reduce_scatter_block: ", inplace ? "in-place" : "out-of-place");
 
-    ccl_status_t status = ccl_status_success;
-    size_t comm_size, rank, idx;
+    ccl::status status = ccl::status::success;
+    int comm_size, rank, idx;
     size_t dtype_size = dtype.size();
 
-    size_t src, dst;
+    int src, dst;
 
     comm_size = comm->size();
     rank = comm->rank();
 
     if (recv_count == 0) {
-        return ccl_status_success;
+        return ccl::status::success;
     }
 
     if (!inplace) {
         /* copy local data into recv_buf */
         entry_factory::make_entry<copy_entry>(
-            sched,
-            send_buf + rank * recv_count * dtype_size,
-            recv_buf,
-            recv_count,
-            dtype);
+            sched, send_buf + rank * recv_count * dtype_size, recv_buf, recv_count, dtype);
     }
 
     /* allocate temporary buffer to store incoming data */
     ccl_buffer tmp_buf = sched->alloc_buffer(recv_count * dtype_size);
 
     for (idx = 1; idx < comm_size; idx++) {
-
         src = (comm_size + rank - idx) % comm_size;
         dst = (rank + idx) % comm_size;
 
@@ -93,59 +85,31 @@ ccl_status_t ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
          * needs from src into tmp_recvbuf */
         if (!inplace) {
             entry_factory::make_entry<send_entry>(
-                sched,
-                send_buf + dst * recv_count * dtype_size,
-                recv_count,
-                dtype,
-                dst,
-                comm);
-
-            entry_factory::make_entry<recv_entry>(
-                sched,
-                tmp_buf,
-                recv_count,
-                dtype,
-                src,
-                comm);
+                sched, send_buf + dst * recv_count * dtype_size, recv_count, dtype, dst, comm);
+
+            entry_factory::make_entry<recv_entry>(sched, tmp_buf, recv_count, dtype, src, comm);
         }
         else {
             entry_factory::make_entry<send_entry>(
-                sched,
-                recv_buf + dst * recv_count * dtype_size,
-                recv_count,
-                dtype,
-                dst,
-                comm);
-
-            entry_factory::make_entry<recv_entry>(
-                sched,
-                tmp_buf,
-                recv_count,
-                dtype,
-                src,
-                comm);
+                sched, recv_buf + dst * recv_count * dtype_size, recv_count, dtype, dst, comm);
+
+            entry_factory::make_entry<recv_entry>(sched, tmp_buf, recv_count, dtype, src, comm);
         }
 
         sched->add_barrier();
 
         if (!inplace) {
             entry_factory::make_entry<reduce_local_entry>(
-                sched,
-                tmp_buf,
-                recv_count,
-                recv_buf,
-                nullptr,
-                dtype,
-                op);
-        } else {
-            entry_factory::make_entry<reduce_local_entry>(
-                sched,
-                tmp_buf,
-                recv_count,
-                recv_buf + rank * recv_count * dtype_size,
-                nullptr,
-                dtype,
-                op);
+                sched, tmp_buf, recv_count, recv_buf, nullptr, dtype, op);
+        }
+        else {
+            entry_factory::make_entry<reduce_local_entry>(sched,
+                                                          tmp_buf,
+                                                          recv_count,
+                                                          recv_buf + rank * recv_count * dtype_size,
+                                                          nullptr,
+                                                          dtype,
+                                                          op);
         }
     }
 
@@ -153,25 +117,20 @@ ccl_status_t ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched,
      * recv_buf. already done for rank 0 */
     if (inplace && (rank != 0)) {
         entry_factory::make_entry<copy_entry>(
-            sched,
-            recv_buf + rank * recv_count * dtype_size,
-            recv_buf,
-            recv_count,
-            dtype);
+            sched, recv_buf + rank * recv_count * dtype_size, recv_buf, recv_count, dtype);
     }
 
     return status;
 }
 
 /* behaves like reduce_scatter_block but last block may contain more elements */
-ccl_status_t ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
-                                                ccl_buffer send_buf,
-                                                ccl_buffer recv_buf,
-                                                size_t send_count,
-                                                const ccl_datatype& dtype,
-                                                ccl::reduction op,
-                                                ccl_comm* comm) {
-
+ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
+                                               ccl_buffer send_buf,
+                                               ccl_buffer recv_buf,
+                                               size_t send_count,
+                                               const ccl_datatype& dtype,
+                                               ccl::reduction op,
+                                               ccl_comm* comm) {
     LOG_DEBUG("build ring reduce_scatter");
 
     CCL_THROW_IF_NOT(sched && send_buf && recv_buf,
@@ -182,23 +141,24 @@ ccl_status_t ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
                      " recv ",
                      recv_buf);
 
-    ccl_status_t status = ccl_status_success;
-    size_t comm_size, rank;
+    ccl::status status = ccl::status::success;
+    int comm_size, rank;
     size_t dtype_size = dtype.size();
 
     comm_size = comm->size();
     rank = comm->rank();
 
-    size_t src = (comm_size + rank - 1) % comm_size;
-    size_t dst = (comm_size + rank + 1) % comm_size;
+    int src = (comm_size + rank - 1) % comm_size;
+    int dst = (comm_size + rank + 1) % comm_size;
 
     size_t count = send_count;
     size_t bytes = count * dtype_size;
 
-    size_t chunk_count = (bytes >= ccl::global_data::env().rs_min_chunk_size &&
-                          count >= ccl::global_data::env().rs_chunk_count && count >= comm_size)
-                             ? ccl::global_data::env().rs_chunk_count
-                             : 1;
+    size_t chunk_count =
+        (bytes >= ccl::global_data::env().rs_min_chunk_size &&
+         count >= ccl::global_data::env().rs_chunk_count && (int)count >= comm_size)
+            ? ccl::global_data::env().rs_chunk_count
+            : 1;
 
     while ((chunk_count > 1) &&
            (bytes / (comm_size * chunk_count) < ccl::global_data::env().rs_min_chunk_size)) {
@@ -221,7 +181,7 @@ ccl_status_t ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
             entry_factory::make_entry<copy_entry>(sched, send_buf, recv_buf, count, dtype);
             sched->add_barrier();
         }
-        return ccl_status_success;
+        return ccl::status::success;
     }
 
     ccl_buffer tmp_buf;
@@ -238,10 +198,10 @@ ccl_status_t ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
     /* the final reduction result on last iteration in corresponsing block */
 
     /* block = group of ~ equal-sized chunks */
-    size_t block_idx = (rank + comm_size - 1) % comm_size;
+    int block_idx = (rank + comm_size - 1) % comm_size;
     size_t main_block_size = count / comm_size;
     size_t last_block_size = main_block_size + count % comm_size;
-    size_t send_block_idx, recv_block_idx;
+    int send_block_idx, recv_block_idx;
     size_t send_block_size, recv_block_size;
     size_t send_block_offset, recv_block_offset;
 
@@ -257,7 +217,7 @@ ccl_status_t ccl_coll_build_ring_reduce_scatter(ccl_sched* sched,
 
     ccl_recv_reduce_result_buf_type recv_reduce_result_type;
 
-    for (size_t idx = 0; idx < (comm_size - 1); idx++) {
+    for (int idx = 0; idx < (comm_size - 1); idx++) {
         send_block_idx = block_idx;
         recv_block_idx = (comm_size + block_idx - 1) % comm_size;
 
diff --git a/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp b/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
index 104d98cba..2daba7276 100644
--- a/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
+++ b/src/coll/algorithms/sparse_allreduce/sparse_allreduce.hpp
@@ -13,7 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "coll/algorithms/sparse_allreduce/sparse_handler.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
@@ -74,7 +74,7 @@
                 break; \
             default: \
                 CCL_FATAL("unexpected sparse_allreduce_algo ", ccl_coll_algorithm_to_str(algo)); \
-                return ccl_status_invalid_arguments; \
+                return ccl::status::invalid_arguments; \
         } \
     } while (0)
 
@@ -84,25 +84,14 @@
             case ccl::datatype::float32: \
                 CCL_SPARSE_ALLREDUCE_SELECT_ALGO(itype, float, algo); \
                 break; \
-            case ccl::datatype::float64: \
-                CCL_SPARSE_ALLREDUCE_SELECT_ALGO(itype, double, algo); \
-                break; \
-            case ccl::datatype::int8: CCL_SPARSE_ALLREDUCE_SELECT_ALGO(itype, char, algo); break; \
-            case ccl::datatype::int32: CCL_SPARSE_ALLREDUCE_SELECT_ALGO(itype, int, algo); break; \
-            case ccl::datatype::int64: \
-                CCL_SPARSE_ALLREDUCE_SELECT_ALGO(itype, int64_t, algo); \
-                break; \
-            case ccl::datatype::uint64: \
-                CCL_SPARSE_ALLREDUCE_SELECT_ALGO(itype, uint64_t, algo); \
-                break; \
             case ccl::datatype::bfloat16: \
-                CCL_SPARSE_ALLREDUCE_SELECT_ALGO(itype, ccl::bf16, algo); \
+                CCL_SPARSE_ALLREDUCE_SELECT_ALGO(itype, ccl::bfloat16, algo); \
                 break; \
             default: \
                 CCL_FATAL("value datatype ", \
                           ccl::global_data::get().dtypes->name(vtype), \
                           " is not supported yet"); \
-                return ccl_status_invalid_arguments; \
+                return ccl::status::invalid_arguments; \
         } \
     } while (0)
 
@@ -141,7 +130,7 @@
         sa_handler->size_per_rank = \
             static_cast<size_t*>(sched->alloc_buffer(sizeof(size_t) * comm_size).get_ptr()); \
 \
-        for (size_t i = 0; i < comm_size; i++) \
+        for (int i = 0; i < comm_size; i++) \
             sa_handler->size_per_rank[i] = sizeof(size_t); \
 \
         sa_handler->send_ibuf = send_ind_buf.get_ptr(); \
@@ -172,7 +161,7 @@
         param_nnz.recv_buf = ccl_buffer(sa_handler->recv_counts, sizeof(size_t) * comm_size); \
         param_nnz.send_count = sizeof(size_t); \
         param_nnz.recv_counts = sa_handler->size_per_rank; \
-        param_nnz.dtype = ccl_datatype_char; \
+        param_nnz.dtype = ccl_datatype_int8; \
         param_nnz.comm = comm; \
 \
         entry_factory::make_entry<coll_entry>(sched, param_nnz); \
@@ -180,7 +169,7 @@
     } while (0)
 
 template <typename vtype>
-typename std::enable_if<!std::is_same<vtype, ccl::bf16>::value, vtype>::type get_mask(
+typename std::enable_if<!std::is_same<vtype, ccl::bfloat16>::value, vtype>::type get_mask(
     ccl::reduction op) {
     switch (op) {
         case ccl::reduction::sum: return 0;
@@ -189,22 +178,22 @@ typename std::enable_if<!std::is_same<vtype, ccl::bf16>::value, vtype>::type get
         case ccl::reduction::max: return std::numeric_limits<vtype>::min();
         case ccl::reduction::custom:
             CCL_FATAL("custom reduction is not supported for sparse_allreduce/mask algorithm");
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
         default: return 0;
     }
 }
 
 template <typename vtype>
-typename std::enable_if<std::is_same<vtype, ccl::bf16>::value, vtype>::type get_mask(
+typename std::enable_if<std::is_same<vtype, ccl::bfloat16>::value, vtype>::type get_mask(
     ccl::reduction op) {
     switch (op) {
-        case ccl::reduction::sum: return 0;
-        case ccl::reduction::prod: return CCL_BF16_ONE;
-        case ccl::reduction::min: return CCL_BF16_MAX;
-        case ccl::reduction::max: return CCL_BF16_MIN;
+        case ccl::reduction::sum: return ccl::bfloat16(0);
+        case ccl::reduction::prod: return ccl::bfloat16(CCL_BF16_ONE);
+        case ccl::reduction::min: return ccl::bfloat16(CCL_BF16_MAX);
+        case ccl::reduction::max: return ccl::bfloat16(CCL_BF16_MIN);
         case ccl::reduction::custom:
             CCL_FATAL("custom reduction is not supported for sparse_allreduce/mask algorithm");
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
         default: return 0;
     }
 }
@@ -300,7 +289,7 @@ void sparse_coalesce(ccl_sparse_allreduce_handler* sah) {
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t sparse_reduce_ring(const void* ctx) {
+ccl::status sparse_reduce_ring(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
 
     /* Having received the msg we should prepare it for further send operation to the next neighbour. 
@@ -371,11 +360,11 @@ ccl_status_t sparse_reduce_ring(const void* ctx) {
         ccl_comp_copy(snd_i,
                       buf_i.data(),
                       sa_handler->itype_size * sa_handler->dst_count[0],
-                      ccl_datatype_char);
+                      ccl_datatype_int8);
         ccl_comp_copy(snd_v,
                       buf_v.data(),
                       sa_handler->vtype_size * sa_handler->dst_count[1],
-                      ccl_datatype_char);
+                      ccl_datatype_int8);
 
         size_t idx_offset = 0;
         for (auto id : unique_indices_ids) {
@@ -409,13 +398,13 @@ ccl_status_t sparse_reduce_ring(const void* ctx) {
         ccl_comp_copy(buf_i.data(),
                       (i_type*)(sa_handler->dst_buf),
                       sa_handler->itype_size * merge_idx_len,
-                      ccl_datatype_char);
+                      ccl_datatype_int8);
 
         ccl_comp_copy(
             buf_v.data(),
             (v_type*)((char*)(sa_handler->dst_buf) + sa_handler->itype_size * merge_idx_len),
             sa_handler->vtype_size * merge_idx_len * sa_handler->val_dim_cnt,
-            ccl_datatype_char);
+            ccl_datatype_int8);
 
         sa_handler->dst_count[0] = merge_idx_len;
         sa_handler->dst_count[1] = merge_idx_len * sa_handler->val_dim_cnt;
@@ -425,15 +414,15 @@ ccl_status_t sparse_reduce_ring(const void* ctx) {
     ccl_comp_copy(sa_handler->recv_buf,
                   sa_handler->send_tmp_buf,
                   idx_size + sa_handler->send_count[1] * sa_handler->vtype_size,
-                  ccl_datatype_char);
+                  ccl_datatype_int8);
 
     sa_handler->iter++;
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t sparse_prepare_result_ring(const void* ctx) {
+ccl::status sparse_prepare_result_ring(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
 
     /* data should be returned as sorted in the result buffer */
@@ -458,25 +447,25 @@ ccl_status_t sparse_prepare_result_ring(const void* ctx) {
 
     sa_handler->iv_map->clear();
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_send_count_ring(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_send_count_ring(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     size_t* cnt_ptr = (size_t*)field_ptr;
     *cnt_ptr = sa_handler->send_count[0] *
                (sa_handler->itype_size + sa_handler->val_dim_cnt * sa_handler->vtype_size);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_send_buf_ring(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_send_buf_ring(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(sa_handler->send_tmp_buf);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_recv_count_ring(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_recv_count_ring(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
 
     size_t* cnt_ptr = (size_t*)field_ptr;
@@ -485,21 +474,21 @@ ccl_status_t sparse_get_recv_count_ring(const void* ctx, void* field_ptr) {
                                 sa_handler->comm_size];
 
     *cnt_ptr = nnz * (sa_handler->itype_size + sa_handler->val_dim_cnt * sa_handler->vtype_size);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_recv_buf_ring(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_recv_buf_ring(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(sa_handler->recv_buf);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_set_max_buf_size_ring(const void* ctx) {
+ccl::status sparse_set_max_buf_size_ring(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     size_t max_nnz = sa_handler->recv_counts[0];
 
-    for (size_t i = 1; i < sa_handler->comm_size; i++) {
+    for (int i = 1; i < sa_handler->comm_size; i++) {
         if (max_nnz < sa_handler->recv_counts[i]) {
             max_nnz = sa_handler->recv_counts[i];
         }
@@ -514,11 +503,11 @@ ccl_status_t sparse_set_max_buf_size_ring(const void* ctx) {
         sa_handler->send_tmp_buf, sa_handler->dst_buf, sa_handler->dst_count[0] * common_size_part);
     sa_handler->recv_buf = sa_handler->sched->alloc_buffer(max_size).get_ptr();
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t sparse_coalesce_ring(const void* ctx) {
+ccl::status sparse_coalesce_ring(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
 
     sparse_coalesce<i_type, v_type>(sa_handler);
@@ -530,27 +519,27 @@ ccl_status_t sparse_coalesce_ring(const void* ctx) {
     CCL_MEMCPY(&sa_handler->dst_count, &sa_handler->send_count, sizeof(size_t) * 2);
 
     CCL_SPARSE_ALLREDUCE_IF_SINGLE_RANK();
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
-                                                  ccl_buffer send_ind_buf,
-                                                  size_t send_ind_count,
-                                                  ccl_buffer send_val_buf,
-                                                  size_t send_val_count,
-                                                  void** recv_ind_buf,
-                                                  size_t* recv_ind_count,
-                                                  void** recv_val_buf,
-                                                  size_t* recv_val_count,
-                                                  const ccl_datatype& index_dtype,
-                                                  const ccl_datatype& value_dtype,
-                                                  ccl::reduction op,
-                                                  ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
-
-    size_t comm_size = comm->size();
-    size_t rank = comm->rank();
+ccl::status ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
+                                                 ccl_buffer send_ind_buf,
+                                                 size_t send_ind_count,
+                                                 ccl_buffer send_val_buf,
+                                                 size_t send_val_count,
+                                                 void** recv_ind_buf,
+                                                 size_t* recv_ind_count,
+                                                 void** recv_val_buf,
+                                                 size_t* recv_val_count,
+                                                 const ccl_datatype& index_dtype,
+                                                 const ccl_datatype& value_dtype,
+                                                 ccl::reduction op,
+                                                 ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
+
+    int comm_size = comm->size();
+    int rank = comm->rank();
 
     /* get data type sizes */
     size_t vtype_size = sizeof(v_type);
@@ -570,10 +559,10 @@ ccl_status_t ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
 
     /* send from left to right (ring)*/
     /* receive from the left neighbour */
-    size_t recv_from = (rank - 1 + comm_size) % comm_size;
+    int recv_from = (rank - 1 + comm_size) % comm_size;
 
     /* send to the right neighbour */
-    size_t send_to = (rank + 1) % comm_size;
+    int send_to = (rank + 1) % comm_size;
 
     sa_handler->recv_from = recv_from;
     sa_handler->iter = 0;
@@ -591,16 +580,16 @@ ccl_status_t ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
         entry_factory::make_entry<function_entry>(sched, sparse_set_max_buf_size_ring, sa_handler);
         sched->add_barrier();
 
-        for (size_t i = 0; i < comm_size - 1; i++) {
+        for (int i = 0; i < comm_size - 1; i++) {
             /* send local data to the right neighbour */
             send_entry* se = entry_factory::make_entry<send_entry>(
-                sched, ccl_buffer(), 0, ccl_datatype_char, send_to, comm);
+                sched, ccl_buffer(), 0, ccl_datatype_int8, send_to, comm);
             se->set_field_fn<ccl_sched_entry_field_buf>(sparse_get_send_buf_ring, sa_handler);
             se->set_field_fn<ccl_sched_entry_field_cnt>(sparse_get_send_count_ring, sa_handler);
 
             /* receive data from the left neighbour */
             recv_entry* re = entry_factory::make_entry<recv_entry>(
-                sched, ccl_buffer(), 0, ccl_datatype_char, recv_from, comm);
+                sched, ccl_buffer(), 0, ccl_datatype_int8, recv_from, comm);
             re->set_field_fn<ccl_sched_entry_field_buf>(sparse_get_recv_buf_ring, sa_handler);
             re->set_field_fn<ccl_sched_entry_field_cnt>(sparse_get_recv_count_ring, sa_handler);
             sched->add_barrier();
@@ -621,7 +610,7 @@ ccl_status_t ccl_coll_build_sparse_allreduce_ring(ccl_sched* sched,
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t sparse_create_matrix_mask(const void* ctx) {
+ccl::status sparse_create_matrix_mask(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     LOG_TRACE("sa_handler: ",
               sa_handler,
@@ -674,7 +663,7 @@ ccl_status_t sparse_create_matrix_mask(const void* ctx) {
     ccl_comp_copy(matrix,
                   (char*)sa_handler->dst_buf + idx_cnt * sa_handler->itype_size,
                   matrix_size,
-                  ccl_datatype_char);
+                  ccl_datatype_int8);
 
     CCL_FREE(matrix);
     sa_handler->iv_map->clear();
@@ -686,27 +675,27 @@ ccl_status_t sparse_create_matrix_mask(const void* ctx) {
     *sa_handler->recv_ibuf = sa_handler->dst_buf;
     *sa_handler->recv_vbuf = ((char*)sa_handler->dst_buf + sa_handler->itype_size * idx_cnt);
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_allreduce_buf_mask(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_allreduce_buf_mask(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(*sa_handler->recv_vbuf);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_allreduce_count_mask(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_allreduce_count_mask(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     size_t* cnt_ptr = (size_t*)field_ptr;
     *cnt_ptr = *sa_handler->recv_vcount;
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_nnz_per_rank_mask(const void* ctx) {
+ccl::status sparse_nnz_per_rank_mask(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     sa_handler->recv_buf_count = 0;
-    for (size_t i = 0; i < sa_handler->comm_size; i++) {
+    for (int i = 0; i < sa_handler->comm_size; i++) {
         sa_handler->recv_buf_count += sa_handler->recv_counts[i];
     }
 
@@ -714,32 +703,32 @@ ccl_status_t sparse_nnz_per_rank_mask(const void* ctx) {
         sa_handler->sched->alloc_buffer(sa_handler->itype_size * sa_handler->recv_buf_count)
             .get_ptr();
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_allgatherv_buf_mask(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_allgatherv_buf_mask(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(sa_handler->recv_buf);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_send_buf_mask(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_send_buf_mask(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(sa_handler->dst_buf);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_send_count_mask(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_send_count_mask(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     size_t* count = (size_t*)field_ptr;
     *count = sa_handler->dst_count[0];
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t sparse_coalesce_mask(const void* ctx) {
+ccl::status sparse_coalesce_mask(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
 
     sparse_coalesce<i_type, v_type>(sa_handler);
@@ -750,26 +739,26 @@ ccl_status_t sparse_coalesce_mask(const void* ctx) {
     sa_handler->dst_count[1] = iv_map_cnt * sa_handler->val_dim_cnt;
 
     CCL_SPARSE_ALLREDUCE_IF_SINGLE_RANK();
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
-                                                  ccl_buffer send_ind_buf,
-                                                  size_t send_ind_count,
-                                                  ccl_buffer send_val_buf,
-                                                  size_t send_val_count,
-                                                  void** recv_ind_buf,
-                                                  size_t* recv_ind_count,
-                                                  void** recv_val_buf,
-                                                  size_t* recv_val_count,
-                                                  const ccl_datatype& index_dtype,
-                                                  const ccl_datatype& value_dtype,
-                                                  ccl::reduction op,
-                                                  ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
-
-    size_t comm_size = comm->size();
+ccl::status ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
+                                                 ccl_buffer send_ind_buf,
+                                                 size_t send_ind_count,
+                                                 ccl_buffer send_val_buf,
+                                                 size_t send_val_count,
+                                                 void** recv_ind_buf,
+                                                 size_t* recv_ind_count,
+                                                 void** recv_val_buf,
+                                                 size_t* recv_val_count,
+                                                 const ccl_datatype& index_dtype,
+                                                 const ccl_datatype& value_dtype,
+                                                 ccl::reduction op,
+                                                 ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
+
+    int comm_size = comm->size();
 
     /* get data type sizes */
     size_t itype_size = sizeof(i_type);
@@ -840,11 +829,11 @@ ccl_status_t ccl_coll_build_sparse_allreduce_mask(ccl_sched* sched,
     return status;
 }
 
-ccl_status_t sparse_alloc_result_buf_allgatherv(const void* ctx) {
+ccl::status sparse_alloc_result_buf_allgatherv(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
 
     sa_handler->recv_buf_count = 0;
-    for (size_t i = 0; i < sa_handler->comm_size; i++) {
+    for (int i = 0; i < sa_handler->comm_size; i++) {
         sa_handler->recv_buf_count += sa_handler->recv_counts[i];
     }
 
@@ -883,21 +872,21 @@ ccl_status_t sparse_alloc_result_buf_allgatherv(const void* ctx) {
 
     CCL_THROW_IF_NOT(sa_handler->all_idx_buf && sa_handler->all_val_buf);
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <size_t stride_per_comm>
-ccl_status_t sparse_set_v_counts_allgatherv(const void* ctx) {
+ccl::status sparse_set_v_counts_allgatherv(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     size_t stride = stride_per_comm * sa_handler->comm_size;
-    for (size_t i = 0; i < sa_handler->comm_size; i++) {
+    for (int i = 0; i < sa_handler->comm_size; i++) {
         sa_handler->recv_counts[i + stride] = sa_handler->recv_counts[i] * sa_handler->val_dim_cnt;
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_return_gathered_allgatherv(const void* ctx) {
+ccl::status sparse_return_gathered_allgatherv(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     *sa_handler->recv_icount = sa_handler->recv_buf_count;
     *sa_handler->recv_vcount = sa_handler->recv_buf_count * sa_handler->val_dim_cnt;
@@ -905,11 +894,11 @@ ccl_status_t sparse_return_gathered_allgatherv(const void* ctx) {
     *sa_handler->recv_ibuf = sa_handler->all_idx_buf;
     *sa_handler->recv_vbuf = sa_handler->all_val_buf;
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t sparse_reduce_gathered_allgatherv(const void* ctx) {
+ccl::status sparse_reduce_gathered_allgatherv(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     i_type* indices = static_cast<i_type*>(sa_handler->all_idx_buf);
     v_type* values = static_cast<v_type*>(sa_handler->all_val_buf);
@@ -997,39 +986,39 @@ ccl_status_t sparse_reduce_gathered_allgatherv(const void* ctx) {
     *sa_handler->recv_ibuf = i_recv;
     *sa_handler->recv_vbuf = v_recv;
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_i_recv_allgatherv(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_i_recv_allgatherv(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(sa_handler->all_idx_buf);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_i_send_allgatherv(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_i_send_allgatherv(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(sa_handler->dst_ibuf);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <int send_count_src_index>
-ccl_status_t sparse_get_send_count_allgatherv(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_send_count_allgatherv(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     size_t* send_buf_count = (size_t*)field_ptr;
     *send_buf_count = sa_handler->send_count[send_count_src_index];
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_v_recv_allgatherv(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_v_recv_allgatherv(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(sa_handler->all_val_buf);
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t sparse_get_v_send_allgatherv(const void* ctx, void* field_ptr) {
+ccl::status sparse_get_v_send_allgatherv(const void* ctx, void* field_ptr) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     if (sa_handler->sched->coll_attr.sparse_coalesce_mode == ccl::sparse_coalesce_mode::disable) {
@@ -1039,11 +1028,11 @@ ccl_status_t sparse_get_v_send_allgatherv(const void* ctx, void* field_ptr) {
         buf_ptr->set(sa_handler->dst_vbuf);
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t sparse_coalesce_allgatherv(const void* ctx) {
+ccl::status sparse_coalesce_allgatherv(const void* ctx) {
     ccl_sparse_allreduce_handler* sa_handler = (ccl_sparse_allreduce_handler*)ctx;
 
     sparse_coalesce<i_type, v_type>(sa_handler);
@@ -1060,26 +1049,26 @@ ccl_status_t sparse_coalesce_allgatherv(const void* ctx) {
         *sa_handler->recv_vbuf = sa_handler->dst_vbuf;
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename i_type, typename v_type>
-ccl_status_t ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
-                                                          ccl_buffer send_ind_buf,
-                                                          size_t send_ind_count,
-                                                          ccl_buffer send_val_buf,
-                                                          size_t send_val_count,
-                                                          void** recv_ind_buf,
-                                                          size_t* recv_ind_count,
-                                                          void** recv_val_buf,
-                                                          size_t* recv_val_count,
-                                                          const ccl_datatype& index_dtype,
-                                                          const ccl_datatype& value_dtype,
-                                                          ccl::reduction op,
-                                                          ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
-
-    size_t comm_size = comm->size();
+ccl::status ccl_coll_build_sparse_allreduce_3_allgatherv(ccl_sched* sched,
+                                                         ccl_buffer send_ind_buf,
+                                                         size_t send_ind_count,
+                                                         ccl_buffer send_val_buf,
+                                                         size_t send_val_count,
+                                                         void** recv_ind_buf,
+                                                         size_t* recv_ind_count,
+                                                         void** recv_val_buf,
+                                                         size_t* recv_val_count,
+                                                         const ccl_datatype& index_dtype,
+                                                         const ccl_datatype& value_dtype,
+                                                         ccl::reduction op,
+                                                         ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
+
+    int comm_size = comm->size();
 
     /* get data type sizes */
     size_t vtype_size = sizeof(v_type);
diff --git a/src/coll/algorithms/sparse_allreduce/sparse_handler.hpp b/src/coll/algorithms/sparse_allreduce/sparse_handler.hpp
index a37bcc04b..26e972491 100644
--- a/src/coll/algorithms/sparse_allreduce/sparse_handler.hpp
+++ b/src/coll/algorithms/sparse_allreduce/sparse_handler.hpp
@@ -24,9 +24,9 @@ struct ccl_sparse_allreduce_handler {
     size_t recv_buf_count;
     size_t itype_size;
     size_t vtype_size;
-    size_t comm_size;
+    int comm_size;
     size_t buf_size;
-    size_t recv_from;
+    int recv_from;
     size_t iter; /*iteration within ring algorithm*/
 
     size_t send_count[2];
diff --git a/src/coll/ccl_allgather_op_attr.hpp b/src/coll/ccl_allgather_op_attr.hpp
index 6de415fb6..b9c43842b 100644
--- a/src/coll/ccl_allgather_op_attr.hpp
+++ b/src/coll/ccl_allgather_op_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 namespace ccl {
 
@@ -27,9 +27,8 @@ class ccl_allgatherv_attr_impl_t : public ccl_operation_attr_impl_t {
 
     ccl_allgatherv_attr_impl_t(const base_t& base);
     ccl_allgatherv_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
     ccl_allgatherv_attr_impl_t(const ccl_allgatherv_attr_impl_t& src);
 
 private:
diff --git a/src/coll/ccl_allreduce_op_attr.hpp b/src/coll/ccl_allreduce_op_attr.hpp
index deb72c14a..911861734 100644
--- a/src/coll/ccl_allreduce_op_attr.hpp
+++ b/src/coll/ccl_allreduce_op_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 namespace ccl {
 
@@ -26,12 +26,11 @@ class ccl_allreduce_attr_impl_t : public ccl_operation_attr_impl_t {
     using base_t = ccl_operation_attr_impl_t;
 
     ccl_allreduce_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 
     using reduction_fn_traits_t =
-        details::ccl_api_type_attr_traits<allreduce_attr_id, allreduce_attr_id::reduction_fn>;
+        detail::ccl_api_type_attr_traits<allreduce_attr_id, allreduce_attr_id::reduction_fn>;
     typename reduction_fn_traits_t::return_type set_attribute_value(
         typename reduction_fn_traits_t::type val,
         const reduction_fn_traits_t& t);
diff --git a/src/coll/ccl_alltoall_op_attr.hpp b/src/coll/ccl_alltoall_op_attr.hpp
index cf575c8de..d1b58587d 100644
--- a/src/coll/ccl_alltoall_op_attr.hpp
+++ b/src/coll/ccl_alltoall_op_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 namespace ccl {
 
@@ -26,9 +26,8 @@ class ccl_alltoall_attr_impl_t : public ccl_operation_attr_impl_t {
     using base_t = ccl_operation_attr_impl_t;
 
     ccl_alltoall_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 } // namespace ccl
diff --git a/src/coll/ccl_alltoallv_op_attr.hpp b/src/coll/ccl_alltoallv_op_attr.hpp
index 019a2745f..b9a855bb2 100644
--- a/src/coll/ccl_alltoallv_op_attr.hpp
+++ b/src/coll/ccl_alltoallv_op_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 namespace ccl {
 
@@ -26,9 +26,8 @@ class ccl_alltoallv_attr_impl_t : public ccl_operation_attr_impl_t {
     using base_t = ccl_operation_attr_impl_t;
 
     ccl_alltoallv_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 } // namespace ccl
diff --git a/src/coll/ccl_barrier_attr.hpp b/src/coll/ccl_barrier_attr.hpp
index f5657711f..a3c7849a1 100644
--- a/src/coll/ccl_barrier_attr.hpp
+++ b/src/coll/ccl_barrier_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 
 namespace ccl {
@@ -26,8 +26,7 @@ class ccl_barrier_attr_impl_t : public ccl_operation_attr_impl_t {
     using base_t = ccl_operation_attr_impl_t;
 
     ccl_barrier_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 } // namespace ccl
diff --git a/src/coll/ccl_bcast_op_attr.hpp b/src/coll/ccl_bcast_op_attr.hpp
index 1a733a596..78db47ef1 100644
--- a/src/coll/ccl_bcast_op_attr.hpp
+++ b/src/coll/ccl_bcast_op_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 namespace ccl {
 
@@ -26,9 +26,8 @@ class ccl_broadcast_attr_impl_t : public ccl_operation_attr_impl_t {
     using base_t = ccl_operation_attr_impl_t;
 
     ccl_broadcast_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 };
 
 } // namespace ccl
diff --git a/src/coll/ccl_reduce_op_attr.hpp b/src/coll/ccl_reduce_op_attr.hpp
index 74a1dc767..825367ddb 100644
--- a/src/coll/ccl_reduce_op_attr.hpp
+++ b/src/coll/ccl_reduce_op_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 namespace ccl {
 
@@ -26,12 +26,11 @@ class ccl_reduce_attr_impl_t : public ccl_operation_attr_impl_t {
     using base_t = ccl_operation_attr_impl_t;
 
     ccl_reduce_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 
     using reduction_fn_traits_t =
-        details::ccl_api_type_attr_traits<reduce_attr_id, reduce_attr_id::reduction_fn>;
+        detail::ccl_api_type_attr_traits<reduce_attr_id, reduce_attr_id::reduction_fn>;
     typename reduction_fn_traits_t::return_type set_attribute_value(
         typename reduction_fn_traits_t::type val,
         const reduction_fn_traits_t& t);
diff --git a/src/coll/ccl_reduce_scatter_op_attr.hpp b/src/coll/ccl_reduce_scatter_op_attr.hpp
index 7590dfe46..5f40c76ab 100644
--- a/src/coll/ccl_reduce_scatter_op_attr.hpp
+++ b/src/coll/ccl_reduce_scatter_op_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 
 namespace ccl {
@@ -27,13 +27,12 @@ class ccl_reduce_scatter_attr_impl_t : public ccl_operation_attr_impl_t {
     using base_t = ccl_operation_attr_impl_t;
 
     ccl_reduce_scatter_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 
     using reduction_fn_traits_t =
-        details::ccl_api_type_attr_traits<reduce_scatter_attr_id,
-                                          reduce_scatter_attr_id::reduction_fn>;
+        detail::ccl_api_type_attr_traits<reduce_scatter_attr_id,
+                                         reduce_scatter_attr_id::reduction_fn>;
     typename reduction_fn_traits_t::return_type set_attribute_value(
         typename reduction_fn_traits_t::type val,
         const reduction_fn_traits_t& t);
diff --git a/src/coll/ccl_sparse_allreduce_op_attr.hpp b/src/coll/ccl_sparse_allreduce_op_attr.hpp
index 4abc53a3e..8ba49dae8 100644
--- a/src/coll/ccl_sparse_allreduce_op_attr.hpp
+++ b/src/coll/ccl_sparse_allreduce_op_attr.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 #include "coll/coll_common_attributes.hpp"
 namespace ccl {
 
@@ -26,13 +26,12 @@ class ccl_sparse_allreduce_attr_impl_t : public ccl_operation_attr_impl_t {
     using base_t = ccl_operation_attr_impl_t;
 
     ccl_sparse_allreduce_attr_impl_t(
-        const typename details::ccl_api_type_attr_traits<operation_attr_id,
-                                                         ccl::operation_attr_id::version>::type&
-            version);
+        const typename detail::ccl_api_type_attr_traits<operation_attr_id,
+                                                        operation_attr_id::version>::type& version);
 
     using sparse_allreduce_completion_fn_traits =
-        details::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
-                                          sparse_allreduce_attr_id::completion_fn>;
+        detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
+                                         sparse_allreduce_attr_id::completion_fn>;
     typename sparse_allreduce_completion_fn_traits::return_type set_attribute_value(
         typename sparse_allreduce_completion_fn_traits::type val,
         const sparse_allreduce_completion_fn_traits& t);
@@ -40,8 +39,8 @@ class ccl_sparse_allreduce_attr_impl_t : public ccl_operation_attr_impl_t {
         const sparse_allreduce_completion_fn_traits& id) const;
 
     using sparse_allreduce_alloc_fn_traits =
-        details::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
-                                          sparse_allreduce_attr_id::alloc_fn>;
+        detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
+                                         sparse_allreduce_attr_id::alloc_fn>;
     typename sparse_allreduce_alloc_fn_traits::return_type set_attribute_value(
         typename sparse_allreduce_alloc_fn_traits::type val,
         const sparse_allreduce_alloc_fn_traits& t);
@@ -49,8 +48,8 @@ class ccl_sparse_allreduce_attr_impl_t : public ccl_operation_attr_impl_t {
         const sparse_allreduce_alloc_fn_traits& id) const;
 
     using sparse_allreduce_fn_ctx_traits =
-        details::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
-                                          sparse_allreduce_attr_id::fn_ctx>;
+        detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
+                                         sparse_allreduce_attr_id::fn_ctx>;
     typename sparse_allreduce_fn_ctx_traits::return_type set_attribute_value(
         typename sparse_allreduce_fn_ctx_traits::type val,
         const sparse_allreduce_fn_ctx_traits& t);
@@ -58,8 +57,8 @@ class ccl_sparse_allreduce_attr_impl_t : public ccl_operation_attr_impl_t {
         const sparse_allreduce_fn_ctx_traits& id) const;
 
     using sparse_coalesce_mode_traits =
-        details::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
-                                          sparse_allreduce_attr_id::coalesce_mode>;
+        detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
+                                         sparse_allreduce_attr_id::coalesce_mode>;
     typename sparse_coalesce_mode_traits::return_type set_attribute_value(
         typename sparse_coalesce_mode_traits::type val,
         const sparse_coalesce_mode_traits& t);
diff --git a/src/coll/coll.cpp b/src/coll/coll.cpp
index 0d7cb03a6..86e202686 100644
--- a/src/coll/coll.cpp
+++ b/src/coll/coll.cpp
@@ -13,27 +13,23 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_aliases.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
 
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
 
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
 
-#include "common/event/event_internal/event_internal_attr_ids.hpp"
-#include "common/event/event_internal/event_internal_attr_ids_traits.hpp"
-#include "common/event/event_internal/event_internal.hpp"
-
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_stream.hpp"
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/stream.hpp"
 
 #include "common/request/request.hpp"
 
@@ -61,38 +57,13 @@
 #include "unordered_coll/unordered_coll.hpp"
 
 #define COPY_COMMON_OP_ATTRS(from, to) \
-    to->prologue_fn = from.get<ccl::operation_attr_id::prologue_fn>().get(); \
-    to->epilogue_fn = from.get<ccl::operation_attr_id::epilogue_fn>().get(); \
+    to->prologue_fn = nullptr; /*from.get<ccl::operation_attr_id::prologue_fn>().get();*/ \
+    to->epilogue_fn = nullptr; /*from.get<ccl::operation_attr_id::epilogue_fn>().get();*/ \
     to->priority = from.get<ccl::operation_attr_id::priority>(); \
     to->synchronous = from.get<ccl::operation_attr_id::synchronous>(); \
     to->to_cache = from.get<ccl::operation_attr_id::to_cache>(); \
     to->match_id = from.get<ccl::operation_attr_id::match_id>();
 
-ccl_coll_attr::ccl_coll_attr(const ccl_coll_attr_t* attr) {
-    *this = attr ?: ccl::global_data::get().default_coll_attr.get();
-}
-
-ccl_coll_attr& ccl_coll_attr::operator=(const ccl_coll_attr_t* attr) {
-    prologue_fn = attr->prologue_fn;
-    epilogue_fn = attr->epilogue_fn;
-    reduction_fn = attr->reduction_fn;
-    priority = attr->priority;
-    synchronous = attr->synchronous;
-    to_cache = attr->to_cache && attr->match_id && attr->match_id[0];
-    vector_buf = attr->vector_buf;
-    match_id = (attr->match_id ? attr->match_id : "");
-
-    sparse_allreduce_completion_fn = attr->sparse_allreduce_completion_fn;
-    sparse_allreduce_alloc_fn = attr->sparse_allreduce_alloc_fn;
-    sparse_allreduce_fn_ctx = attr->sparse_allreduce_fn_ctx;
-    sparse_coalesce_mode = attr->sparse_coalesce_mode;
-
-    if (to_cache != attr->to_cache)
-        LOG_INFO("collective caching is requested but no match_id is provided, disable caching");
-
-    return *this;
-}
-
 //TODO temporary solution for type convertation, ccl_coll_attr would be depreacated
 ccl_coll_attr::ccl_coll_attr(const ccl::allgatherv_attr& attr) {
     COPY_COMMON_OP_ATTRS(attr, this);
@@ -149,7 +120,8 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
     bool postpone_schedule = false;
     if (ccl::global_data::env().enable_unordered_coll) {
         if (!attr.match_id.empty()) {
-            auto comm = param.comm->unordered_coll_manager->get_comm(std::string(attr.match_id)).get();
+            auto comm =
+                param.comm->unordered_coll_manager->get_comm(std::string(attr.match_id)).get();
             if (!comm) {
                 if (attr.synchronous) {
                     CCL_THROW("unsupported collective (synchronous && unordered && !communicator)");
@@ -212,7 +184,8 @@ static ccl_request* ccl_gpu_coll_create(ccl_coll_param& param, const ccl_coll_at
     bool postpone_schedule = false;
     if (ccl::global_data::env().enable_unordered_coll) {
         if (!attr.match_id.empty()) {
-            auto comm = param.comm->unordered_coll_manager->get_comm(std::string(attr.match_id)).get();
+            auto comm =
+                param.comm->unordered_coll_manager->get_comm(std::string(attr.match_id)).get();
             if (!comm) {
                 if (attr.synchronous) {
                     CCL_THROW("unsupported collective (synchronous && unordered && !communicator)");
@@ -267,14 +240,14 @@ static ccl_request* ccl_gpu_coll_create(ccl_coll_param& param, const ccl_coll_at
     return request;
 }
 
-ccl_status_t ccl_coll_build_allgatherv(ccl_sched* sched,
-                                       ccl_buffer send_buf,
-                                       size_t send_count,
-                                       ccl_buffer recv_buf,
-                                       const size_t* recv_counts,
-                                       const ccl_datatype& dtype,
-                                       ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_allgatherv(ccl_sched* sched,
+                                      ccl_buffer send_buf,
+                                      size_t send_count,
+                                      ccl_buffer recv_buf,
+                                      const size_t* recv_counts,
+                                      const ccl_datatype& dtype,
+                                      ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_allgatherv;
@@ -300,19 +273,19 @@ ccl_status_t ccl_coll_build_allgatherv(ccl_sched* sched,
             break;
         default:
             CCL_FATAL("unexpected allgatherv_algo ", ccl_coll_algorithm_to_str(algo));
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
     return status;
 }
 
-ccl_status_t ccl_coll_build_allreduce(ccl_sched* sched,
-                                      ccl_buffer send_buf,
-                                      ccl_buffer recv_buf,
-                                      size_t count,
-                                      const ccl_datatype& dtype,
-                                      ccl::reduction reduction,
-                                      ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_allreduce(ccl_sched* sched,
+                                     ccl_buffer send_buf,
+                                     ccl_buffer recv_buf,
+                                     size_t count,
+                                     const ccl_datatype& dtype,
+                                     ccl::reduction reduction,
+                                     ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_allreduce;
@@ -360,23 +333,23 @@ ccl_status_t ccl_coll_build_allreduce(ccl_sched* sched,
             break;
         case ccl_coll_allreduce_2d:
             CCL_CALL(comm->allreduce_2d_builder->build(
-                     sched, send_buf, recv_buf, count, dtype, reduction));
+                sched, send_buf, recv_buf, count, dtype, reduction));
             break;
         default:
             CCL_FATAL("unexpected allreduce_algo ", ccl_coll_algorithm_to_str(algo));
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
 
     return status;
 }
 
-ccl_status_t ccl_coll_build_alltoall(ccl_sched* sched,
-                                     ccl_buffer send_buf,
-                                     ccl_buffer recv_buf,
-                                     size_t count,
-                                     const ccl_datatype& dtype,
-                                     ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_alltoall(ccl_sched* sched,
+                                    ccl_buffer send_buf,
+                                    ccl_buffer recv_buf,
+                                    size_t count,
+                                    const ccl_datatype& dtype,
+                                    ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_alltoall;
@@ -392,20 +365,20 @@ ccl_status_t ccl_coll_build_alltoall(ccl_sched* sched,
             break;
         default:
             CCL_FATAL("unexpected alltoall_algo ", ccl_coll_algorithm_to_str(algo));
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
 
     return status;
 }
 
-ccl_status_t ccl_coll_build_alltoallv(ccl_sched* sched,
-                                      ccl_buffer send_buf,
-                                      const size_t* send_counts,
-                                      ccl_buffer recv_buf,
-                                      const size_t* recv_counts,
-                                      const ccl_datatype& dtype,
-                                      ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_alltoallv(ccl_sched* sched,
+                                     ccl_buffer send_buf,
+                                     const size_t* send_counts,
+                                     ccl_buffer recv_buf,
+                                     const size_t* recv_counts,
+                                     const ccl_datatype& dtype,
+                                     ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_alltoallv;
@@ -421,19 +394,19 @@ ccl_status_t ccl_coll_build_alltoallv(ccl_sched* sched,
             break;
         default:
             CCL_FATAL("unexpected alltoallv_algo ", ccl_coll_algorithm_to_str(algo));
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
 
     return status;
 }
 
-ccl_status_t ccl_coll_build_barrier(ccl_sched* sched, ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_barrier(ccl_sched* sched, ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_barrier;
     param.count = 0;
-    param.dtype = ccl_datatype_char;
+    param.dtype = ccl_datatype_int8;
     param.comm = comm;
 
     auto algo = ccl::global_data::get().algorithm_selector->get<ccl_coll_barrier>(param);
@@ -445,19 +418,19 @@ ccl_status_t ccl_coll_build_barrier(ccl_sched* sched, ccl_comm* comm) {
             break;
         default:
             CCL_FATAL("unexpected barrier_algo ", ccl_coll_algorithm_to_str(algo));
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
 
     return status;
 }
 
-ccl_status_t ccl_coll_build_bcast(ccl_sched* sched,
-                                  ccl_buffer buf,
-                                  size_t count,
-                                  const ccl_datatype& dtype,
-                                  size_t root,
-                                  ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_bcast(ccl_sched* sched,
+                                 ccl_buffer buf,
+                                 size_t count,
+                                 const ccl_datatype& dtype,
+                                 int root,
+                                 ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_bcast;
@@ -492,20 +465,20 @@ ccl_status_t ccl_coll_build_bcast(ccl_sched* sched,
             break;
         default:
             CCL_FATAL("unexpected bcast_algo ", ccl_coll_algorithm_to_str(algo));
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
     return status;
 }
 
-ccl_status_t ccl_coll_build_reduce(ccl_sched* sched,
-                                   ccl_buffer send_buf,
-                                   ccl_buffer recv_buf,
-                                   size_t count,
-                                   const ccl_datatype& dtype,
-                                   ccl::reduction reduction,
-                                   size_t root,
-                                   ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_reduce(ccl_sched* sched,
+                                  ccl_buffer send_buf,
+                                  ccl_buffer recv_buf,
+                                  size_t count,
+                                  const ccl_datatype& dtype,
+                                  ccl::reduction reduction,
+                                  int root,
+                                  ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_reduce;
@@ -542,21 +515,21 @@ ccl_status_t ccl_coll_build_reduce(ccl_sched* sched,
             break;
         default:
             CCL_FATAL("unexpected reduce_algo ", ccl_coll_algorithm_to_str(algo));
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
 
     return status;
 }
 
-ccl_status_t ccl_coll_build_reduce_scatter(ccl_sched* sched,
-                                           ccl_buffer send_buf,
-                                           ccl_buffer recv_buf,
-                                           size_t count,
-                                           const ccl_datatype& dtype,
-                                           ccl::reduction reduction,
-                                           ccl_comm* comm,
-                                           bool from_allreduce) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_reduce_scatter(ccl_sched* sched,
+                                          ccl_buffer send_buf,
+                                          ccl_buffer recv_buf,
+                                          size_t count,
+                                          const ccl_datatype& dtype,
+                                          ccl::reduction reduction,
+                                          ccl_comm* comm,
+                                          bool from_allreduce) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_reduce_scatter;
@@ -568,51 +541,48 @@ ccl_status_t ccl_coll_build_reduce_scatter(ccl_sched* sched,
 
     switch (algo) {
         case ccl_coll_reduce_scatter_direct:
-            if (!from_allreduce)
-            {
+            if (!from_allreduce) {
                 CCL_CALL(ccl_coll_build_direct_reduce_scatter(
                     sched, send_buf, recv_buf, count, dtype, reduction, comm));
                 break;
             }
         case ccl_coll_reduce_scatter_ring:
-            if (from_allreduce)
-            {
+            if (from_allreduce) {
                 CCL_CALL(ccl_coll_build_ring_reduce_scatter(
                     sched, send_buf, recv_buf, count, dtype, reduction, comm));
             }
-            else
-            {
+            else {
                 CCL_CALL(ccl_coll_build_ring_reduce_scatter_block(
                     sched, send_buf, recv_buf, count, dtype, reduction, comm));
             }
             break;
         default:
             CCL_FATAL("unexpected reduce_scatter_algo ", ccl_coll_algorithm_to_str(algo));
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
 
     return status;
 }
 
-ccl_status_t ccl_coll_build_sparse_allreduce(ccl_sched* sched,
-                                             ccl_buffer send_ind_buf,
-                                             size_t send_ind_count,
-                                             ccl_buffer send_val_buf,
-                                             size_t send_val_count,
-                                             void** recv_ind_buf,
-                                             size_t* recv_ind_count,
-                                             void** recv_val_buf,
-                                             size_t* recv_val_count,
-                                             const ccl_datatype& index_dtype,
-                                             const ccl_datatype& value_dtype,
-                                             ccl::reduction reduction,
-                                             ccl_comm* comm) {
-    ccl_status_t status = ccl_status_success;
+ccl::status ccl_coll_build_sparse_allreduce(ccl_sched* sched,
+                                            ccl_buffer send_ind_buf,
+                                            size_t send_ind_count,
+                                            ccl_buffer send_val_buf,
+                                            size_t send_val_count,
+                                            void** recv_ind_buf,
+                                            size_t* recv_ind_count,
+                                            void** recv_val_buf,
+                                            size_t* recv_val_count,
+                                            const ccl_datatype& index_dtype,
+                                            const ccl_datatype& value_dtype,
+                                            ccl::reduction reduction,
+                                            ccl_comm* comm) {
+    ccl::status status = ccl::status::success;
 
     ccl_selector_param param;
     param.ctype = ccl_coll_sparse_allreduce;
     param.count = 0;
-    param.dtype = ccl_datatype_char;
+    param.dtype = ccl_datatype_int8;
     param.comm = comm;
     param.sparse_coalesce_mode = sched->coll_attr.sparse_coalesce_mode;
     param.sparse_allreduce_alloc_fn = sched->coll_attr.sparse_allreduce_alloc_fn;
@@ -653,16 +623,16 @@ ccl_status_t ccl_coll_build_sparse_allreduce(ccl_sched* sched,
                   send_ind_count,
                   ", values count = ",
                   send_val_count);
-        return ccl_status_invalid_arguments;
+        return ccl::status::invalid_arguments;
     }
 
     if (ccl::global_data::env().atl_transport == ccl_atl_mpi) {
         /*
             for now all sparse_allreduce algorithms
             may contains direct collective entries (allreduce/allgatherv)
-            which should be executed in strict_start_order mode
+            which should be executed in strict_order mode
         */
-        sched->strict_start_order = true;
+        sched->strict_order = true;
     }
 
     auto algo = ccl::global_data::get().algorithm_selector->get<ccl_coll_sparse_allreduce>(param);
@@ -692,23 +662,17 @@ ccl_status_t ccl_coll_build_sparse_allreduce(ccl_sched* sched,
               ccl_reduction_to_str(reduction));
 
     switch (index_dtype.idx()) {
-        case ccl::datatype::int8:
-            CCL_SPARSE_ALLREDUCE_SELECT_V_DTYPE(char, value_dtype, algo);
-            break;
         case ccl::datatype::int32:
-            CCL_SPARSE_ALLREDUCE_SELECT_V_DTYPE(int, value_dtype, algo);
+            CCL_SPARSE_ALLREDUCE_SELECT_V_DTYPE(int32_t, value_dtype, algo);
             break;
         case ccl::datatype::int64:
             CCL_SPARSE_ALLREDUCE_SELECT_V_DTYPE(int64_t, value_dtype, algo);
             break;
-        case ccl::datatype::uint64:
-            CCL_SPARSE_ALLREDUCE_SELECT_V_DTYPE(uint64_t, value_dtype, algo);
-            break;
         default:
             CCL_FATAL("index datatype ",
                       ccl::global_data::get().dtypes->name(index_dtype),
                       " is not supported yet");
-            return ccl_status_invalid_arguments;
+            return ccl::status::invalid_arguments;
     }
 
     return status;
@@ -838,7 +802,7 @@ void ccl_barrier_impl(ccl_comm* comm, const ccl_stream* stream) {
     ccl_coll_param param{};
 
     param.ctype = ccl_coll_barrier;
-    param.dtype = ccl_datatype_char;
+    param.dtype = ccl_datatype_int8;
     param.stream = stream;
     param.comm = comm;
 
@@ -858,7 +822,7 @@ void ccl_barrier_impl(ccl_comm* comm, const ccl_stream* stream) {
 ccl_request* ccl_broadcast_impl(void* buf,
                                 size_t count,
                                 ccl::datatype dtype,
-                                size_t root,
+                                int root,
                                 const ccl_coll_attr& attr,
                                 ccl_comm* comm,
                                 const ccl_stream* stream) {
@@ -882,7 +846,7 @@ ccl_request* ccl_reduce_impl(const void* send_buf,
                              size_t count,
                              ccl::datatype dtype,
                              ccl::reduction reduction,
-                             size_t root,
+                             int root,
                              const ccl_coll_attr& attr,
                              ccl_comm* comm,
                              const ccl_stream* stream) {
diff --git a/src/coll/coll.hpp b/src/coll/coll.hpp
index aca729971..69a8cc7ff 100644
--- a/src/coll/coll.hpp
+++ b/src/coll/coll.hpp
@@ -24,80 +24,82 @@
 
 #include "coll/coll_common_attributes.hpp"
 
+#include "internal_types.hpp"
+
 class ccl_sched;
 class ccl_request;
 
-ccl_status_t ccl_coll_build_allgatherv(ccl_sched* sched,
-                                       ccl_buffer send_buf,
-                                       size_t send_count,
-                                       ccl_buffer recv_buf,
-                                       const size_t* recv_counts,
-                                       const ccl_datatype& dtype,
-                                       ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_allreduce(ccl_sched* sched,
+ccl::status ccl_coll_build_allgatherv(ccl_sched* sched,
                                       ccl_buffer send_buf,
+                                      size_t send_count,
                                       ccl_buffer recv_buf,
-                                      size_t count,
+                                      const size_t* recv_counts,
                                       const ccl_datatype& dtype,
-                                      ccl::reduction reduction,
                                       ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_alltoall(ccl_sched* sched,
+ccl::status ccl_coll_build_allreduce(ccl_sched* sched,
                                      ccl_buffer send_buf,
                                      ccl_buffer recv_buf,
                                      size_t count,
                                      const ccl_datatype& dtype,
+                                     ccl::reduction reduction,
                                      ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_alltoallv(ccl_sched* sched,
-                                      ccl_buffer send_buf,
-                                      const size_t* send_counts,
-                                      ccl_buffer recv_buf,
-                                      const size_t* recv_counts,
-                                      const ccl_datatype& dtype,
-                                      ccl_comm* comm);
+ccl::status ccl_coll_build_alltoall(ccl_sched* sched,
+                                    ccl_buffer send_buf,
+                                    ccl_buffer recv_buf,
+                                    size_t count,
+                                    const ccl_datatype& dtype,
+                                    ccl_comm* comm);
+
+ccl::status ccl_coll_build_alltoallv(ccl_sched* sched,
+                                     ccl_buffer send_buf,
+                                     const size_t* send_counts,
+                                     ccl_buffer recv_buf,
+                                     const size_t* recv_counts,
+                                     const ccl_datatype& dtype,
+                                     ccl_comm* comm);
+
+ccl::status ccl_coll_build_barrier(ccl_sched* sched, ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_barrier(ccl_sched* sched, ccl_comm* comm);
+ccl::status ccl_coll_build_bcast(ccl_sched* sched,
+                                 ccl_buffer buf,
+                                 size_t count,
+                                 const ccl_datatype& dtype,
+                                 int root,
+                                 ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_bcast(ccl_sched* sched,
-                                  ccl_buffer buf,
+ccl::status ccl_coll_build_reduce(ccl_sched* sched,
+                                  ccl_buffer send_buf,
+                                  ccl_buffer recv_buf,
                                   size_t count,
                                   const ccl_datatype& dtype,
-                                  size_t root,
+                                  ccl::reduction reduction,
+                                  int root,
                                   ccl_comm* comm);
 
-ccl_status_t ccl_coll_build_reduce(ccl_sched* sched,
-                                   ccl_buffer send_buf,
-                                   ccl_buffer recv_buf,
-                                   size_t count,
-                                   const ccl_datatype& dtype,
-                                   ccl::reduction reduction,
-                                   size_t root,
-                                   ccl_comm* comm);
-
-ccl_status_t ccl_coll_build_reduce_scatter(ccl_sched* sched,
-                                           ccl_buffer send_buf,
-                                           ccl_buffer recv_buf,
-                                           size_t count,
-                                           const ccl_datatype& dtype,
-                                           ccl::reduction reduction,
-                                           ccl_comm* comm,
-                                           bool from_allreduce = false);
-
-ccl_status_t ccl_coll_build_sparse_allreduce(ccl_sched* sched,
-                                             ccl_buffer send_ind_buf,
-                                             size_t send_ind_count,
-                                             ccl_buffer send_val_buf,
-                                             size_t send_val_count,
-                                             void** recv_ind_buf,
-                                             size_t* recv_ind_count,
-                                             void** recv_val_buf,
-                                             size_t* recv_val_count,
-                                             const ccl_datatype& index_dtype,
-                                             const ccl_datatype& value_dtype,
-                                             ccl::reduction reduction,
-                                             ccl_comm* comm);
+ccl::status ccl_coll_build_reduce_scatter(ccl_sched* sched,
+                                          ccl_buffer send_buf,
+                                          ccl_buffer recv_buf,
+                                          size_t count,
+                                          const ccl_datatype& dtype,
+                                          ccl::reduction reduction,
+                                          ccl_comm* comm,
+                                          bool from_allreduce = false);
+
+ccl::status ccl_coll_build_sparse_allreduce(ccl_sched* sched,
+                                            ccl_buffer send_ind_buf,
+                                            size_t send_ind_count,
+                                            ccl_buffer send_val_buf,
+                                            size_t send_val_count,
+                                            void** recv_ind_buf,
+                                            size_t* recv_ind_count,
+                                            void** recv_val_buf,
+                                            size_t* recv_val_count,
+                                            const ccl_datatype& index_dtype,
+                                            const ccl_datatype& value_dtype,
+                                            ccl::reduction reduction,
+                                            ccl_comm* comm);
 
 ccl_request* ccl_allgatherv_impl(const void* send_buf,
                                  size_t send_count,
@@ -148,7 +150,7 @@ void ccl_barrier_impl(ccl_comm* comm, const ccl_stream* stream);
 ccl_request* ccl_broadcast_impl(void* buf,
                                 size_t count,
                                 ccl::datatype dtype,
-                                size_t root,
+                                int root,
                                 const ccl_coll_attr& attr,
                                 ccl_comm* comm,
                                 const ccl_stream* stream);
@@ -158,7 +160,7 @@ ccl_request* ccl_reduce_impl(const void* send_buf,
                              size_t count,
                              ccl::datatype dtype,
                              ccl::reduction reduction,
-                             size_t root,
+                             int root,
                              const ccl_coll_attr& attr,
                              ccl_comm* comm,
                              const ccl_stream* stream);
diff --git a/src/coll/coll_common_attributes.cpp b/src/coll/coll_common_attributes.cpp
index 943d51577..aedd96df1 100644
--- a/src/coll/coll_common_attributes.cpp
+++ b/src/coll/coll_common_attributes.cpp
@@ -35,36 +35,36 @@ ccl_operation_attr_impl_t::get_attribute_value(const version_traits_t& id) const
     return version;
 }
 
-/**
- * `prologue_fn` operations definitions
- */
-const typename ccl_operation_attr_impl_t::prologue_fn_traits_t::return_type&
-ccl_operation_attr_impl_t::get_attribute_value(const prologue_fn_traits_t& id) const {
-    return prologue_fn;
-}
+// /**
+//  * `prologue_fn` operations definitions
+//  */
+// const typename ccl_operation_attr_impl_t::prologue_fn_traits_t::return_type&
+// ccl_operation_attr_impl_t::get_attribute_value(const prologue_fn_traits_t& id) const {
+//     return prologue_fn;
+// }
 
-typename ccl_operation_attr_impl_t::prologue_fn_traits_t::return_type
-ccl_operation_attr_impl_t::set_attribute_value(typename prologue_fn_traits_t::type val,
-                                               const prologue_fn_traits_t& t) {
-    auto old = prologue_fn.get();
-    prologue_fn = typename prologue_fn_traits_t::return_type{ val };
-    return typename prologue_fn_traits_t::return_type{ old };
-}
-/**
- * `epilogue_fn` operations definitions
- */
-const typename ccl_operation_attr_impl_t::epilogue_fn_traits_t::return_type&
-ccl_operation_attr_impl_t::get_attribute_value(const epilogue_fn_traits_t& id) const {
-    return epilogue_fn;
-}
+// typename ccl_operation_attr_impl_t::prologue_fn_traits_t::return_type
+// ccl_operation_attr_impl_t::set_attribute_value(typename prologue_fn_traits_t::type val,
+//                                                const prologue_fn_traits_t& t) {
+//     auto old = prologue_fn.get();
+//     prologue_fn = typename prologue_fn_traits_t::return_type{ val };
+//     return typename prologue_fn_traits_t::return_type{ old };
+// }
+// /**
+//  * `epilogue_fn` operations definitions
+//  */
+// const typename ccl_operation_attr_impl_t::epilogue_fn_traits_t::return_type&
+// ccl_operation_attr_impl_t::get_attribute_value(const epilogue_fn_traits_t& id) const {
+//     return epilogue_fn;
+// }
 
-typename ccl_operation_attr_impl_t::epilogue_fn_traits_t::return_type
-ccl_operation_attr_impl_t::set_attribute_value(typename epilogue_fn_traits_t::type val,
-                                               const epilogue_fn_traits_t& t) {
-    auto old = epilogue_fn.get();
-    epilogue_fn = typename epilogue_fn_traits_t::return_type{ val };
-    return typename epilogue_fn_traits_t::return_type{ old };
-}
+// typename ccl_operation_attr_impl_t::epilogue_fn_traits_t::return_type
+// ccl_operation_attr_impl_t::set_attribute_value(typename epilogue_fn_traits_t::type val,
+//                                                const epilogue_fn_traits_t& t) {
+//     auto old = epilogue_fn.get();
+//     epilogue_fn = typename epilogue_fn_traits_t::return_type{ val };
+//     return typename epilogue_fn_traits_t::return_type{ old };
+// }
 
 /**
  * `priority` operations definitions
diff --git a/src/coll/coll_common_attributes.hpp b/src/coll/coll_common_attributes.hpp
index f5ce48873..36a17434a 100644
--- a/src/coll/coll_common_attributes.hpp
+++ b/src/coll/coll_common_attributes.hpp
@@ -14,10 +14,10 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
 
 namespace ccl {
 struct ccl_operation_attr_impl_t {
@@ -26,42 +26,42 @@ struct ccl_operation_attr_impl_t {
      * `version` operations
      */
     using version_traits_t =
-        details::ccl_api_type_attr_traits<operation_attr_id, ccl::operation_attr_id::version>;
+        detail::ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::version>;
     const typename version_traits_t::return_type& get_attribute_value(
         const version_traits_t& id) const;
 
     typename version_traits_t::return_type set_attribute_value(typename version_traits_t::type val,
                                                                const version_traits_t& t);
 
-    /**
-     * `prologue_fn` operations
-     */
-    using prologue_fn_traits_t =
-        details::ccl_api_type_attr_traits<operation_attr_id, ccl::operation_attr_id::prologue_fn>;
-    const typename prologue_fn_traits_t::return_type& get_attribute_value(
-        const prologue_fn_traits_t& id) const;
-
-    typename prologue_fn_traits_t::return_type set_attribute_value(
-        typename prologue_fn_traits_t::type val,
-        const prologue_fn_traits_t& t);
-
-    /**
-     * `epilogue_fn` operations
-     */
-    using epilogue_fn_traits_t =
-        details::ccl_api_type_attr_traits<operation_attr_id, ccl::operation_attr_id::epilogue_fn>;
-    const typename epilogue_fn_traits_t::return_type& get_attribute_value(
-        const epilogue_fn_traits_t& id) const;
-
-    typename epilogue_fn_traits_t::return_type set_attribute_value(
-        typename epilogue_fn_traits_t::type val,
-        const epilogue_fn_traits_t& t);
+    // /**
+    //  * `prologue_fn` operations
+    //  */
+    // using prologue_fn_traits_t =
+    //     detail::ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::prologue_fn>;
+    // const typename prologue_fn_traits_t::return_type& get_attribute_value(
+    //     const prologue_fn_traits_t& id) const;
+
+    // typename prologue_fn_traits_t::return_type set_attribute_value(
+    //     typename prologue_fn_traits_t::type val,
+    //     const prologue_fn_traits_t& t);
+
+    // /**
+    //  * `epilogue_fn` operations
+    //  */
+    // using epilogue_fn_traits_t =
+    //     detail::ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::epilogue_fn>;
+    // const typename epilogue_fn_traits_t::return_type& get_attribute_value(
+    //     const epilogue_fn_traits_t& id) const;
+
+    // typename epilogue_fn_traits_t::return_type set_attribute_value(
+    //     typename epilogue_fn_traits_t::type val,
+    //     const epilogue_fn_traits_t& t);
 
     /**
      * `priority` operations
      */
     using priority_traits_t =
-        details::ccl_api_type_attr_traits<operation_attr_id, ccl::operation_attr_id::priority>;
+        detail::ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::priority>;
     const typename priority_traits_t::return_type& get_attribute_value(
         const priority_traits_t& id) const;
 
@@ -73,7 +73,7 @@ struct ccl_operation_attr_impl_t {
      * `synchronous` operations
      */
     using synchronous_traits_t =
-        details::ccl_api_type_attr_traits<operation_attr_id, ccl::operation_attr_id::synchronous>;
+        detail::ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::synchronous>;
     const typename synchronous_traits_t::return_type& get_attribute_value(
         const synchronous_traits_t& id) const;
 
@@ -85,7 +85,7 @@ struct ccl_operation_attr_impl_t {
      * `to_cache` operations
      */
     using to_cache_traits_t =
-        details::ccl_api_type_attr_traits<operation_attr_id, ccl::operation_attr_id::to_cache>;
+        detail::ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::to_cache>;
     const typename to_cache_traits_t::return_type& get_attribute_value(
         const to_cache_traits_t& id) const;
 
@@ -97,7 +97,7 @@ struct ccl_operation_attr_impl_t {
      * `match_id` operations
      */
     using match_id_traits_t =
-        details::ccl_api_type_attr_traits<operation_attr_id, ccl::operation_attr_id::match_id>;
+        detail::ccl_api_type_attr_traits<operation_attr_id, operation_attr_id::match_id>;
     const typename match_id_traits_t::return_type& get_attribute_value(
         const match_id_traits_t& id) const;
 
@@ -105,8 +105,8 @@ struct ccl_operation_attr_impl_t {
         typename match_id_traits_t::type val,
         const match_id_traits_t& t);
 
-    typename ccl_operation_attr_impl_t::prologue_fn_traits_t::return_type prologue_fn{};
-    typename ccl_operation_attr_impl_t::epilogue_fn_traits_t::return_type epilogue_fn{};
+    // typename ccl_operation_attr_impl_t::prologue_fn_traits_t::return_type prologue_fn{};
+    // typename ccl_operation_attr_impl_t::epilogue_fn_traits_t::return_type epilogue_fn{};
 
     /* Priority for collective operation */
     size_t priority = 0;
diff --git a/src/coll/coll_param.hpp b/src/coll/coll_param.hpp
index e4a738dd7..6ccc95a4d 100644
--- a/src/coll/coll_param.hpp
+++ b/src/coll/coll_param.hpp
@@ -18,29 +18,29 @@
 #include "coll/algorithms/algorithms_enum.hpp"
 #include "common/datatype/datatype.hpp"
 
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_stream.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/stream.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 
 class ccl_comm;
 
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl.hpp>
-typedef cl::sycl::buffer<char, 1> ccl_sycl_buffer_t;
+using ccl_sycl_buffer_t = cl::sycl::buffer<int8_t, 1>;
 
 template <class native_type>
 using ccl_sycl_typed_buffer_t = cl::sycl::buffer<native_type, 1>;
 
 /* ordering should be aligned with ccl::datatype */
-using ccl_sycle_buffer_one_dim_types = std::tuple<ccl_sycl_typed_buffer_t<char>,
-                                                  ccl_sycl_typed_buffer_t<unsigned char>,
+using ccl_sycle_buffer_one_dim_types = std::tuple<ccl_sycl_typed_buffer_t<int8_t>,
+                                                  ccl_sycl_typed_buffer_t<uint8_t>,
                                                   ccl_sycl_typed_buffer_t<int16_t>,
                                                   ccl_sycl_typed_buffer_t<uint16_t>,
-                                                  ccl_sycl_typed_buffer_t<int>,
+                                                  ccl_sycl_typed_buffer_t<int32_t>,
                                                   ccl_sycl_typed_buffer_t<uint32_t>,
                                                   ccl_sycl_typed_buffer_t<int64_t>,
                                                   ccl_sycl_typed_buffer_t<uint64_t>,
@@ -56,8 +56,6 @@ struct ccl_coll_attr {
     ccl_coll_attr() = default;
     ccl_coll_attr(const ccl_coll_attr&) = default;
     ccl_coll_attr& operator=(const ccl_coll_attr&) = default;
-    ccl_coll_attr(const ccl_coll_attr_t* attr);
-    ccl_coll_attr& operator=(const ccl_coll_attr_t* attr);
 
     //TODO temporary solution for type convertation, ccl_coll_attr would be depreacated
     ccl_coll_attr(const ccl::allgatherv_attr& attr);
@@ -112,7 +110,7 @@ struct ccl_coll_param {
     const size_t* recv_counts;
     ccl_datatype dtype;
     ccl::reduction reduction;
-    size_t root;
+    int root;
     const ccl_stream* stream;
     ccl_comm* comm;
     ccl_coll_sparse_param sparse_param;
diff --git a/src/coll/selection/selector_allgatherv.cpp b/src/coll/selection/selector_allgatherv.cpp
index 34ea1bf24..e22b280cb 100644
--- a/src/coll/selection/selector_allgatherv.cpp
+++ b/src/coll/selection/selector_allgatherv.cpp
@@ -71,7 +71,7 @@ CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_allgatherv_algo,
                                     ({
                                         CCL_ASSERT(param.recv_counts);
                                         size_t count = 0;
-                                        for (size_t idx = 0; idx < param.comm->size(); idx++) {
+                                        for (int idx = 0; idx < param.comm->size(); idx++) {
                                             count += param.recv_counts[idx];
                                         }
                                         count /= param.comm->size();
diff --git a/src/coll/selection/selector_allreduce.cpp b/src/coll/selection/selector_allreduce.cpp
index 7536148e4..ae8c8ce74 100644
--- a/src/coll/selection/selector_allreduce.cpp
+++ b/src/coll/selection/selector_allreduce.cpp
@@ -59,10 +59,9 @@ bool ccl_algorithm_selector_helper<ccl_coll_allreduce_algo>::can_use(
     const ccl_selection_table_t<ccl_coll_allreduce_algo>& table) {
     bool can_use = true;
 
-    if (algo == ccl_coll_allreduce_rabenseifner && param.count < param.comm->pof2())
+    if (algo == ccl_coll_allreduce_rabenseifner && param.count < (size_t)param.comm->pof2())
         can_use = false;
-    else if (algo == ccl_coll_allreduce_ring_rma &&
-             !atl_wrapper::attr.enable_rma)
+    else if (algo == ccl_coll_allreduce_ring_rma && !atl_wrapper::attr.enable_rma)
         can_use = false;
     else if (algo == ccl_coll_allreduce_starlike && !(param.count / param.comm->size()))
         can_use = false;
diff --git a/src/coll/selection/selector_alltoall.cpp b/src/coll/selection/selector_alltoall.cpp
index a544a1d70..70e92a7e9 100644
--- a/src/coll/selection/selector_alltoall.cpp
+++ b/src/coll/selection/selector_alltoall.cpp
@@ -52,8 +52,7 @@ bool ccl_algorithm_selector_helper<ccl_coll_alltoall_algo>::can_use(
     const ccl_selection_table_t<ccl_coll_alltoall_algo>& table) {
     bool can_use = true;
 
-    if (algo == ccl_coll_alltoall_direct &&
-        (ccl::global_data::env().atl_transport == ccl_atl_ofi))
+    if (algo == ccl_coll_alltoall_direct && (ccl::global_data::env().atl_transport == ccl_atl_ofi))
         can_use = false;
 
     return can_use;
diff --git a/src/coll/selection/selector_alltoallv.cpp b/src/coll/selection/selector_alltoallv.cpp
index 65723c7cd..e09d1fcfb 100644
--- a/src/coll/selection/selector_alltoallv.cpp
+++ b/src/coll/selection/selector_alltoallv.cpp
@@ -53,8 +53,7 @@ bool ccl_algorithm_selector_helper<ccl_coll_alltoallv_algo>::can_use(
     const ccl_selection_table_t<ccl_coll_alltoallv_algo>& table) {
     bool can_use = true;
 
-    if (algo == ccl_coll_alltoallv_direct &&
-        (ccl::global_data::env().atl_transport == ccl_atl_ofi))
+    if (algo == ccl_coll_alltoallv_direct && (ccl::global_data::env().atl_transport == ccl_atl_ofi))
         can_use = false;
 
     return can_use;
diff --git a/src/coll/selection/selector_barrier.cpp b/src/coll/selection/selector_barrier.cpp
index c7f1c23e4..cf47cd7e1 100644
--- a/src/coll/selection/selector_barrier.cpp
+++ b/src/coll/selection/selector_barrier.cpp
@@ -43,8 +43,7 @@ bool ccl_algorithm_selector_helper<ccl_coll_barrier_algo>::can_use(
     const ccl_selection_table_t<ccl_coll_barrier_algo>& table) {
     bool can_use = true;
 
-    if (algo == ccl_coll_barrier_direct &&
-        (ccl::global_data::env().atl_transport == ccl_atl_ofi))
+    if (algo == ccl_coll_barrier_direct && (ccl::global_data::env().atl_transport == ccl_atl_ofi))
         can_use = false;
 
     return can_use;
diff --git a/src/coll/selection/selector_bcast.cpp b/src/coll/selection/selector_bcast.cpp
index 578b9a0fd..aecd3e985 100644
--- a/src/coll/selection/selector_bcast.cpp
+++ b/src/coll/selection/selector_bcast.cpp
@@ -52,7 +52,7 @@ bool ccl_algorithm_selector_helper<ccl_coll_bcast_algo>::can_use(
         can_use = false;
     }
     else if (algo == ccl_coll_bcast_direct &&
-        (ccl::global_data::env().atl_transport == ccl_atl_ofi))
+             (ccl::global_data::env().atl_transport == ccl_atl_ofi))
         can_use = false;
 
     return can_use;
diff --git a/src/coll/selection/selector_reduce.cpp b/src/coll/selection/selector_reduce.cpp
index e0276a268..68cbe0ca9 100644
--- a/src/coll/selection/selector_reduce.cpp
+++ b/src/coll/selection/selector_reduce.cpp
@@ -45,10 +45,10 @@ bool ccl_algorithm_selector_helper<ccl_coll_reduce_algo>::can_use(
     const ccl_selection_table_t<ccl_coll_reduce_algo>& table) {
     bool can_use = true;
 
-    if (algo == ccl_coll_reduce_rabenseifner && param.count < param.comm->pof2())
+    if (algo == ccl_coll_reduce_rabenseifner && param.count < (size_t)param.comm->pof2())
         can_use = false;
     else if (algo == ccl_coll_reduce_direct &&
-        (ccl::global_data::env().atl_transport == ccl_atl_ofi))
+             (ccl::global_data::env().atl_transport == ccl_atl_ofi))
         can_use = false;
 
     return can_use;
diff --git a/src/coll_attr_creation_impl.hpp b/src/coll_attr_creation_impl.hpp
index 9a359c4c6..a20a1847e 100644
--- a/src/coll_attr_creation_impl.hpp
+++ b/src/coll_attr_creation_impl.hpp
@@ -14,26 +14,26 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 #include "coll/coll_attributes.hpp"
+#include "common/utils/version.hpp"
 
 namespace ccl {
-/* TODO temporary function for UT compilation: would be part of ccl::environment in final*/
+
+namespace v1 {
+
+/* TODO temporary function for UT compilation: would be part of ccl::detail::environment in final*/
 template <class coll_attribute_type, class... attr_value_pair_t>
 coll_attribute_type create_coll_attr(attr_value_pair_t&&... avps) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    auto coll_attr = coll_attribute_type(ret);
+    auto version = utils::get_library_version();
+    auto coll_attr = coll_attribute_type(version);
 
     int expander[]{ (coll_attr.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
     (void)expander;
     return coll_attr;
 }
+
+} // namespace v1
+
 } // namespace ccl
diff --git a/src/coll_attr_impl.hpp b/src/coll_attr_impl.hpp
index 5b5c0dfe2..7005fd386 100644
--- a/src/coll_attr_impl.hpp
+++ b/src/coll_attr_impl.hpp
@@ -14,42 +14,44 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 #include "coll/coll_attributes.hpp"
 
 namespace ccl {
 
+namespace v1 {
+
 template<allgatherv_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>::return_type allgatherv_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>::return_type allgatherv_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type allgatherv_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type allgatherv_attr::set(const Value& v)
 {
     return static_cast<ccl_operation_attr_impl_t*>(get_impl().get())
-        ->set_attribute_value(v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->set_attribute_value(v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <allgatherv_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>::return_type&
 allgatherv_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<allgatherv_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 allgatherv_attr::get() const {
     return static_cast<const ccl_operation_attr_impl_t*>(get_impl().get())
-        ->get_attribute_value(details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->get_attribute_value(detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 /**
@@ -58,33 +60,33 @@ allgatherv_attr::get() const {
 template<allreduce_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<allreduce_attr_id, attrId>::return_type allreduce_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<allreduce_attr_id, attrId>::return_type allreduce_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<allreduce_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<allreduce_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type allreduce_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type allreduce_attr::set(const Value& v)
 {
     return static_cast<ccl_operation_attr_impl_t*>(get_impl().get())
-        ->set_attribute_value(v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->set_attribute_value(v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <allreduce_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<allreduce_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<allreduce_attr_id, attrId>::return_type&
 allreduce_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<allreduce_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<allreduce_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 allreduce_attr::get() const {
     return static_cast<const ccl_operation_attr_impl_t*>(get_impl().get())
-        ->get_attribute_value(details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->get_attribute_value(detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 /**
@@ -93,33 +95,33 @@ allreduce_attr::get() const {
 template<alltoall_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<alltoall_attr_id, attrId>::return_type alltoall_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<alltoall_attr_id, attrId>::return_type alltoall_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<alltoall_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<alltoall_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type alltoall_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type alltoall_attr::set(const Value& v)
 {
     return static_cast<ccl_operation_attr_impl_t*>(get_impl().get())
-        ->set_attribute_value(v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->set_attribute_value(v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <alltoall_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<alltoall_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<alltoall_attr_id, attrId>::return_type&
 alltoall_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<alltoall_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<alltoall_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 alltoall_attr::get() const {
     return static_cast<const ccl_operation_attr_impl_t*>(get_impl().get())
-        ->get_attribute_value(details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->get_attribute_value(detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 /**
@@ -128,33 +130,33 @@ alltoall_attr::get() const {
 template<alltoallv_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>::return_type alltoallv_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>::return_type alltoallv_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type alltoallv_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type alltoallv_attr::set(const Value& v)
 {
     return static_cast<ccl_operation_attr_impl_t*>(get_impl().get())
-        ->set_attribute_value(v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->set_attribute_value(v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <alltoallv_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>::return_type&
 alltoallv_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<alltoallv_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 alltoallv_attr::get() const {
     return static_cast<const ccl_operation_attr_impl_t*>(get_impl().get())
-        ->get_attribute_value(details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->get_attribute_value(detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 /**
@@ -163,33 +165,33 @@ alltoallv_attr::get() const {
 template<broadcast_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<broadcast_attr_id, attrId>::return_type broadcast_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<broadcast_attr_id, attrId>::return_type broadcast_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<broadcast_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<broadcast_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type broadcast_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type broadcast_attr::set(const Value& v)
 {
     return static_cast<ccl_operation_attr_impl_t*>(get_impl().get())
-        ->set_attribute_value(v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->set_attribute_value(v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <broadcast_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<broadcast_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<broadcast_attr_id, attrId>::return_type&
 broadcast_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<broadcast_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<broadcast_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 broadcast_attr::get() const {
     return static_cast<const ccl_operation_attr_impl_t*>(get_impl().get())
-        ->get_attribute_value(details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->get_attribute_value(detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 /**
@@ -198,33 +200,33 @@ broadcast_attr::get() const {
 template<reduce_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<reduce_attr_id, attrId>::return_type reduce_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<reduce_attr_id, attrId>::return_type reduce_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<reduce_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<reduce_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type reduce_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type reduce_attr::set(const Value& v)
 {
     return static_cast<ccl_operation_attr_impl_t*>(get_impl().get())
-        ->set_attribute_value(v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->set_attribute_value(v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <reduce_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<reduce_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<reduce_attr_id, attrId>::return_type&
 reduce_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<reduce_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<reduce_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 reduce_attr::get() const {
     return static_cast<const ccl_operation_attr_impl_t*>(get_impl().get())
-        ->get_attribute_value(details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->get_attribute_value(detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 /**
@@ -233,34 +235,34 @@ reduce_attr::get() const {
 template<reduce_scatter_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>::return_type reduce_scatter_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>::return_type reduce_scatter_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type reduce_scatter_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type reduce_scatter_attr::set(const Value& v)
 {
     return static_cast<ccl_operation_attr_impl_t*>(get_impl().get())
-        ->set_attribute_value(v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->set_attribute_value(v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <reduce_scatter_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<reduce_scatter_attr_id,
-                                                         attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<reduce_scatter_attr_id,
+                                                        attrId>::return_type&
 reduce_scatter_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<reduce_scatter_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 reduce_scatter_attr::get() const {
     return static_cast<const ccl_operation_attr_impl_t*>(get_impl().get())
-        ->get_attribute_value(details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->get_attribute_value(detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 /**
@@ -269,34 +271,34 @@ reduce_scatter_attr::get() const {
 template<sparse_allreduce_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>::return_type sparse_allreduce_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>::return_type sparse_allreduce_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type sparse_allreduce_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type sparse_allreduce_attr::set(const Value& v)
 {
     return static_cast<ccl_operation_attr_impl_t*>(get_impl().get())
-        ->set_attribute_value(v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->set_attribute_value(v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <sparse_allreduce_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
-                                                         attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id,
+                                                        attrId>::return_type&
 sparse_allreduce_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<sparse_allreduce_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 sparse_allreduce_attr::get() const {
     return static_cast<const ccl_operation_attr_impl_t*>(get_impl().get())
-        ->get_attribute_value(details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        ->get_attribute_value(detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 /**
@@ -305,32 +307,35 @@ sparse_allreduce_attr::get() const {
 template<barrier_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<barrier_attr_id, attrId>::return_type barrier_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<barrier_attr_id, attrId>::return_type barrier_attr::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<barrier_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<barrier_attr_id, attrId>{});
 }
 
 template<operation_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type barrier_attr::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type barrier_attr::set(const Value& v)
 {
     return get_impl().get()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
 
 template <barrier_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<barrier_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<barrier_attr_id, attrId>::return_type&
 barrier_attr::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<barrier_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<barrier_attr_id, attrId>{});
 }
 
 template <operation_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<operation_attr_id, attrId>::return_type&
 barrier_attr::get() const {
     return get_impl().get()->get_attribute_value(
-        details::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<operation_attr_id, attrId>{});
 }
+
+} // namespace v1
+
 } // namespace ccl
diff --git a/src/comm_attr_impl.hpp b/src/comm_attr_impl.hpp
new file mode 100644
index 000000000..0f72e8007
--- /dev/null
+++ b/src/comm_attr_impl.hpp
@@ -0,0 +1,47 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/comm_attr.hpp"
+
+namespace ccl {
+
+namespace v1 {
+
+/**
+ * comm_attr set/get/is_valid definitions: each forwards to the object returned by get_impl()
+ */
+template <comm_attr_id attrId, class Value>
+CCL_API Value comm_attr::set(const Value& v) {
+    return get_impl()->set_attribute_value(
+        v, detail::ccl_api_type_attr_traits<comm_attr_id, attrId>{});
+}
+
+template <comm_attr_id attrId>
+CCL_API const typename detail::ccl_api_type_attr_traits<comm_attr_id, attrId>::type&
+comm_attr::get() const {
+    return get_impl()->get_attribute_value(
+        detail::ccl_api_type_attr_traits<comm_attr_id, attrId>{});
+}
+
+template <comm_attr_id attrId>
+CCL_API bool comm_attr::is_valid() const noexcept {
+    return get_impl()->is_valid<attrId>();
+}
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/comm_split_attr_impl.hpp b/src/comm_split_attr_impl.hpp
index 9ef132211..b9e3db667 100644
--- a/src/comm_split_attr_impl.hpp
+++ b/src/comm_split_attr_impl.hpp
@@ -13,31 +13,35 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
-
-namespace ccl {
-
-/**
- * comm_split_attr attributes definition
- */
-template <comm_split_attr_id attrId, class Value>
-CCL_API Value comm_split_attr::set(const Value& v) {
-    return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<comm_split_attr_id, attrId>{});
-}
-
-template <comm_split_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<comm_split_attr_id, attrId>::type&
-comm_split_attr::get() const {
-    return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<comm_split_attr_id, attrId>{});
-}
-
-template <comm_split_attr_id attrId>
-CCL_API bool comm_split_attr::is_valid() const noexcept {
-    return get_impl()->is_valid<attrId>();
-}
-
-} // namespace ccl
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
+
+namespace ccl {
+
+namespace v1 {
+
+/**
+ * comm_split_attr set/get/is_valid definitions: each forwards to the object from get_impl()
+ */
+template <comm_split_attr_id attrId, class Value>
+CCL_API Value comm_split_attr::set(const Value& v) {
+    return get_impl()->set_attribute_value(
+        v, detail::ccl_api_type_attr_traits<comm_split_attr_id, attrId>{});
+}
+
+template <comm_split_attr_id attrId>
+CCL_API const typename detail::ccl_api_type_attr_traits<comm_split_attr_id, attrId>::type&
+comm_split_attr::get() const {
+    return get_impl()->get_attribute_value(
+        detail::ccl_api_type_attr_traits<comm_split_attr_id, attrId>{});
+}
+
+template <comm_split_attr_id attrId>
+CCL_API bool comm_split_attr::is_valid() const noexcept {
+    return get_impl()->is_valid<attrId>();
+}
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/common/comm/atl_tag.cpp b/src/common/comm/atl_tag.cpp
index 794290c9a..eea162428 100644
--- a/src/common/comm/atl_tag.cpp
+++ b/src/common/comm/atl_tag.cpp
@@ -30,7 +30,7 @@ void ccl_atl_tag::print() {
 }
 
 uint64_t ccl_atl_tag::create(ccl_comm_id_t comm_id,
-                             size_t rank,
+                             int rank,
                              ccl_sched_id_t sched_id,
                              ccl_op_id_t op_id) {
     uint64_t tag = 0;
diff --git a/src/common/comm/atl_tag.hpp b/src/common/comm/atl_tag.hpp
index 5a5ae0348..ea4570121 100644
--- a/src/common/comm/atl_tag.hpp
+++ b/src/common/comm/atl_tag.hpp
@@ -48,7 +48,7 @@ class ccl_atl_tag {
      * @param op_id local operation ID. Used to generate unique ATL tag when the rest of input parameters do not change
      * @return ATL communication tag
      */
-    uint64_t create(ccl_comm_id_t comm_id, size_t rank, ccl_sched_id_t sched_id, ccl_op_id_t op_id);
+    uint64_t create(ccl_comm_id_t comm_id, int rank, ccl_sched_id_t sched_id, ccl_op_id_t op_id);
 
 private:
     /**********************************************************************************
diff --git a/src/common/comm/comm.cpp b/src/common/comm/comm.cpp
index c5290f1ce..b70ffdff4 100644
--- a/src/common/comm/comm.cpp
+++ b/src/common/comm/comm.cpp
@@ -18,11 +18,10 @@
 #include "common/comm/comm.hpp"
 #include "common/global/global.hpp"
 #include "sched/sched.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_kvs.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/kvs.hpp"
 
-void ccl_comm::allocate_resources()
-{
+void ccl_comm::allocate_resources() {
     if (ccl::global_data::env().enable_unordered_coll) {
         unordered_coll_manager =
             std::unique_ptr<ccl_unordered_coll_manager>(new ccl_unordered_coll_manager(*this));
@@ -30,27 +29,26 @@ void ccl_comm::allocate_resources()
 
     auto& env_object = ccl::global_data::env();
 
-    allreduce_2d_builder = std::unique_ptr<ccl_allreduce_2d_builder>(
-      new ccl_allreduce_2d_builder(
-          (env_object.allreduce_2d_base_size != CCL_ENV_SIZET_NOT_SPECIFIED)
-              ? env_object.allreduce_2d_base_size
-               : ccl::global_data::get().executor->get_local_proc_count(),
-           env_object.allreduce_2d_switch_dims,
-           this));
+    allreduce_2d_builder = std::unique_ptr<ccl_allreduce_2d_builder>(new ccl_allreduce_2d_builder(
+        (env_object.allreduce_2d_base_size != CCL_ENV_SIZET_NOT_SPECIFIED)
+            ? env_object.allreduce_2d_base_size
+            : ccl::global_data::get().executor->get_local_proc_count(),
+        env_object.allreduce_2d_switch_dims,
+        this));
 
     if (m_rank == 0)
         env_object.print();
 }
 
-ccl_comm::ccl_comm(size_t rank,
-                   size_t size,
+ccl_comm::ccl_comm(int rank,
+                   int size,
                    ccl_comm_id_storage::comm_id&& id,
                    std::shared_ptr<atl_wrapper> atl,
                    bool share_resources)
         : ccl_comm(rank, size, std::move(id), ccl_rank2rank_map{}, atl, share_resources) {}
 
-ccl_comm::ccl_comm(size_t rank,
-                   size_t size,
+ccl_comm::ccl_comm(int rank,
+                   int size,
                    ccl_comm_id_storage::comm_id&& id,
                    ccl_rank2rank_map&& rank_map,
                    std::shared_ptr<atl_wrapper> atl,
@@ -63,8 +61,7 @@ ccl_comm::ccl_comm(size_t rank,
           on_process_ranks_number(1) {
     reset(rank, size);
 
-    if (!share_resources)
-    {
+    if (!share_resources) {
         allocate_resources();
     }
 }
@@ -79,27 +76,24 @@ void ccl_comm::ccl_comm_reset_thread_barrier() {
     thread_ranks_counter.store(0);
 }
 
-ccl_comm::ccl_comm(const std::vector<size_t>& local_thread_device_ranks,
-                   size_t cluster_devices_count,
+ccl_comm::ccl_comm(const std::vector<int>& local_ranks,
+                   int comm_size,
                    std::shared_ptr<ccl::kvs_interface> kvs_instance,
                    ccl_comm_id_storage::comm_id&& id,
                    bool share_resources)
         : m_id(std::move(id)),
           m_local2global_map(),
-          m_dtree(local_thread_device_ranks.size(), cluster_devices_count) {
-
+          m_dtree(local_ranks.size(), comm_size) {
     std::shared_ptr<ikvs_wrapper> kvs_wrapper(new users_kvs(kvs_instance));
 
-    atl = std::shared_ptr<atl_wrapper>(
-        new atl_wrapper(cluster_devices_count, local_thread_device_ranks, kvs_wrapper));
+    atl = std::shared_ptr<atl_wrapper>(new atl_wrapper(comm_size, local_ranks, kvs_wrapper));
 
-    thread_number = atl->get_threads_count();
-    on_process_ranks_number = atl->get_devices_per_rank_count();
+    thread_number = atl->get_threads_per_process();
+    on_process_ranks_number = atl->get_ranks_per_process();
 
     reset(atl->get_rank(), atl->get_size());
 
-    if (!share_resources)
-    {
+    if (!share_resources) {
         allocate_resources();
     }
 }
@@ -109,11 +103,11 @@ ccl_comm* ccl_comm::create_with_colors(const std::vector<int>& colors,
                                        const ccl_comm* parent_comm,
                                        bool share_resources) {
     ccl_rank2rank_map rank_map;
-    size_t new_comm_size = 0;
-    size_t new_comm_rank = 0;
+    int new_comm_size = 0;
+    int new_comm_rank = 0;
     int color = colors[parent_comm->rank()];
 
-    for (size_t i = 0; i < parent_comm->size(); ++i) {
+    for (int i = 0; i < parent_comm->size(); ++i) {
         if (colors[i] == color) {
             LOG_DEBUG("map local rank ", new_comm_size, " to global ", i);
             rank_map.emplace_back(i);
@@ -134,8 +128,12 @@ ccl_comm* ccl_comm::create_with_colors(const std::vector<int>& colors,
         rank_map.clear();
     }
 
-    ccl_comm* comm = new ccl_comm(
-        new_comm_rank, new_comm_size, comm_ids->acquire(), std::move(rank_map), parent_comm->atl, share_resources);
+    ccl_comm* comm = new ccl_comm(new_comm_rank,
+                                  new_comm_size,
+                                  comm_ids->acquire(),
+                                  std::move(rank_map),
+                                  parent_comm->atl,
+                                  share_resources);
 
     LOG_DEBUG("new comm: color ",
               color,
@@ -151,23 +149,26 @@ ccl_comm* ccl_comm::create_with_colors(const std::vector<int>& colors,
 
 std::shared_ptr<ccl_comm> ccl_comm::clone_with_new_id(ccl_comm_id_storage::comm_id&& id) {
     ccl_rank2rank_map rank_map{ m_local2global_map };
-    return std::make_shared<ccl_comm>(m_rank, m_size, std::move(id), std::move(rank_map), atl, true /*share_resources*/);
+    return std::make_shared<ccl_comm>(
+        m_rank, m_size, std::move(id), std::move(rank_map), atl, true /*share_resources*/);
 }
 
-size_t ccl_comm::get_global_rank(size_t rank) const {
+// Maps a rank of this communicator to its rank in the global communicator.
+int ccl_comm::get_global_rank(int rank) const {
     if (m_local2global_map.empty()) {
         // global comm and its copies do not have entries in the map
         return rank;
     }
 
-    CCL_THROW_IF_NOT(m_local2global_map.size() > rank,
+    // rank is signed: reject negative values too, before rank is used as a vector index
+    CCL_THROW_IF_NOT(rank >= 0 && static_cast<size_t>(rank) < m_local2global_map.size(),
                      "no rank ",
                      rank,
                      " was found in comm ",
                      this,
                      ", id ",
                      m_id.value());
-    size_t global_rank = m_local2global_map[rank];
+    int global_rank = m_local2global_map[rank];
     LOG_DEBUG(
         "comm , ", this, " id ", m_id.value(), ", map rank ", rank, " to global ", global_rank);
     return global_rank;
diff --git a/src/common/comm/comm.hpp b/src/common/comm/comm.hpp
index 03a081a2c..5a9e2072a 100644
--- a/src/common/comm/comm.hpp
+++ b/src/common/comm/comm.hpp
@@ -13,168 +13,169 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#include <atomic>
-#include <unordered_map>
-
-#include "atl/atl_wrapper.h"
-#include "coll/algorithms/allreduce/allreduce_2d.hpp"
-#include "common/comm/comm_id_storage.hpp"
-#include "common/comm/atl_tag.hpp"
-#include "common/log/log.hpp"
-#include "common/utils/tree.hpp"
-#include "common/utils/utils.hpp"
-#include "unordered_coll/unordered_coll.hpp"
-
-// index = local_rank, value = global_rank
-using ccl_rank2rank_map = std::vector<size_t>;
-
-namespace ccl {
-class kvs_interface;
-}
-
-class alignas(CACHELINE_SIZE) ccl_comm {
-public:
-    //TODO
-    static void ccl_comm_reset_thread_barrier();
-    ccl_comm() = delete;
-    ccl_comm(const ccl_comm& other) = delete;
-    ccl_comm& operator=(const ccl_comm& other) = delete;
-
-    ccl_comm(size_t rank,
-             size_t size,
-             ccl_comm_id_storage::comm_id&& id,
-             std::shared_ptr<atl_wrapper> atl,
-             bool share_resources = false);
-    ccl_comm(size_t rank,
-             size_t size,
-             ccl_comm_id_storage::comm_id&& id,
-             ccl_rank2rank_map&& ranks,
-             std::shared_ptr<atl_wrapper> atl,
-             bool share_resources = false);
-
-    //TODO non-implemented
-    //1) cluster_devices_count (devices 1000) -> (processes 10)
-    //2) blocking until all thread -> calls ccl_comm
-    //3) return 'thread_count'
-
-    // ccl_comm( {0,1,2,3...}, 1000, kvs )
-    // from 20 processes from ranks 0,1,2,3. Each rank contains 10 threads
-    // communicator: size in {20} and ranks in {0..19}
-    // communicator: return threads count in process {10}
-    // communicator: return devices counts per thread in process
-    ccl_comm(const std::vector<size_t>& local_thread_device_ranks,
-             size_t cluster_devices_count,
-             std::shared_ptr<ccl::kvs_interface> kvs_instance,
-             ccl_comm_id_storage::comm_id&& id,
-             bool share_resources = false);
-
-    ~ccl_comm() = default;
-
-    /* version with user-provided colors, allows to skip allgatherv */
-    static ccl_comm* create_with_colors(const std::vector<int>& colors,
-                                        ccl_comm_id_storage* comm_ids,
-                                        const ccl_comm* parent_comm,
-                                        bool share_resources = false);
-
-    std::shared_ptr<ccl_comm> clone_with_new_id(ccl_comm_id_storage::comm_id&& id);
-
-    size_t rank() const noexcept {
-        return m_rank;
-    }
-
-    size_t size() const noexcept {
-        return m_size;
-    }
-
-    size_t pof2() const noexcept {
-        return m_pof2;
-    }
-
-    ccl_comm_id_t id() const noexcept {
-        return m_id.value();
-    }
-
-    size_t thread_count() const noexcept {
-        return thread_number;
-    }
-
-    size_t on_process_ranks_count() const noexcept {
-        return on_process_ranks_number;
-    }
-
-    ccl_sched_id_t get_sched_id(bool use_internal_space) {
-        ccl_sched_id_t& next_sched_id =
-            (use_internal_space) ? m_next_sched_id_internal : m_next_sched_id_external;
-
-        ccl_sched_id_t first_sched_id =
-            (use_internal_space) ? static_cast<ccl_sched_id_t>(0) : ccl_comm::max_sched_count / 2;
-
-        ccl_sched_id_t max_sched_id =
-            (use_internal_space) ? ccl_comm::max_sched_count / 2 : ccl_comm::max_sched_count;
-
-        ccl_sched_id_t id = next_sched_id;
-
-        ++next_sched_id;
-
-        if (next_sched_id == max_sched_id) {
-            /* wrap the sched numbers around to the start */
-            next_sched_id = first_sched_id;
-        }
-
-        LOG_DEBUG("sched_id ", id, ", comm_id ", m_id.value(), ", next sched_id ", next_sched_id);
-
-        return id;
-    }
-
-    void reset(size_t rank, size_t size) {
-        m_rank = rank;
-        m_size = size;
-        m_pof2 = ccl_pof2(m_size);
-
-        m_next_sched_id_internal = ccl_comm::max_sched_count / 2;
-        m_next_sched_id_external = 0;
-    }
-
-    /**
-     * Returns the number of @c rank in the global communicator
-     * @param rank a rank which is part of the current communicator
-     * @return number of @c rank in the global communicator
-     */
-    size_t get_global_rank(size_t rank) const;
-
-    const ccl_double_tree& dtree() const {
-        return m_dtree;
-    }
-
-    /**
-     * Maximum available number of active communicators
-     */
-    static constexpr ccl_sched_id_t max_comm_count = std::numeric_limits<ccl_comm_id_t>::max();
-    /**
-     * Maximum value of schedule id in scope of the current communicator
-     */
-    static constexpr ccl_sched_id_t max_sched_count = std::numeric_limits<ccl_sched_id_t>::max();
-
-    std::shared_ptr<atl_wrapper> atl;
-    std::unique_ptr<ccl_unordered_coll_manager> unordered_coll_manager;
-    std::unique_ptr<ccl_allreduce_2d_builder> allreduce_2d_builder;
-
-private:
-
-    void allocate_resources();
-
-    size_t m_rank;
-    size_t m_size;
-    size_t m_pof2;
-
-    ccl_comm_id_storage::comm_id m_id;
-    ccl_sched_id_t m_next_sched_id_internal;
-    ccl_sched_id_t m_next_sched_id_external;
-    ccl_rank2rank_map m_local2global_map{};
-    ccl_double_tree m_dtree;
-
-    size_t thread_number;
-    size_t on_process_ranks_number;
-};
+#pragma once
+
+#include <atomic>
+#include <unordered_map>
+
+#include "atl/atl_wrapper.h"
+#include "coll/algorithms/allreduce/allreduce_2d.hpp"
+#include "common/comm/comm_id_storage.hpp"
+#include "common/comm/atl_tag.hpp"
+#include "common/log/log.hpp"
+#include "common/utils/tree.hpp"
+#include "common/utils/utils.hpp"
+#include "unordered_coll/unordered_coll.hpp"
+
+// index = local_rank, value = global_rank
+using ccl_rank2rank_map = std::vector<int>;
+
+namespace ccl {
+namespace v1 {
+class kvs_interface;
+}
+} // namespace ccl
+
+class alignas(CACHELINE_SIZE) ccl_comm {
+public:
+    //TODO: undocumented; its comm.cpp definition stores 0 into thread_ranks_counter
+    static void ccl_comm_reset_thread_barrier();
+    ccl_comm() = delete;
+    ccl_comm(const ccl_comm& other) = delete;
+    ccl_comm& operator=(const ccl_comm& other) = delete;
+
+    ccl_comm(int rank,
+             int size,
+             ccl_comm_id_storage::comm_id&& id,
+             std::shared_ptr<atl_wrapper> atl,
+             bool share_resources = false);
+    ccl_comm(int rank,
+             int size,
+             ccl_comm_id_storage::comm_id&& id,
+             ccl_rank2rank_map&& ranks,
+             std::shared_ptr<atl_wrapper> atl,
+             bool share_resources = false);
+
+    //TODO not implemented
+    //1) comm_size (devices 1000) -> (processes 10)
+    //2) blocking until all threads call ccl_comm
+    //3) return 'thread_count'
+
+    // ccl_comm( {0,1,2,3...}, 1000, kvs )
+    // from 20 processes from ranks 0,1,2,3. Each rank contains 10 threads
+    // communicator: size in {20} and ranks in {0..19}
+    // communicator: return threads count in process {10}
+    // communicator: return devices counts per thread in process
+    ccl_comm(const std::vector<int>& local_ranks,
+             int comm_size,
+             std::shared_ptr<ccl::kvs_interface> kvs_instance,
+             ccl_comm_id_storage::comm_id&& id,
+             bool share_resources = false);
+
+    ~ccl_comm() = default;
+
+    /* version with user-provided colors, which allows skipping allgatherv */
+    static ccl_comm* create_with_colors(const std::vector<int>& colors,
+                                        ccl_comm_id_storage* comm_ids,
+                                        const ccl_comm* parent_comm,
+                                        bool share_resources = false);
+
+    std::shared_ptr<ccl_comm> clone_with_new_id(ccl_comm_id_storage::comm_id&& id);
+
+    int rank() const noexcept {
+        return m_rank;
+    }
+
+    int size() const noexcept {
+        return m_size;
+    }
+
+    int pof2() const noexcept {
+        return m_pof2;
+    }
+
+    ccl_comm_id_t id() const noexcept {
+        return m_id.value();
+    }
+
+    size_t thread_count() const noexcept {
+        return thread_number;
+    }
+
+    size_t ranks_per_process() const noexcept {
+        return on_process_ranks_number;
+    }
+
+    ccl_sched_id_t get_sched_id(bool use_internal_space) {
+        ccl_sched_id_t& next_sched_id =
+            (use_internal_space) ? m_next_sched_id_internal : m_next_sched_id_external;
+
+        ccl_sched_id_t first_sched_id =
+            (use_internal_space) ? static_cast<ccl_sched_id_t>(0) : ccl_comm::max_sched_count / 2;
+
+        ccl_sched_id_t max_sched_id =
+            (use_internal_space) ? ccl_comm::max_sched_count / 2 : ccl_comm::max_sched_count;
+
+        ccl_sched_id_t id = next_sched_id;
+
+        ++next_sched_id;
+
+        if (next_sched_id == max_sched_id) {
+            /* wrap to first_sched_id; NOTE(review): reset() seeds internal ids at max/2, not 0 */
+            next_sched_id = first_sched_id;
+        }
+
+        LOG_DEBUG("sched_id ", id, ", comm_id ", m_id.value(), ", next sched_id ", next_sched_id);
+
+        return id;
+    }
+
+    void reset(int rank, int size) {
+        m_rank = rank;
+        m_size = size;
+        m_pof2 = ccl_pof2(m_size);
+
+        m_next_sched_id_internal = ccl_comm::max_sched_count / 2;
+        m_next_sched_id_external = 0;
+    }
+
+    /**
+     * Maps a rank of the current communicator to its rank in the global communicator
+     * @param rank a rank which is part of the current communicator
+     * @return the rank that @c rank has in the global communicator
+     */
+    int get_global_rank(int rank) const;
+
+    const ccl_double_tree& dtree() const {
+        return m_dtree;
+    }
+
+    /**
+     * Maximum available number of active communicators
+     */
+    static constexpr ccl_sched_id_t max_comm_count = std::numeric_limits<ccl_comm_id_t>::max();
+    /**
+     * Maximum value of schedule id in scope of the current communicator
+     */
+    static constexpr ccl_sched_id_t max_sched_count = std::numeric_limits<ccl_sched_id_t>::max();
+
+    std::shared_ptr<atl_wrapper> atl;
+    std::unique_ptr<ccl_unordered_coll_manager> unordered_coll_manager;
+    std::unique_ptr<ccl_allreduce_2d_builder> allreduce_2d_builder;
+
+private:
+    void allocate_resources();
+
+    int m_rank;
+    int m_size;
+    int m_pof2;
+
+    ccl_comm_id_storage::comm_id m_id;
+    ccl_sched_id_t m_next_sched_id_internal;
+    ccl_sched_id_t m_next_sched_id_external;
+    ccl_rank2rank_map m_local2global_map{};
+    ccl_double_tree m_dtree;
+
+    size_t thread_number;
+    size_t on_process_ranks_number;
+};
diff --git a/src/common/comm/comm_common_attr.hpp b/src/common/comm/comm_common_attr.hpp
new file mode 100644
index 000000000..af0f7d0c9
--- /dev/null
+++ b/src/common/comm/comm_common_attr.hpp
@@ -0,0 +1,52 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/comm_attr_ids_traits.hpp"
+
+namespace ccl {
+
+class ccl_comm_attr_impl {
+public:
+    /**
+     * `version` attribute: get_attribute_value() reads it, set_attribute_value() always throws
+     */
+    using version_traits_t = detail::ccl_api_type_attr_traits<comm_attr_id, comm_attr_id::version>;
+
+    const typename version_traits_t::return_type& get_attribute_value(
+        const version_traits_t& id) const {
+        return version;
+    }
+
+    typename version_traits_t::return_type set_attribute_value(typename version_traits_t::type val,
+                                                               const version_traits_t& t) {
+        (void)t;
+        throw ccl::exception("Set value for 'ccl::comm_attr_id::version' is not allowed");
+        return version; // unreachable: the throw above always fires
+    }
+
+    ccl_comm_attr_impl(const typename version_traits_t::return_type& version) : version(version) {}
+
+    template <comm_attr_id attr_id>
+    bool is_valid() const noexcept {
+        return (attr_id == comm_attr_id::version);
+    }
+
+protected:
+    typename version_traits_t::return_type version;
+};
+
+} // namespace ccl
diff --git a/src/common/comm/comm_id_storage.hpp b/src/common/comm/comm_id_storage.hpp
index 7025b8606..627bc7c9b 100644
--- a/src/common/comm/comm_id_storage.hpp
+++ b/src/common/comm/comm_id_storage.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/log/log.hpp"
 #include "common/utils/spinlock.hpp"
 
@@ -127,7 +127,6 @@ class ccl_comm_id_storage {
     ccl_comm_id_t acquire_id_impl(ccl_comm_id_t last_used,
                                   ccl_comm_id_t lower_bound,
                                   ccl_comm_id_t upper_bound) {
-        
         //search from the current position till the end
         LOG_DEBUG("last ", last_used, ", low ", lower_bound, " up ", upper_bound);
 
diff --git a/src/common/comm/comm_interface.cpp b/src/common/comm/comm_interface.cpp
index 7c270c28b..e5a153671 100644
--- a/src/common/comm/comm_interface.cpp
+++ b/src/common/comm/comm_interface.cpp
@@ -16,6 +16,9 @@
 #include "common/comm/comm_interface.hpp"
 #include "common/comm/compiler_comm_interface_dispatcher_impl.hpp"
 
-
-COMMUNICATOR_INTERFACE_DISPATCHER_CLASS_EXPLICIT_INSTANTIATION(typename ccl::unified_device_type::ccl_native_t, typename ccl::unified_device_context_type::ccl_native_t);
-COMMUNICATOR_INTERFACE_DISPATCHER_NON_CLASS_EXPLICIT_INSTANTIATION(ccl::device_index_type, typename ccl::unified_device_context_type::ccl_native_t);
+COMMUNICATOR_INTERFACE_DISPATCHER_CLASS_EXPLICIT_INSTANTIATION(
+    typename ccl::unified_device_type::ccl_native_t,
+    typename ccl::unified_context_type::ccl_native_t);
+COMMUNICATOR_INTERFACE_DISPATCHER_NON_CLASS_EXPLICIT_INSTANTIATION(
+    ccl::device_index_type,
+    typename ccl::unified_context_type::ccl_native_t);
diff --git a/src/common/comm/comm_interface.hpp b/src/common/comm/comm_interface.hpp
index ab6e119df..1e813d8c5 100644
--- a/src/common/comm/comm_interface.hpp
+++ b/src/common/comm/comm_interface.hpp
@@ -14,32 +14,30 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_event.hpp"
 
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/event.hpp"
 
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_stream.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
 
-#include "common/event/event_internal/event_internal_attr_ids.hpp"
-#include "common/event/event_internal/event_internal_attr_ids_traits.hpp"
-#include "common/event/event_internal/event_internal.hpp"
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/stream.hpp"
 
 #include "common/comm/compiler_comm_interface_dispatcher.hpp"
 #include "common/comm/l0/comm_context_id.hpp"
+#include "internal_types.hpp"
 
 namespace native {
 struct ccl_device;
 }
 
 namespace ccl {
-struct gpu_comm_attr;
+namespace v1 {
 class allgatherv_attr;
 class allreduce_attr;
 class alltoall_attr;
@@ -49,16 +47,94 @@ class broadcast_attr;
 class reduce_attr;
 class reduce_scatter_attr;
 class sparse_allreduce_attr;
+} // namespace v1
+
+struct gpu_comm_attr;
 } // namespace ccl
 
 #include "types_generator_defines.hpp"
 
+#define COMM_INTERFACE_COLL_METHODS(TYPE) \
+\
+    COMM_INTERFACE_COLL_##TYPE##__VOID; \
+    COMM_INTERFACE_COLL_##TYPE(int8_t); \
+    COMM_INTERFACE_COLL_##TYPE(uint8_t); \
+    COMM_INTERFACE_COLL_##TYPE(int16_t); \
+    COMM_INTERFACE_COLL_##TYPE(uint16_t); \
+    COMM_INTERFACE_COLL_##TYPE(int32_t); \
+    COMM_INTERFACE_COLL_##TYPE(uint32_t); \
+    COMM_INTERFACE_COLL_##TYPE(int64_t); \
+    COMM_INTERFACE_COLL_##TYPE(uint64_t); \
+    COMM_INTERFACE_COLL_##TYPE(float); \
+    COMM_INTERFACE_COLL_##TYPE(double); \
+\
+    COMM_INTERFACE_SPARSE_##TYPE##__VOID; \
+    COMM_INTERFACE_SPARSE_##TYPE(int32_t, ccl::bfloat16); \
+    COMM_INTERFACE_SPARSE_##TYPE(int32_t, float); \
+    COMM_INTERFACE_SPARSE_##TYPE(int64_t, ccl::bfloat16); \
+    COMM_INTERFACE_SPARSE_##TYPE(int64_t, float);
+
+#define SYCL_COMM_INTERFACE_COLL_METHODS(TYPE) \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<int8_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<uint8_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<int16_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<uint16_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<int32_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<uint32_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<int64_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<uint64_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<float COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_##TYPE(cl::sycl::buffer<double COMMA 1>); \
+\
+    COMM_INTERFACE_SPARSE_CLASS_##TYPE(cl::sycl::buffer<int32_t COMMA 1>, \
+                                       cl::sycl::buffer<float COMMA 1>); \
+    COMM_INTERFACE_SPARSE_CLASS_##TYPE(cl::sycl::buffer<int32_t COMMA 1>, \
+                                       cl::sycl::buffer<ccl::bfloat16 COMMA 1>); \
+\
+    COMM_INTERFACE_SPARSE_CLASS_##TYPE(cl::sycl::buffer<int64_t COMMA 1>, \
+                                       cl::sycl::buffer<float COMMA 1>); \
+    COMM_INTERFACE_SPARSE_CLASS_##TYPE(cl::sycl::buffer<int64_t COMMA 1>, \
+                                       cl::sycl::buffer<ccl::bfloat16 COMMA 1>);
+
+#define COMM_INTERFACE_COLL_INSTANTIATION(COMM) \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, int8_t); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, uint8_t); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, int16_t); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, uint16_t); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, int32_t); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, uint32_t); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, int64_t); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, uint64_t); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, float); \
+    COMM_INTERFACE_COLL_INSTANTIATIONS(COMM, double); \
+    COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int32_t, float); \
+    COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int32_t, ccl::bfloat16); \
+    COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int64_t, float); \
+    COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(COMM, int64_t, ccl::bfloat16);
+
+#define SYCL_COMM_INTERFACE_COLL_INSTANTIATION(COMM) \
+    COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer<int8_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer<int32_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer<int64_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer<uint64_t COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer<float COMMA 1>); \
+    COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(COMM, cl::sycl::buffer<double COMMA 1>); \
+\
+    COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION( \
+        COMM, cl::sycl::buffer<int32_t COMMA 1>, cl::sycl::buffer<float COMMA 1>); \
+    COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION( \
+        COMM, cl::sycl::buffer<int32_t COMMA 1>, cl::sycl::buffer<ccl::bfloat16 COMMA 1>); \
+    COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION( \
+        COMM, cl::sycl::buffer<int64_t COMMA 1>, cl::sycl::buffer<float COMMA 1>); \
+    COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION( \
+        COMM, cl::sycl::buffer<int64_t COMMA 1>, cl::sycl::buffer<ccl::bfloat16 COMMA 1>);
+
 namespace ccl {
 struct communicator_interface : public communicator_interface_dispatcher {
     virtual ~communicator_interface() = default;
 
-    virtual size_t rank() const = 0;
-    virtual size_t size() const = 0;
+    virtual int rank() const = 0;
+    virtual int size() const = 0;
 
     virtual bool is_host() const noexcept = 0;
     virtual bool is_cpu() const noexcept = 0;
@@ -73,66 +149,12 @@ struct communicator_interface : public communicator_interface_dispatcher {
 
     // collectives operation declarations
     virtual ccl::event barrier(const stream::impl_value_t& op_stream,
-                                 const barrier_attr& attr,
-                                 const vector_class<event>& deps = {}) = 0;
-
-    DEVICE_COMM_INTERFACE_COLL_DECLARATION__VOID;
-    DEVICE_COMM_INTERFACE_COLL_DECLARATION(char);
-    DEVICE_COMM_INTERFACE_COLL_DECLARATION(int);
-    DEVICE_COMM_INTERFACE_COLL_DECLARATION(int64_t);
-    DEVICE_COMM_INTERFACE_COLL_DECLARATION(uint64_t);
-    DEVICE_COMM_INTERFACE_COLL_DECLARATION(float);
-    DEVICE_COMM_INTERFACE_COLL_DECLARATION(double);
-
-#ifdef CCL_ENABLE_SYCL
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DECLARATION(cl::sycl::buffer<char COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DECLARATION(cl::sycl::buffer<int COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DECLARATION(cl::sycl::buffer<int64_t COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DECLARATION(cl::sycl::buffer<uint64_t COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DECLARATION(cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DECLARATION(cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION__VOID
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(char, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(char, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(char, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(char, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(char, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(char, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(char, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int64_t, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int64_t, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int64_t, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int64_t, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int64_t, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int64_t, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(int64_t, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(uint64_t, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(uint64_t, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(uint64_t, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(uint64_t, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(uint64_t, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(uint64_t, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(uint64_t, uint64_t);
+                               const barrier_attr& attr,
+                               const vector_class<event>& deps = {}) = 0;
 
+    COMM_INTERFACE_COLL_METHODS(DECLARATION);
 #ifdef CCL_ENABLE_SYCL
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DECLARATION(cl::sycl::buffer<int COMMA 1>,
-                                                   cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DECLARATION(cl::sycl::buffer<int COMMA 1>,
-                                                   cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DECLARATION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                   cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DECLARATION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                   cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+    SYCL_COMM_INTERFACE_COLL_METHODS(DECLARATION);
+#endif /* CCL_ENABLE_SYCL */
 };
 } // namespace ccl
diff --git a/src/common/comm/comm_split_common_attr.hpp b/src/common/comm/comm_split_common_attr.hpp
index d6cf1fa65..f59e9d90f 100644
--- a/src/common/comm/comm_split_common_attr.hpp
+++ b/src/common/comm/comm_split_common_attr.hpp
@@ -13,148 +13,150 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-
-namespace ccl {
-
-/**
- * Base implementation
- */
-template <template <class attr, attr id> class traits_t, class split_attrs_t>
-class ccl_base_comm_split_attr_impl {
-public:
-    /**
-     * `version` operations
-     */
-    using version_traits_t = traits_t<split_attrs_t, split_attrs_t::version>;
-
-    const typename version_traits_t::type& get_attribute_value(
-        const traits_t<split_attrs_t, split_attrs_t::version>& id) const {
-        return version;
-    }
-
-    typename version_traits_t::type set_attribute_value(typename version_traits_t::type val,
-                                                        const version_traits_t& t) {
-        (void)t;
-        throw ccl::exception("Set value for 'version' attribute is not allowed");
-        return version;
-    }
-
-    /**
-     * `color` operations
-     */
-    using color_traits_t = traits_t<split_attrs_t, split_attrs_t::color>;
-
-    const typename color_traits_t::type& get_attribute_value(
-        const traits_t<split_attrs_t, split_attrs_t::color>& id) const {
-        if (!is_valid<split_attrs_t::color>()) {
-            throw ccl::exception("Trying to get the value of the attribute 'color' which was not set");
-        }
-        return color;
-    }
-
-    typename color_traits_t::type set_attribute_value(typename color_traits_t::type val,
-                                                      const color_traits_t& t) {
-        auto old = color;
-        std::swap(color, val);
-        cur_attr = { true, split_attrs_t::color };
-        return old;
-    }
-
-    /**
-     * `group` operations
-     */
-    using group_traits_t = traits_t<split_attrs_t, split_attrs_t::group>;
-
-    const typename group_traits_t::type& get_attribute_value(group_traits_t id) const {
-        if (!is_valid<split_attrs_t::group>()) {
-            throw ccl::exception("Trying to get the value of the attribute 'group' which was not set");
-        }
-        return group;
-    }
-
-    typename group_traits_t::type set_attribute_value(typename group_traits_t::type val,
-                                                      const group_traits_t& t) {
-        auto old = group;
-        std::swap(group, val);
-        cur_attr = { true, split_attrs_t::group };
-        return old;
-    }
-
-    /**
-     * Since we can get values for various attributes,
-     * we need to have a way to ensure that the requested value has been set.
-     * Because if not, an exception is thrown when trying to get a value that was not set.
-     * This method helps with it
-     */
-    template <split_attrs_t attr_id>
-    bool is_valid() const noexcept {
-        return (cur_attr.first && attr_id == cur_attr.second) ||
-               (attr_id == split_attrs_t::version);
-    }
-
-    /**
-     * Since we can split types: color or group,
-     * we need a way to know which specific type we are using.
-     * Returns the pair <exist or not; value>
-     */
-    const std::pair<bool, split_attrs_t>& get_current_split_attr() const noexcept {
-        return cur_attr;
-    }
-
-    static constexpr typename color_traits_t::type get_default_color() {
-        return 0;
-    }
-
-    ccl_base_comm_split_attr_impl(const typename version_traits_t::type& version,
-                                  const typename group_traits_t::type& group)
-            : version(version),
-              color(get_default_color()),
-              group(group),
-              cur_attr({ false, split_attrs_t::color }) {}
-
-protected:
-    const typename version_traits_t::type version;
-    typename color_traits_t::type color;
-    typename group_traits_t::type group;
-
-    template <class T>
-    using ccl_optional_t = std::pair<bool /*exist or not*/, T>;
-
-    ccl_optional_t<split_attrs_t> cur_attr;
-};
-
-/**
- * Device implementation
- */
-class ccl_comm_split_attr_impl
-        : public ccl_base_comm_split_attr_impl<details::ccl_api_type_attr_traits,
-                                               comm_split_attr_id> {
-public:
-    using base_t =
-        ccl_base_comm_split_attr_impl<details::ccl_api_type_attr_traits, comm_split_attr_id>;
-
-    template <class traits_t>
-    const typename traits_t::type& get_attribute_value(const traits_t& id) const {
-        return base_t::get_attribute_value(id);
-    }
-
-    template <class value_t, class traits_t>
-    value_t set_attribute_value(value_t val, const traits_t& t) {
-        return base_t::set_attribute_value(val, t);
-    }
-
-    /**
-     * Device-specific methods
-     */
-    static constexpr typename group_traits_t::type get_default_group_type() {
-        return group_traits_t::type::cluster; // device-specific value (ccl_device_group_split_type)
-    }
-
-    ccl_comm_split_attr_impl(const typename version_traits_t::type& version)
-            : base_t(version, get_default_group_type()) {}
-};
-
-} // namespace ccl
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+
+namespace ccl {
+
+/**
+ * Base implementation: stores `version`, `color`, `group` and whether color or group was set last
+ */
+template <template <class attr, attr id> class traits_t, class split_attrs_t>
+class ccl_base_comm_split_attr_impl {
+public:
+    /**
+     * `version` operations: reading always succeeds, writing always throws ccl::exception
+     */
+    using version_traits_t = traits_t<split_attrs_t, split_attrs_t::version>;
+
+    const typename version_traits_t::type& get_attribute_value(
+        const traits_t<split_attrs_t, split_attrs_t::version>& id) const {
+        return version;
+    }
+
+    typename version_traits_t::type set_attribute_value(typename version_traits_t::type val,
+                                                        const version_traits_t& t) {
+        (void)t;
+        throw ccl::exception("Set value for 'version' attribute is not allowed");
+        return version; // never reached: the throw above always leaves the function
+    }
+
+    /**
+     * `color` operations: the getter throws ccl::exception unless is_valid<color>() holds
+     */
+    using color_traits_t = traits_t<split_attrs_t, split_attrs_t::color>;
+
+    const typename color_traits_t::type& get_attribute_value(
+        const traits_t<split_attrs_t, split_attrs_t::color>& id) const {
+        if (!is_valid<split_attrs_t::color>()) {
+            throw ccl::exception(
+                "Trying to get the value of the attribute 'color' which was not set");
+        }
+        return color;
+    }
+
+    typename color_traits_t::type set_attribute_value(typename color_traits_t::type val,
+                                                      const color_traits_t& t) {
+        auto old = color;
+        std::swap(color, val);
+        cur_attr = { true, split_attrs_t::color }; // `color` is now the attribute in effect
+        return old; // the value `color` held before this call
+    }
+
+    /**
+     * `group` operations: the getter throws ccl::exception unless is_valid<group>() holds
+     */
+    using group_traits_t = traits_t<split_attrs_t, split_attrs_t::group>;
+
+    const typename group_traits_t::type& get_attribute_value(group_traits_t id) const {
+        if (!is_valid<split_attrs_t::group>()) {
+            throw ccl::exception(
+                "Trying to get the value of the attribute 'group' which was not set");
+        }
+        return group;
+    }
+
+    typename group_traits_t::type set_attribute_value(typename group_traits_t::type val,
+                                                      const group_traits_t& t) {
+        auto old = group;
+        std::swap(group, val);
+        cur_attr = { true, split_attrs_t::group }; // `group` is now the attribute in effect
+        return old; // the value `group` held before this call
+    }
+
+    /**
+     * Reports whether the attribute `attr_id` can currently be read.
+     * `version` is always valid; `color` or `group` is valid only while it is the
+     * attribute recorded in `cur_attr`, i.e. the one whose setter was called last.
+     * The `color` and `group` getters throw ccl::exception when this returns false
+     */
+    template <split_attrs_t attr_id>
+    bool is_valid() const noexcept {
+        return (cur_attr.first && attr_id == cur_attr.second) ||
+               (attr_id == split_attrs_t::version);
+    }
+
+    /**
+     * Tells which split attribute (color or group) is currently in effect.
+     * Returns `cur_attr`: the pair <was any attribute set; id of the attribute set last>.
+     * Until a setter succeeds the pair is { false, split_attrs_t::color }
+     */
+    const std::pair<bool, split_attrs_t>& get_current_split_attr() const noexcept {
+        return cur_attr;
+    }
+
+    static constexpr typename color_traits_t::type get_default_color() {
+        return 0;
+    }
+
+    ccl_base_comm_split_attr_impl(const typename version_traits_t::type& version,
+                                  const typename group_traits_t::type& group)
+            : version(version),
+              color(get_default_color()),
+              group(group),
+              cur_attr({ false, split_attrs_t::color }) {} // nothing counts as "set" yet
+
+protected:
+    const typename version_traits_t::type version; // fixed at construction
+    typename color_traits_t::type color;
+    typename group_traits_t::type group;
+
+    template <class T>
+    using ccl_optional_t = std::pair<bool /*exist or not*/, T>;
+
+    ccl_optional_t<split_attrs_t> cur_attr; // { any attribute set?, id of the one set last }
+};
+
+/**
+ * Device implementation: the base class bound to comm_split_attr_id and its API traits
+ */
+class ccl_comm_split_attr_impl
+        : public ccl_base_comm_split_attr_impl<detail::ccl_api_type_attr_traits,
+                                               comm_split_attr_id> {
+public:
+    using base_t =
+        ccl_base_comm_split_attr_impl<detail::ccl_api_type_attr_traits, comm_split_attr_id>;
+
+    template <class traits_t>
+    const typename traits_t::type& get_attribute_value(const traits_t& id) const {
+        return base_t::get_attribute_value(id); // base overload is chosen by the type of `id`
+    }
+
+    template <class value_t, class traits_t>
+    value_t set_attribute_value(value_t val, const traits_t& t) {
+        return base_t::set_attribute_value(val, t); // base overload is chosen by the type of `t`
+    }
+
+    /**
+     * Device-specific methods: the default `group` handed to the base class constructor
+     */
+    static constexpr typename group_traits_t::type get_default_group_type() {
+        return group_traits_t::type::cluster; // device-specific value (ccl_device_group_split_type)
+    }
+
+    ccl_comm_split_attr_impl(const typename version_traits_t::type& version)
+            : base_t(version, get_default_group_type()) {}
+};
+
+} // namespace ccl
diff --git a/src/common/comm/compiler_comm_interface_dispatcher.hpp b/src/common/comm/compiler_comm_interface_dispatcher.hpp
index 03c5d25f3..ad472ddcf 100644
--- a/src/common/comm/compiler_comm_interface_dispatcher.hpp
+++ b/src/common/comm/compiler_comm_interface_dispatcher.hpp
@@ -17,7 +17,7 @@
 #include <cstddef>
 #include <memory>
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "supported_topologies.hpp"
 #include "communicator_traits.hpp"
 #include "atl/atl_wrapper.h"
@@ -26,25 +26,28 @@ namespace native {
 struct ccl_device;
 }
 namespace ccl {
+namespace v1 {
+class comm_split_attr;
+}
+
 #ifdef MULTI_GPU_SUPPORT
 struct gpu_comm_attr;
 #endif
 struct communicator_interface;
-class comm_split_attr;
 
 using communicator_interface_ptr = std::shared_ptr<communicator_interface>;
 
 struct communicator_interface_dispatcher {
     using device_t = typename ccl::unified_device_type::ccl_native_t;
-    using context_t = typename ccl::unified_device_context_type::ccl_native_t;
+    using context_t = typename ccl::unified_context_type::ccl_native_t;
 
 #ifdef MULTI_GPU_SUPPORT
     virtual void visit(ccl::gpu_comm_attr& comm_attr) = 0;
 #endif //MULTI_GPU_SUPPORT
 
     virtual ccl::device_index_type get_device_path() const = 0;
-    virtual device_t get_device() = 0;
-    virtual context_t get_context() = 0;
+    virtual device_t get_device() const = 0;
+    virtual context_t get_context() const = 0;
     virtual const comm_split_attr& get_comm_split_attr() const = 0;
     virtual group_split_type get_topology_type() const = 0;
     virtual device_topology_type get_topology_class() const = 0;
@@ -84,13 +87,13 @@ struct communicator_interface_dispatcher {
 
     // create communicator for host
     static communicator_interface_ptr create_communicator_impl(const size_t size,
-                                                               const size_t rank,
+                                                               const int rank,
                                                                shared_ptr_class<kvs_interface> kvs);
 
 private:
     static communicator_interface_ptr create_communicator_from_unified_device(
         unified_device_type&& device_id,
-        unified_device_context_type&& context_id,
+        unified_context_type&& context_id,
         size_t thread_idx,
         size_t process_idx,
         const comm_split_attr& attr,
diff --git a/src/common/comm/compiler_comm_interface_dispatcher_impl.hpp b/src/common/comm/compiler_comm_interface_dispatcher_impl.hpp
index 705a622ff..50c7464ed 100644
--- a/src/common/comm/compiler_comm_interface_dispatcher_impl.hpp
+++ b/src/common/comm/compiler_comm_interface_dispatcher_impl.hpp
@@ -19,10 +19,13 @@
 #include "common/comm/comm_interface.hpp"
 #include "unified_device_impl.hpp"
 
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
+
+#include "common/comm/comm_common_attr.hpp"
+#include "comm_attr_impl.hpp"
 
 #include "common/comm/comm_split_common_attr.hpp"
 #include "comm_split_attr_impl.hpp"
@@ -48,14 +51,16 @@ communicator_interface_ptr communicator_interface_dispatcher::create_communicato
     return communicator_interface_ptr(new host_communicator());
 }
 
-communicator_interface_ptr communicator_interface_dispatcher::create_communicator_impl(const size_t size,
-                                                               shared_ptr_class<kvs_interface> kvs) {
+communicator_interface_ptr communicator_interface_dispatcher::create_communicator_impl(
+    const size_t size,
+    shared_ptr_class<kvs_interface> kvs) {
     return communicator_interface_ptr(new host_communicator(size, kvs));
 }
 
-communicator_interface_ptr communicator_interface_dispatcher::create_communicator_impl(const size_t size,
-                                                               const size_t rank,
-                                                               shared_ptr_class<kvs_interface> kvs) {
+communicator_interface_ptr communicator_interface_dispatcher::create_communicator_impl(
+    const size_t size,
+    const int rank,
+    shared_ptr_class<kvs_interface> kvs) {
     return communicator_interface_ptr(new host_communicator(size, rank, kvs));
 }
 
@@ -75,7 +80,12 @@ communicator_interface_ptr communicator_interface_dispatcher::create_communicato
                   "Unsupported 'DeviceType'");
 
     return communicator_interface_dispatcher::create_communicator_from_unified_device(
-        unified_device_type(device), unified_device_context_type(context), thread_idx, process_idx, attr, atl);
+        unified_device_type(device),
+        unified_context_type(context),
+        thread_idx,
+        process_idx,
+        attr,
+        atl);
 }
 
 template <class DeviceType,
@@ -93,43 +103,58 @@ communicator_interface_ptr communicator_interface_dispatcher::create_communicato
 #ifdef CCL_ENABLE_SYCL
     return communicator_interface_dispatcher::create_communicator_from_unified_device(
         unified_device_type(device_id, cl::sycl::info::device_type::gpu),
-        unified_device_context_type(context),
+        unified_context_type(context),
         thread_idx,
         process_idx,
         attr,
         atl);
 #else
-//.    static_assert(std::is_same<typename unified_device_type::handle_t, DeviceType>::value,
-//                  "Unsupported 'DeviceType'");
+    //.    static_assert(std::is_same<typename unified_device_type::handle_t, DeviceType>::value,
+    //                  "Unsupported 'DeviceType'");
     return communicator_interface_dispatcher::create_communicator_from_unified_device(
-        unified_device_type(device_id), unified_device_context_type(context), thread_idx, process_idx, attr, atl);
+        unified_device_type(device_id),
+        unified_context_type(context),
+        thread_idx,
+        process_idx,
+        attr,
+        atl);
 #endif
 }
 
 communicator_interface_ptr
 communicator_interface_dispatcher::create_communicator_from_unified_device(
     ccl::unified_device_type&& device_id,
-    ccl::unified_device_context_type&& context,
+    ccl::unified_context_type&& context,
     size_t thread_idx,
     size_t process_idx,
     const ccl::comm_split_attr& attr,
     std::shared_ptr<atl_wrapper> atl) {
     // TODO ring by default at now. Choose preferred a2a if availbale
     ccl::device_topology_type preferred_topology_class = ccl::device_topology_type::ring;
-    ccl::group_split_type preferred_topology_group = ccl::group_split_type::cluster;
+    ccl::group_split_type preferred_topology_group = ccl::group_split_type::process;
+
+    /* type from API */
+    ccl::split_group split_group = ccl::split_group::cluster;
 
     // read comm split attributes
     if (attr.is_valid<ccl::comm_split_attr_id::group>()) {
-        preferred_topology_group = attr.get<ccl::comm_split_attr_id::group>();
+        split_group = attr.get<ccl::comm_split_attr_id::group>();
         if (attr.is_valid<ccl::comm_split_attr_id::color>()) {
             throw ccl::exception(std::string(
                 "Invalid `comm_split_attr`: both `color` and `group` set. Only one is supported"));
         }
+        if (split_group != ccl::split_group::cluster) // any other split group is rejected
+            throw ccl::exception("unexpected split_group");
     }
     else if (attr.is_valid<ccl::comm_split_attr_id::color>()) {
         throw ccl::exception(std::string(__FUNCTION__) + " - not implemented for 'color'");
     }
 
+#ifdef CCL_ENABLE_SYCL
+    /* TODO: tmp code to select right branch in switch-case */
+    preferred_topology_group = ccl::group_split_type::undetermined;
+#endif
+
     // TODO creation host communicator from device
     // if (device is host ?)
     // return new host_communicator(atl);
@@ -196,8 +221,8 @@ communicator_interface_dispatcher::create_communicator_from_unified_device(
         }
 #if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
         case device_topology_type::undetermined: {
-            auto comm_impl =
-                new single_device_communicator(std::move(device_id), std::move(context), thread_idx, process_idx, attr);
+            auto comm_impl = new single_device_communicator(
+                std::move(device_id), std::move(context), thread_idx, process_idx, attr);
             ccl::global_data& data = ccl::global_data::get();
             auto comm = std::shared_ptr<ccl_comm>(
                 new ccl_comm(thread_idx, process_idx, data.comm_ids->acquire(), atl));
@@ -226,7 +251,8 @@ communicator_interface_dispatcher::create_communicator_from_unified_device(
         const ccl::comm_split_attr& attr, \
         std::shared_ptr<atl_wrapper> atl);
 
-#define COMMUNICATOR_INTERFACE_DISPATCHER_NON_CLASS_EXPLICIT_INSTANTIATION(DeviceType, ContextType) \
+#define COMMUNICATOR_INTERFACE_DISPATCHER_NON_CLASS_EXPLICIT_INSTANTIATION(DeviceType, \
+                                                                           ContextType) \
     template ccl::communicator_interface_ptr \
     ccl::communicator_interface_dispatcher::create_communicator_impl( \
         DeviceType device_id, \
diff --git a/src/common/comm/host_communicator/host_communicator.cpp b/src/common/comm/host_communicator/host_communicator.cpp
index 70c2ef68d..af88f1fe7 100644
--- a/src/common/comm/host_communicator/host_communicator.cpp
+++ b/src/common/comm/host_communicator/host_communicator.cpp
@@ -15,9 +15,9 @@
 */
 #include "common/global/global.hpp"
 #include "common/comm/host_communicator/host_communicator_impl.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
 
 #include "common/request/request.hpp"
 #include "common/event/impls/host_event.hpp"
@@ -36,13 +36,12 @@
 
 namespace ccl {
 
-host_communicator::host_communicator()
-        : comm_attr(ccl::create_comm_split_attr())
-{
-}
+using ccl::preview::create_comm_split_attr;
+
+host_communicator::host_communicator() : comm_attr(create_comm_split_attr()) {}
 
-host_communicator::host_communicator(size_t size, shared_ptr_class<kvs_interface> kvs)
-        : comm_attr(ccl::create_comm_split_attr()),
+host_communicator::host_communicator(int size, shared_ptr_class<kvs_interface> kvs)
+        : comm_attr(create_comm_split_attr()),
           comm_rank(0),
           comm_size(size) {
     if (size <= 0) {
@@ -50,11 +49,10 @@ host_communicator::host_communicator(size_t size, shared_ptr_class<kvs_interface
     }
 }
 
-host_communicator::host_communicator(size_t size, size_t rank, shared_ptr_class<kvs_interface> kvs)
-        : comm_attr(ccl::create_comm_split_attr()),
+host_communicator::host_communicator(int size, int rank, shared_ptr_class<kvs_interface> kvs)
+        : comm_attr(create_comm_split_attr()),
           comm_rank(rank),
           comm_size(size) {
-
     if (rank > size || size <= 0) {
         throw ccl::exception("Incorrect rank or size value when creating a host communicator");
     }
@@ -69,17 +67,36 @@ host_communicator::host_communicator(size_t size, size_t rank, shared_ptr_class<
         std::shared_ptr<ccl_comm>(new ccl_comm(rank, size, data.comm_ids->acquire(), atl_tmp));
 }
 
+// Builds a host communicator on top of an existing ATL wrapper; rank and size come from `atl`.
+host_communicator::host_communicator(std::shared_ptr<atl_wrapper> atl)
+        : comm_attr(create_comm_split_attr()),
+          comm_rank(atl->get_rank()),
+          comm_size(atl->get_size()) {
+    const int rank = atl->get_rank();
+    const int size = atl->get_size();
+    // Ranks are zero-based, so a valid rank lies in [0, size); size itself must be positive.
+    if (size <= 0 || rank < 0 || rank >= size) {
+        throw ccl::exception(
+            "Incorrect rank or size value when creating a host communicator: rank: " +
+            std::to_string(rank) + " size: " + std::to_string(size));
+    }
+    LOG_DEBUG("host_communicator ctor");
+    // Wrap `atl` in a new ccl_comm, using an id obtained from the global comm_ids pool.
+    ccl::global_data& data = ccl::global_data::get();
+    comm_impl = std::shared_ptr<ccl_comm>(new ccl_comm(rank, size, data.comm_ids->acquire(), atl));
+}
+
 host_communicator::host_communicator(std::shared_ptr<ccl_comm> impl)
         : comm_impl(impl),
-          comm_attr(ccl::create_comm_split_attr()),
+          comm_attr(create_comm_split_attr()),
           comm_rank(impl->rank()),
           comm_size(impl->size()) {}
 
-size_t host_communicator::rank() const {
+int host_communicator::rank() const {
     return comm_rank;
 }
 
-size_t host_communicator::size() const {
+int host_communicator::size() const {
     return comm_size;
 }
 
@@ -95,13 +112,13 @@ ccl::device_index_type host_communicator::get_device_path() const {
                                    ccl::unused_index_value };
 }
 
-ccl::communicator_interface::device_t host_communicator::get_device() {
+ccl::communicator_interface::device_t host_communicator::get_device() const {
     throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
     static ccl::communicator_interface::device_t empty;
     return empty;
 }
 
-ccl::communicator_interface::context_t host_communicator::get_context() {
+ccl::communicator_interface::context_t host_communicator::get_context() const {
     throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
     static ccl::communicator_interface::context_t empty;
     return empty;
@@ -110,16 +127,11 @@ ccl::communicator_interface::context_t host_communicator::get_context() {
 void host_communicator::exchange_colors(std::vector<int>& colors) {
     size_t send_count = 1;
     vector_class<size_t> recv_counts(colors.size(), send_count);
-    auto attr = create_operation_attr<allgatherv_attr>(
-        attr_val<operation_attr_id::to_cache>(false));
-
-    this->allgatherv_impl(colors.data(),
-                          send_count,
-                          colors.data(),
-                          recv_counts,
-                          {},
-                          attr,
-                          {}).wait();
+    auto attr =
+        create_operation_attr<allgatherv_attr>(attr_val<operation_attr_id::to_cache>(false));
+
+    this->allgatherv_impl(colors.data(), send_count, colors.data(), recv_counts, {}, attr, {})
+        .wait();
 }
 
 ccl_comm* host_communicator::create_with_color(int color,
@@ -141,7 +153,7 @@ ccl_comm* host_communicator::create_with_color(int color,
 ccl::communicator_interface_ptr host_communicator::split(const comm_split_attr& attr) {
     if (!attr.is_valid<comm_split_attr_id::color>()) {
         throw ccl::exception(std::string(__FUNCTION__) +
-                        " - 'Color' split attribute for host communicator is not set");
+                             " - 'Color' split attribute for host communicator is not set");
     }
 
     ccl::global_data& data = ccl::global_data::get();
@@ -154,17 +166,15 @@ ccl::communicator_interface_ptr host_communicator::split(const comm_split_attr&
         new host_communicator(std::shared_ptr<ccl_comm>(new_comm)));
 }
 
-host_communicator::coll_request_t host_communicator::barrier(
-    const ccl::stream::impl_value_t& op_stream,
-    const ccl::barrier_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::barrier(const ccl::stream::impl_value_t& op_stream,
+                                      const ccl::barrier_attr& attr,
+                                      const ccl::vector_class<ccl::event>& deps) {
     return get_impl()->barrier_impl(op_stream, attr, deps);
 }
 
-host_communicator::coll_request_t host_communicator::barrier_impl(
-    const ccl::stream::impl_value_t& op_stream,
-    const ccl::barrier_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::barrier_impl(const ccl::stream::impl_value_t& op_stream,
+                                           const ccl::barrier_attr& attr,
+                                           const ccl::vector_class<ccl::event>& deps) {
     // TODO what exactly we need to do with 'attr' here?
 
     ccl_barrier_impl(comm_impl.get(), op_stream.get());
@@ -175,89 +185,83 @@ host_communicator::coll_request_t host_communicator::barrier_impl(
 }
 
 /* allgatherv */
-host_communicator::coll_request_t host_communicator::allgatherv_impl(
-    const void* send_buf,
-    size_t send_count,
-    void* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::allgatherv_impl(const void* send_buf,
+                                              size_t send_count,
+                                              void* recv_buf,
+                                              const ccl::vector_class<size_t>& recv_counts,
+                                              ccl::datatype dtype,
+                                              const ccl::stream::impl_value_t& stream,
+                                              const ccl::allgatherv_attr& attr,
+                                              const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_allgatherv_impl(
         send_buf, send_count, recv_buf, recv_counts.data(), dtype, attr, comm_impl.get(), nullptr);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
-host_communicator::coll_request_t host_communicator::allgatherv_impl(
-    const void* send_buf,
-    size_t send_count,
-    const ccl::vector_class<void*>& recv_bufs,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::allgatherv_impl(const void* send_buf,
+                                              size_t send_count,
+                                              const ccl::vector_class<void*>& recv_bufs,
+                                              const ccl::vector_class<size_t>& recv_counts,
+                                              ccl::datatype dtype,
+                                              const ccl::stream::impl_value_t& stream,
+                                              const ccl::allgatherv_attr& attr,
+                                              const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* allreduce */
-host_communicator::coll_request_t host_communicator::allreduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req =
-        ccl_allreduce_impl(send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), nullptr);
+ccl::event host_communicator::allreduce_impl(const void* send_buf,
+                                             void* recv_buf,
+                                             size_t count,
+                                             ccl::datatype dtype,
+                                             ccl::reduction reduction,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::allreduce_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_allreduce_impl(
+        send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), nullptr);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
 /* alltoall */
-host_communicator::coll_request_t host_communicator::alltoall_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::alltoall_impl(const void* send_buf,
+                                            void* recv_buf,
+                                            size_t count,
+                                            ccl::datatype dtype,
+                                            const ccl::stream::impl_value_t& stream,
+                                            const ccl::alltoall_attr& attr,
+                                            const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req =
         ccl_alltoall_impl(send_buf, recv_buf, count, dtype, attr, comm_impl.get(), nullptr);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
-host_communicator::coll_request_t host_communicator::alltoall_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<void*>& recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf,
+                                            const ccl::vector_class<void*>& recv_buf,
+                                            size_t count,
+                                            ccl::datatype dtype,
+                                            const ccl::stream::impl_value_t& stream,
+                                            const ccl::alltoall_attr& attr,
+                                            const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoallv */
-host_communicator::coll_request_t host_communicator::alltoallv_impl(
-    const void* send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    void* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::alltoallv_impl(const void* send_buf,
+                                             const ccl::vector_class<size_t>& send_counts,
+                                             void* recv_buf,
+                                             const ccl::vector_class<size_t>& recv_counts,
+                                             ccl::datatype dtype,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::alltoallv_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_alltoallv_impl(send_buf,
                                           send_counts.data(),
                                           recv_buf,
@@ -270,45 +274,42 @@ host_communicator::coll_request_t host_communicator::alltoallv_impl(
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
-host_communicator::coll_request_t host_communicator::alltoallv_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    ccl::vector_class<void*> recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& dep) {
+ccl::event host_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
+                                             const ccl::vector_class<size_t>& send_counts,
+                                             ccl::vector_class<void*> recv_buf,
+                                             const ccl::vector_class<size_t>& recv_counts,
+                                             ccl::datatype dtype,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::alltoallv_attr& attr,
+                                             const ccl::vector_class<ccl::event>& dep) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* bcast */
-host_communicator::coll_request_t host_communicator::broadcast_impl(
-    void* buf,
-    size_t count,
-    ccl::datatype dtype,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::broadcast_impl(void* buf,
+                                             size_t count,
+                                             ccl::datatype dtype,
+                                             int root,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::broadcast_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), nullptr);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
 /* reduce */
-host_communicator::coll_request_t host_communicator::reduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::reduce_impl(const void* send_buf,
+                                          void* recv_buf,
+                                          size_t count,
+                                          ccl::datatype dtype,
+                                          ccl::reduction reduction,
+                                          int root,
+                                          const ccl::stream::impl_value_t& stream,
+                                          const ccl::reduce_attr& attr,
+                                          const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_reduce_impl(
         send_buf, recv_buf, count, dtype, reduction, root, attr, comm_impl.get(), nullptr);
 
@@ -316,15 +317,14 @@ host_communicator::coll_request_t host_communicator::reduce_impl(
 }
 
 /* reduce_scatter */
-host_communicator::coll_request_t host_communicator::reduce_scatter_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t recv_count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_scatter_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::reduce_scatter_impl(const void* send_buf,
+                                                  void* recv_buf,
+                                                  size_t recv_count,
+                                                  ccl::datatype dtype,
+                                                  ccl::reduction reduction,
+                                                  const ccl::stream::impl_value_t& stream,
+                                                  const ccl::reduce_scatter_attr& attr,
+                                                  const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_reduce_scatter_impl(
         send_buf, recv_buf, recv_count, dtype, reduction, attr, comm_impl.get(), nullptr);
 
@@ -332,21 +332,20 @@ host_communicator::coll_request_t host_communicator::reduce_scatter_impl(
 }
 
 /* sparse_allreduce */
-host_communicator::coll_request_t host_communicator::sparse_allreduce_impl(
-    const void* send_ind_buf,
-    size_t send_ind_count,
-    const void* send_val_buf,
-    size_t send_val_count,
-    void* recv_ind_buf,
-    size_t recv_ind_count,
-    void* recv_val_buf,
-    size_t recv_val_count,
-    ccl::datatype index_dtype,
-    ccl::datatype value_dtype,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::sparse_allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::sparse_allreduce_impl(const void* send_ind_buf,
+                                                    size_t send_ind_count,
+                                                    const void* send_val_buf,
+                                                    size_t send_val_count,
+                                                    void* recv_ind_buf,
+                                                    size_t recv_ind_count,
+                                                    void* recv_val_buf,
+                                                    size_t recv_val_count,
+                                                    ccl::datatype index_dtype,
+                                                    ccl::datatype value_dtype,
+                                                    ccl::reduction reduction,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::sparse_allreduce_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_sparse_allreduce_impl(send_ind_buf,
                                                  send_ind_count,
                                                  send_val_buf,
@@ -374,129 +373,9 @@ std::string host_communicator::to_string() const {
            std::to_string(size());
 }
 
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(host_communicator, char);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(host_communicator, int);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(host_communicator, int64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(host_communicator, uint64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(host_communicator, float);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(host_communicator, double);
-
-#ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(host_communicator,
-                                                cl::sycl::buffer<char COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(host_communicator,
-                                                cl::sycl::buffer<int COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(host_communicator,
-                                                cl::sycl::buffer<int64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(host_communicator,
-                                                cl::sycl::buffer<uint64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(host_communicator,
-                                                cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(host_communicator,
-                                                cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              char,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              char,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              char,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              char,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              char,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              char,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              char,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator, int, int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              int64_t,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              uint64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              uint64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              uint64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              uint64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              uint64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              uint64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(host_communicator,
-                                                              uint64_t,
-                                                              uint64_t);
-
+COMM_INTERFACE_COLL_INSTANTIATION(host_communicator);
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    host_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    host_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    host_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    host_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(host_communicator);
+#endif /* CCL_ENABLE_SYCL */
 
 } // namespace ccl
diff --git a/src/common/comm/host_communicator/host_communicator.hpp b/src/common/comm/host_communicator/host_communicator.hpp
index 86e5f64b5..059a98b4c 100644
--- a/src/common/comm/host_communicator/host_communicator.hpp
+++ b/src/common/comm/host_communicator/host_communicator.hpp
@@ -16,35 +16,37 @@
 #pragma once
 
 #include "common/comm/comm.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-
-#include "oneapi/ccl/ccl_event.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+
+#include "oneapi/ccl/event.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 
 #include "common/comm/communicator_traits.hpp"
 #include "common/comm/comm_interface.hpp"
 #include "types_generator_defines.hpp"
+#include "atl/atl_wrapper.h"
 
 namespace ccl {
 
+namespace v1 {
 class kvs_interface;
+}
 
 class host_communicator : public ccl::communicator_interface {
 public:
-    using coll_request_t = ccl::event;
     using traits = ccl::host_communicator_traits;
 
-    size_t rank() const override;
-    size_t size() const override;
+    int rank() const override;
+    int size() const override;
 
     // traits
     bool is_host() const noexcept override {
@@ -80,103 +82,49 @@ class host_communicator : public ccl::communicator_interface {
 #endif
 
     ccl::device_index_type get_device_path() const override;
-    ccl::communicator_interface::device_t get_device() override;
-    ccl::communicator_interface::context_t get_context() override;
+    ccl::communicator_interface::device_t get_device() const override;
+    ccl::communicator_interface::context_t get_context() const override;
 
     const ccl::comm_split_attr& get_comm_split_attr() const override {
         return comm_attr;
     }
 
     ccl::group_split_type get_topology_type() const override {
-        throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
+        throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " +
+                             traits::name());
         return ccl::group_split_type::undetermined;
     }
 
     ccl::device_topology_type get_topology_class() const override {
-        throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " + traits::name());
+        throw ccl::exception(std::string(__FUNCTION__) + " is not applicable for " +
+                             traits::name());
         return ccl::device_topology_type::undetermined;
     }
 
     ccl::communicator_interface_ptr split(const comm_split_attr& attr) override;
 
     // collectives operation declarations
-    coll_request_t barrier(const stream::impl_value_t& op_stream,
-                                const barrier_attr& attr,
-                                const vector_class<event>& deps = {}) override;
-    coll_request_t barrier_impl(const stream::impl_value_t& op_stream,
-                                const barrier_attr& attr,
-                                const vector_class<event>& deps = {});
-
-    // communicator interfaces implementation
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION__VOID;
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(char);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(int);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(int64_t);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(uint64_t);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(float);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(double);
-
-#ifdef CCL_ENABLE_SYCL
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<char COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<uint64_t COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION__VOID;
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, uint64_t);
-
+    ccl::event barrier(const stream::impl_value_t& op_stream,
+                       const barrier_attr& attr,
+                       const vector_class<event>& deps = {}) override;
+    ccl::event barrier_impl(const stream::impl_value_t& op_stream,
+                            const barrier_attr& attr,
+                            const vector_class<event>& deps = {});
+
+    COMM_INTERFACE_COLL_METHODS(DEFINITION);
 #ifdef CCL_ENABLE_SYCL
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>,
-                                                  cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>,
-                                                  cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                  cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                  cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-
+    SYCL_COMM_INTERFACE_COLL_METHODS(DEFINITION);
+#endif /* CCL_ENABLE_SYCL */
 
-    DEVICE_COMM_IMPL_DECLARATION;
-    DEVICE_COMM_IMPL_CLASS_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_DECLARATION;
-    DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION
+    COMM_IMPL_DECLARATION;
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION;
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
 
     host_communicator();
-    host_communicator(size_t size, shared_ptr_class<kvs_interface> kvs);
-    host_communicator(size_t size, size_t rank, shared_ptr_class<kvs_interface> kvs);
+    host_communicator(int size, shared_ptr_class<kvs_interface> kvs);
+    host_communicator(int size, int rank, shared_ptr_class<kvs_interface> kvs);
+    host_communicator(std::shared_ptr<atl_wrapper> atl);
     host_communicator(std::shared_ptr<ccl_comm> impl);
     host_communicator(host_communicator& src) = delete;
     host_communicator(host_communicator&& src) = default;
@@ -192,11 +140,11 @@ class host_communicator : public ccl::communicator_interface {
     friend struct group_context;
     std::shared_ptr<ccl_comm> comm_impl;
     ccl::comm_split_attr comm_attr;
-    size_t comm_rank;
-    size_t comm_size;
+    int comm_rank;
+    int comm_size;
     ccl::group_unique_key owner_id;
     // ccl::unified_device_type device;
-    // ccl::unified_device_context_type context;
+    // ccl::unified_context_type context;
 
     host_communicator* get_impl() {
         return this;
diff --git a/src/common/comm/host_communicator/host_communicator_impl.hpp b/src/common/comm/host_communicator/host_communicator_impl.hpp
index 0afe613b3..d71640c00 100644
--- a/src/common/comm/host_communicator/host_communicator_impl.hpp
+++ b/src/common/comm/host_communicator/host_communicator_impl.hpp
@@ -28,19 +28,18 @@ namespace ccl {
 
 /* allgatherv */
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::allgatherv_impl(
-    const buffer_type* send_buf,
-    size_t send_count,
-    buffer_type* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
+                                              size_t send_count,
+                                              buffer_type* recv_buf,
+                                              const ccl::vector_class<size_t>& recv_counts,
+                                              const ccl::stream::impl_value_t& stream,
+                                              const ccl::allgatherv_attr& attr,
+                                              const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf),
                                            send_count,
                                            reinterpret_cast<void*>(recv_buf),
                                            recv_counts.data(),
-                                           ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                           ccl::native_type_info<buffer_type>::dtype,
                                            attr,
                                            comm_impl.get(),
                                            nullptr);
@@ -49,15 +48,13 @@ host_communicator::coll_request_t host_communicator::allgatherv_impl(
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::allgatherv_impl(
-    const buffer_type* send_buf,
-    size_t send_count,
-    ccl::vector_class<buffer_type*>& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-  
+ccl::event host_communicator::allgatherv_impl(const buffer_type* send_buf,
+                                              size_t send_count,
+                                              ccl::vector_class<buffer_type*>& recv_buf,
+                                              const ccl::vector_class<size_t>& recv_counts,
+                                              const ccl::stream::impl_value_t& stream,
+                                              const ccl::allgatherv_attr& attr,
+                                              const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
     internal_attr.vector_buf = 1;
 
@@ -65,7 +62,7 @@ host_communicator::coll_request_t host_communicator::allgatherv_impl(
                                            send_count,
                                            (void*)(recv_buf.data()),
                                            recv_counts.data(),
-                                           ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                           ccl::native_type_info<buffer_type>::dtype,
                                            internal_attr,
                                            comm_impl.get(),
                                            nullptr);
@@ -74,21 +71,20 @@ host_communicator::coll_request_t host_communicator::allgatherv_impl(
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::allgatherv_impl(
-    const buffer_type& send_buf,
-    size_t send_count,
-    buffer_type& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::allgatherv_impl(const buffer_type& send_buf,
+                                              size_t send_count,
+                                              buffer_type& recv_buf,
+                                              const ccl::vector_class<size_t>& recv_counts,
+                                              const ccl::stream::impl_value_t& stream,
+                                              const ccl::allgatherv_attr& attr,
+                                              const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::allgatherv_impl(
+ccl::event host_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -103,18 +99,17 @@ host_communicator::coll_request_t host_communicator::allgatherv_impl(
 
 /* allreduce */
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::allreduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::allreduce_impl(const buffer_type* send_buf,
+                                             buffer_type* recv_buf,
+                                             size_t count,
+                                             ccl::reduction reduction,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::allreduce_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_allreduce_impl(reinterpret_cast<const void*>(send_buf),
                                           reinterpret_cast<void*>(recv_buf),
                                           count,
-                                          ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                          ccl::native_type_info<buffer_type>::dtype,
                                           reduction,
                                           attr,
                                           comm_impl.get(),
@@ -124,14 +119,13 @@ host_communicator::coll_request_t host_communicator::allreduce_impl(
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::allreduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::allreduce_impl(const buffer_type& send_buf,
+                                             buffer_type& recv_buf,
+                                             size_t count,
+                                             ccl::reduction reduction,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::allreduce_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
@@ -139,17 +133,16 @@ host_communicator::coll_request_t host_communicator::allreduce_impl(
 
 /* alltoall */
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::alltoall_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::alltoall_impl(const buffer_type* send_buf,
+                                            buffer_type* recv_buf,
+                                            size_t count,
+                                            const ccl::stream::impl_value_t& stream,
+                                            const ccl::alltoall_attr& attr,
+                                            const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_alltoall_impl(reinterpret_cast<const void*>(send_buf),
                                          reinterpret_cast<void*>(recv_buf),
                                          count,
-                                         ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                         ccl::native_type_info<buffer_type>::dtype,
                                          attr,
                                          comm_impl.get(),
                                          nullptr);
@@ -158,33 +151,31 @@ host_communicator::coll_request_t host_communicator::alltoall_impl(
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::alltoall_impl(
-    const ccl::vector_class<buffer_type*>& send_buf,
-    const ccl::vector_class<buffer_type*>& recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::alltoall_impl(const ccl::vector_class<buffer_type*>& send_buf,
+                                            const ccl::vector_class<buffer_type*>& recv_buf,
+                                            size_t count,
+                                            const ccl::stream::impl_value_t& stream,
+                                            const ccl::alltoall_attr& attr,
+                                            const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::alltoall_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::alltoall_impl(const buffer_type& send_buf,
+                                            buffer_type& recv_buf,
+                                            size_t count,
+                                            const ccl::stream::impl_value_t& stream,
+                                            const ccl::alltoall_attr& attr,
+                                            const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::alltoall_impl(
+ccl::event host_communicator::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -198,19 +189,18 @@ host_communicator::coll_request_t host_communicator::alltoall_impl(
 
 /* alltoallv */
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::alltoallv_impl(
-    const buffer_type* send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    buffer_type* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::alltoallv_impl(const buffer_type* send_buf,
+                                             const ccl::vector_class<size_t>& send_counts,
+                                             buffer_type* recv_buf,
+                                             const ccl::vector_class<size_t>& recv_counts,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::alltoallv_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_alltoallv_impl(reinterpret_cast<const void*>(send_buf),
                                           send_counts.data(),
                                           reinterpret_cast<void*>(recv_buf),
                                           recv_counts.data(),
-                                          ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                          ccl::native_type_info<buffer_type>::dtype,
                                           attr,
                                           comm_impl.get(),
                                           nullptr);
@@ -219,34 +209,32 @@ host_communicator::coll_request_t host_communicator::alltoallv_impl(
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::alltoallv_impl(
-    const ccl::vector_class<buffer_type*>& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    const ccl::vector_class<buffer_type*>& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& dep) {
+ccl::event host_communicator::alltoallv_impl(const ccl::vector_class<buffer_type*>& send_buf,
+                                             const ccl::vector_class<size_t>& send_counts,
+                                             const ccl::vector_class<buffer_type*>& recv_buf,
+                                             const ccl::vector_class<size_t>& recv_counts,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::alltoallv_attr& attr,
+                                             const ccl::vector_class<ccl::event>& dep) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::alltoallv_impl(
-    const buffer_type& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    buffer_type& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::alltoallv_impl(const buffer_type& send_buf,
+                                             const ccl::vector_class<size_t>& send_counts,
+                                             buffer_type& recv_buf,
+                                             const ccl::vector_class<size_t>& recv_counts,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::alltoallv_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::alltoallv_impl(
+ccl::event host_communicator::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -261,16 +249,15 @@ host_communicator::coll_request_t host_communicator::alltoallv_impl(
 
 /* bcast */
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::broadcast_impl(
-    buffer_type* buf,
-    size_t count,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::broadcast_impl(buffer_type* buf,
+                                             size_t count,
+                                             int root,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::broadcast_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_broadcast_impl(reinterpret_cast<void*>(buf),
                                           count,
-                                          ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                          ccl::native_type_info<buffer_type>::dtype,
                                           root,
                                           attr,
                                           comm_impl.get(),
@@ -280,13 +267,12 @@ host_communicator::coll_request_t host_communicator::broadcast_impl(
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::broadcast_impl(
-    buffer_type& buf,
-    size_t count,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::broadcast_impl(buffer_type& buf,
+                                             size_t count,
+                                             int root,
+                                             const ccl::stream::impl_value_t& stream,
+                                             const ccl::broadcast_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
@@ -294,19 +280,18 @@ host_communicator::coll_request_t host_communicator::broadcast_impl(
 
 /* reduce */
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::reduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::reduce_impl(const buffer_type* send_buf,
+                                          buffer_type* recv_buf,
+                                          size_t count,
+                                          ccl::reduction reduction,
+                                          int root,
+                                          const ccl::stream::impl_value_t& stream,
+                                          const ccl::reduce_attr& attr,
+                                          const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_reduce_impl(reinterpret_cast<const void*>(send_buf),
                                        reinterpret_cast<void*>(recv_buf),
                                        count,
-                                       ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                       ccl::native_type_info<buffer_type>::dtype,
                                        reduction,
                                        root,
                                        attr,
@@ -317,15 +302,14 @@ host_communicator::coll_request_t host_communicator::reduce_impl(
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::reduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::reduce_impl(const buffer_type& send_buf,
+                                          buffer_type& recv_buf,
+                                          size_t count,
+                                          ccl::reduction reduction,
+                                          int root,
+                                          const ccl::stream::impl_value_t& stream,
+                                          const ccl::reduce_attr& attr,
+                                          const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
@@ -333,35 +317,33 @@ host_communicator::coll_request_t host_communicator::reduce_impl(
 
 /* reduce_scatter */
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::reduce_scatter_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t recv_count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_scatter_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::reduce_scatter_impl(const buffer_type* send_buf,
+                                                  buffer_type* recv_buf,
+                                                  size_t recv_count,
+                                                  ccl::reduction reduction,
+                                                  const ccl::stream::impl_value_t& stream,
+                                                  const ccl::reduce_scatter_attr& attr,
+                                                  const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_reduce_scatter_impl(reinterpret_cast<const void*>(send_buf),
-                                       reinterpret_cast<void*>(recv_buf),
-                                       recv_count,
-                                       ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                                       reduction,
-                                       attr,
-                                       comm_impl.get(),
-                                       nullptr);
+                                               reinterpret_cast<void*>(recv_buf),
+                                               recv_count,
+                                               ccl::native_type_info<buffer_type>::dtype,
+                                               reduction,
+                                               attr,
+                                               comm_impl.get(),
+                                               nullptr);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
 template <class buffer_type>
-host_communicator::coll_request_t host_communicator::reduce_scatter_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t recv_count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_scatter_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::reduce_scatter_impl(const buffer_type& send_buf,
+                                                  buffer_type& recv_buf,
+                                                  size_t recv_count,
+                                                  ccl::reduction reduction,
+                                                  const ccl::stream::impl_value_t& stream,
+                                                  const ccl::reduce_scatter_attr& attr,
+                                                  const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
@@ -369,52 +351,49 @@ host_communicator::coll_request_t host_communicator::reduce_scatter_impl(
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-host_communicator::coll_request_t host_communicator::sparse_allreduce_impl(
-    const index_buffer_type* send_ind_buf,
-    size_t send_ind_count,
-    const value_buffer_type* send_val_buf,
-    size_t send_val_count,
-    index_buffer_type* recv_ind_buf,
-    size_t recv_ind_count,
-    value_buffer_type* recv_val_buf,
-    size_t recv_val_count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::sparse_allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    ccl_request* req =
-        ccl_sparse_allreduce_impl((const void*)send_ind_buf,
-                                  send_ind_count,
-                                  (const void*)send_val_buf,
-                                  send_val_count,
-                                  (void*)recv_ind_buf,
-                                  recv_ind_count,
-                                  (void*)recv_val_buf,
-                                  recv_val_count,
-                                  ccl::native_type_info<index_buffer_type>::ccl_datatype_value,
-                                  ccl::native_type_info<value_buffer_type>::ccl_datatype_value,
-                                  reduction,
-                                  attr,
-                                  comm_impl.get(),
-                                  nullptr);
+ccl::event host_communicator::sparse_allreduce_impl(const index_buffer_type* send_ind_buf,
+                                                    size_t send_ind_count,
+                                                    const value_buffer_type* send_val_buf,
+                                                    size_t send_val_count,
+                                                    index_buffer_type* recv_ind_buf,
+                                                    size_t recv_ind_count,
+                                                    value_buffer_type* recv_val_buf,
+                                                    size_t recv_val_count,
+                                                    ccl::reduction reduction,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::sparse_allreduce_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
+    ccl_request* req = ccl_sparse_allreduce_impl((const void*)send_ind_buf,
+                                                 send_ind_count,
+                                                 (const void*)send_val_buf,
+                                                 send_val_count,
+                                                 (void*)recv_ind_buf,
+                                                 recv_ind_count,
+                                                 (void*)recv_val_buf,
+                                                 recv_val_count,
+                                                 ccl::native_type_info<index_buffer_type>::dtype,
+                                                 ccl::native_type_info<value_buffer_type>::dtype,
+                                                 reduction,
+                                                 attr,
+                                                 comm_impl.get(),
+                                                 nullptr);
 
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-host_communicator::coll_request_t host_communicator::sparse_allreduce_impl(
-    const index_buffer_container_type& send_ind_buf,
-    size_t send_ind_count,
-    const value_buffer_container_type& send_val_buf,
-    size_t send_val_count,
-    index_buffer_container_type& recv_ind_buf,
-    size_t recv_ind_count,
-    value_buffer_container_type& recv_val_buf,
-    size_t recv_val_count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::sparse_allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event host_communicator::sparse_allreduce_impl(const index_buffer_container_type& send_ind_buf,
+                                                    size_t send_ind_count,
+                                                    const value_buffer_container_type& send_val_buf,
+                                                    size_t send_val_count,
+                                                    index_buffer_container_type& recv_ind_buf,
+                                                    size_t recv_ind_count,
+                                                    value_buffer_container_type& recv_val_buf,
+                                                    size_t recv_val_count,
+                                                    ccl::reduction reduction,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::sparse_allreduce_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     // TODO not implemented
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
diff --git a/src/common/comm/l0/comm_context.cpp b/src/common/comm/l0/comm_context.cpp
index 1ef30e87e..196079c12 100644
--- a/src/common/comm/l0/comm_context.cpp
+++ b/src/common/comm/l0/comm_context.cpp
@@ -13,7 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_aliases.hpp"
+#include "oneapi/ccl/aliases.hpp"
 #include "common/comm/host_communicator/host_communicator.hpp"
 #include "common/comm/l0/comm_context_impl.hpp"
 #include "common/utils/spinlock.hpp"
@@ -21,10 +21,10 @@
 
 namespace ccl {
 comm_group::comm_group(shared_communicator_t parent_comm,
-                       size_t threads_count,
-                       size_t on_process_ranks_count,
+                       size_t threads_per_process,
+                       size_t ranks_per_process,
                        group_unique_key id)
-        : pimpl(new gpu_comm_attr(parent_comm, threads_count, on_process_ranks_count, id)){};
+        : pimpl(new gpu_comm_attr(parent_comm, threads_per_process, ranks_per_process, id)){};
 
 bool comm_group::sync_group_size(size_t device_group_size) {
     return pimpl->sync_group_size(device_group_size);
@@ -42,10 +42,17 @@ std::string comm_group::to_string() const
 }*/
 } // namespace ccl
 // container-based method force-instantiation will trigger ALL other methods instantiations
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::vector_class<ccl::device_index_type>, typename ccl::unified_device_context_type::ccl_native_t);
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::list_class<ccl::device_index_type>, typename ccl::unified_device_context_type::ccl_native_t);
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::device_indices_t, typename ccl::unified_device_context_type::ccl_native_t);
-COMM_CREATOR_INDEXED_INSTANTIATION_TYPE(ccl::device_index_type, typename ccl::unified_device_context_type::ccl_native_t);
+COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::vector_class<ccl::device_index_type>,
+                                             typename ccl::unified_context_type::ccl_native_t);
+COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::list_class<ccl::device_index_type>,
+                                             typename ccl::unified_context_type::ccl_native_t);
+COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::device_indices_type,
+                                             typename ccl::unified_context_type::ccl_native_t);
+COMM_CREATOR_INDEXED_INSTANTIATION_TYPE(ccl::device_index_type,
+                                        typename ccl::unified_context_type::ccl_native_t);
 
-COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(ccl::vector_class<typename ccl::unified_device_type::ccl_native_t>, typename ccl::unified_device_context_type::ccl_native_t);
-COMM_CREATOR_INDEXED_INSTANTIATION_TYPE(typename ccl::unified_device_type::ccl_native_t, typename ccl::unified_device_context_type::ccl_native_t);
+COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(
+    ccl::vector_class<typename ccl::unified_device_type::ccl_native_t>,
+    typename ccl::unified_context_type::ccl_native_t);
+COMM_CREATOR_INDEXED_INSTANTIATION_TYPE(typename ccl::unified_device_type::ccl_native_t,
+                                        typename ccl::unified_context_type::ccl_native_t);
diff --git a/src/common/comm/l0/comm_context.hpp b/src/common/comm/l0/comm_context.hpp
index 68c795f45..070e75ab9 100644
--- a/src/common/comm/l0/comm_context.hpp
+++ b/src/common/comm/l0/comm_context.hpp
@@ -13,33 +13,32 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_aliases.hpp"
-#include "oneapi/ccl/ccl_device_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
+#include "oneapi/ccl/aliases.hpp"
+#include "oneapi/ccl/device_types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
 
-#include "oneapi/ccl/ccl_coll_attr_ids.hpp"
-#include "oneapi/ccl/ccl_coll_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_coll_attr.hpp"
+#include "oneapi/ccl/coll_attr_ids.hpp"
+#include "oneapi/ccl/coll_attr_ids_traits.hpp"
+#include "oneapi/ccl/coll_attr.hpp"
 
-#include "common/event/event_internal/event_internal_attr_ids.hpp"
-#include "common/event/event_internal/event_internal_attr_ids_traits.hpp"
-#include "common/event/event_internal/event_internal.hpp"
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/stream.hpp"
 
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_stream.hpp"
-
-#include "oneapi/ccl/ccl_event.hpp"
-#include "oneapi/ccl/ccl_communicator.hpp"
+#include "oneapi/ccl/event.hpp"
+#include "oneapi/ccl/communicator.hpp"
 
 #include "common/comm/l0/comm_context_id.hpp"
 #include "common/comm/comm_interface.hpp"
 
 namespace ccl {
+namespace detail {
+class environment;
+}
 
 class host_communicator;
 struct gpu_comm_attr;
@@ -47,10 +46,10 @@ using shared_communicator_t = std::shared_ptr<host_communicator>;
 
 class comm_group {
 public:
-    friend class environment;
+    friend class ccl::detail::environment;
     friend struct group_context;
 
-    using context_t = typename unified_device_context_type::ccl_native_t;
+    using context_t = typename unified_context_type::ccl_native_t;
 
     ~comm_group();
     /**
@@ -61,9 +60,10 @@ class comm_group {
               typename std::enable_if<not std::is_same<typename std::remove_cv<DeviceType>::type,
                                                        ccl::device_index_type>::value,
                                       int>::type = 0>
-    ccl::communicator_interface_ptr create_communicator_from_group(const DeviceType& device,
-                                        ContextType& context,
-                                        const comm_split_attr& attr = ccl_empty_attr());
+    ccl::communicator_interface_ptr create_communicator_from_group(
+        const DeviceType& device,
+        const ContextType& context,
+        const comm_split_attr& attr = ccl_empty_attr());
 
     /**
      * Device Communicator creation API: single communicator creation, based on index @device_id
@@ -73,9 +73,10 @@ class comm_group {
               typename std::enable_if<std::is_same<typename std::remove_cv<DeviceType>::type,
                                                    ccl::device_index_type>::value,
                                       int>::type = 0>
-    ccl::communicator_interface_ptr create_communicator_from_group(const DeviceType& device_id,
-                                        ContextType& context,
-                                        const comm_split_attr& attr = ccl_empty_attr());
+    ccl::communicator_interface_ptr create_communicator_from_group(
+        const DeviceType& device_id,
+        const ContextType& context,
+        const comm_split_attr& attr = ccl_empty_attr());
 
     /**
      * Device Communicator creation vectorized API:
@@ -84,7 +85,7 @@ class comm_group {
     template <class InputIt, class ContextType>
     std::vector<communicator> create_communicators_group(InputIt first,
                                                          InputIt last,
-                                                         ContextType& context,
+                                                         const ContextType& context,
                                                          comm_split_attr attr = ccl_empty_attr());
 
     /**
@@ -93,13 +94,13 @@ class comm_group {
      */
     template <template <class...> class Container, class Type, class ContextType>
     std::vector<communicator> create_communicators_group(const Container<Type>& device_ids,
-                                                         ContextType& context,
+                                                         const ContextType& context,
                                                          comm_split_attr attr = ccl_empty_attr());
 
     /**
      * Return device context allocated during group creation
      */
-    //device_context_native_const_reference_t get_context() const;
+    //context_native_const_reference_t get_context() const;
 
     bool sync_group_size(size_t device_group_size);
     /*
diff --git a/src/common/comm/l0/comm_context_impl.hpp b/src/common/comm/l0/comm_context_impl.hpp
index b1b18abf0..32e7449a0 100644
--- a/src/common/comm/l0/comm_context_impl.hpp
+++ b/src/common/comm/l0/comm_context_impl.hpp
@@ -13,8 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_kvs.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/kvs.hpp"
 #include "common/log/log.hpp"
 #include "common/comm/host_communicator/host_communicator.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
@@ -32,11 +32,12 @@ template <class DeviceType,
                                   int>::type>
 ccl::communicator_interface_ptr ccl::comm_group::create_communicator_from_group(
     const DeviceType& device,
-    ContextType& context,
+    const ContextType& context,
     const ccl::comm_split_attr& attr /* = comm_device_attr_t()*/) {
 #ifdef CCL_ENABLE_SYCL
-    static_assert(std::is_same<DeviceType, cl::sycl::device>::value,
-                  "ccl::comm_group::create_communicator_from_group() - supports SYCL devices at now");
+    static_assert(
+        std::is_same<DeviceType, cl::sycl::device>::value,
+        "ccl::comm_group::create_communicator_from_group() - supports SYCL devices at now");
 #endif
 
     ccl::communicator_interface_ptr impl;
@@ -50,10 +51,13 @@ ccl::communicator_interface_ptr ccl::comm_group::create_communicator_from_group(
         LOG_TRACE("Create single device communicator from SYCL device");
         //TODO
         ccl::comm_split_attr single_dev_attr = attr;
-        single_dev_attr.set<ccl::comm_split_attr_id::group>(
-            ccl::group_split_type::undetermined);
-        impl = ccl::communicator_interface::create_communicator_impl(
-            device, context, host_comm->rank(), host_comm->size(), single_dev_attr, host_comm->get_atl());
+        single_dev_attr.set<ccl::comm_split_attr_id::group>(ccl::split_group::cluster);
+        impl = ccl::communicator_interface::create_communicator_impl(device,
+                                                                     context,
+                                                                     host_comm->rank(),
+                                                                     host_comm->size(),
+                                                                     single_dev_attr,
+                                                                     host_comm->get_atl());
     }
     else {
         // multiple device case
@@ -73,7 +77,7 @@ template <class DeviceType,
                                   int>::type>
 ccl::communicator_interface_ptr ccl::comm_group::create_communicator_from_group(
     const DeviceType& device_id,
-    ContextType& context,
+    const ContextType& context,
     const ccl::comm_split_attr& attr /* = nullptr*/) {
     LOG_TRACE("Create communicator from id: ", device_id);
     auto host_comm = pimpl->get_host_communicator();
@@ -92,7 +96,7 @@ template <class InputIt, class ContextType>
 std::vector<ccl::communicator> ccl::comm_group::create_communicators_group(
     InputIt first,
     InputIt last,
-    ContextType& context,
+    const ContextType& context,
     ccl::comm_split_attr attr /* = nullptr*/) {
     /*
     static_assert(not std::is_same<InputIt, typename ccl::vector_class<cl::sycl::device>::const_iterator>::value, "SYCL");
@@ -108,17 +112,21 @@ std::vector<ccl::communicator> ccl::comm_group::create_communicators_group(
 
     std::vector<ccl::communicator> comms;
     comms.reserve(indices_count);
-    std::transform(
-        first, last, std::back_inserter(comms), [this, attr, &context](const iterator_value_type& device_id) {
-            return ccl::communicator(create_communicator_from_group<iterator_value_type, ContextType>(device_id, context, attr));
-        });
+    std::transform(first,
+                   last,
+                   std::back_inserter(comms),
+                   [this, attr, &context](const iterator_value_type& device_id) {
+                       return ccl::communicator(
+                           create_communicator_from_group<iterator_value_type, ContextType>(
+                               device_id, context, attr));
+                   });
     return comms;
 }
 
 template <template <class...> class Container, class Type, class ContextType>
 std::vector<ccl::communicator> ccl::comm_group::create_communicators_group(
     const Container<Type>& device_ids,
-    ContextType& context,
+    const ContextType& context,
     ccl::comm_split_attr attr /* = nullptr*/) {
     //static_assert(not std::is_same<Type, cl::sycl::device>::value, "SYCL cont");
     //static_assert(std::is_same<Type, ccl::device_index_type>::value, "Invalid Type in create_communicators");
@@ -129,10 +137,10 @@ std::vector<ccl::communicator> ccl::comm_group::create_communicators_group(
         device_ids.begin(), device_ids.end(), context, attr);
 }
 /*
- ccl::comm_group::device_context_native_const_reference_t ccl::comm_group::get_context() const
+ ccl::comm_group::context_native_const_reference_t ccl::comm_group::get_context() const
 {
     //TODO use PIMPL as context provider
-    static unified_device_context_type context;
+    static unified_context_type context;
     return context.get();
 }
 */
@@ -141,8 +149,8 @@ std::vector<ccl::communicator> ccl::comm_group::create_communicators_group(
 /***************************************************************************************************/
 #define COMM_CREATOR_INDEXED_INSTANTIATION_CONTAINER(type, context_type) \
     template ccl::vector_class<ccl::communicator> ccl::comm_group::create_communicators_group( \
-        const type& devices, context_type& ctx, ccl::comm_split_attr attr);
+        const type& devices, const context_type& ctx, ccl::comm_split_attr attr);
 
 #define COMM_CREATOR_INDEXED_INSTANTIATION_TYPE(type, context_type) \
     template ccl::communicator_interface_ptr ccl::comm_group::create_communicator_from_group( \
-        const type& device, context_type& context, const ccl::comm_split_attr& attr);
+        const type& device, const context_type& context, const ccl::comm_split_attr& attr);
diff --git a/src/common/comm/l0/comm_context_storage.cpp b/src/common/comm/l0/comm_context_storage.cpp
index e9981ac7c..80c72748b 100644
--- a/src/common/comm/l0/comm_context_storage.cpp
+++ b/src/common/comm/l0/comm_context_storage.cpp
@@ -28,21 +28,33 @@ group_context& group_context::instance() {
 }
 
 group_context::comm_group_t group_context::group_by_kvs(
-    const std::vector<size_t>& local_thread_device_group_ranks,
-    size_t cluster_device_group_size,
+    const std::vector<int>& local_thread_device_group_ranks,
+    int cluster_device_group_size,
     std::shared_ptr<kvs_interface> kvs) {
-    //TODO
-    static ccl_comm_id_storage::comm_id TODO_TMP_ID = ccl::global_data::get().comm_ids->acquire();
+    LOG_INFO("Thread acquire by barrier");
+    std::shared_ptr<ikvs_wrapper> kvs_wrap = std::shared_ptr<ikvs_wrapper>(new users_kvs(kvs));
+    std::shared_ptr<atl_wrapper> atl = std::shared_ptr<atl_wrapper>(
+        new atl_wrapper(cluster_device_group_size, local_thread_device_group_ranks, kvs_wrap));
 
-    //barrier operation acquire: wait while all threads from all processes enters here...
-    std::shared_ptr<host_communicator> host_comm =
-        std::make_shared<host_communicator>(std::make_shared<ccl_comm>(
-            local_thread_device_group_ranks, cluster_device_group_size, kvs, TODO_TMP_ID.clone()));
-    //barrier operation release: every threads continue its execution here...
-    LOG_INFO("Thread released by barrier");
+    /* Report that multiple devices are not supported. Do not throw if the kernel_path env variable
+     * is set, so that testing with partial functionality remains possible.
+     * Most cases are handled in communicator_impl_details.hpp; here we check the case of
+     * multiple threads with one device each, because the total number of ranks in the
+     * process is not known until the threads have been synchronized above. */
+    if (atl->get_ranks_per_process() > 1 && ccl::global_data::env().kernel_path.empty()) {
+        throw ccl::unimplemented("API", "create_communicators", "for multiple devices");
+    }
 
+    LOG_INFO("Thread released by barrier");
+    LOG_INFO("Cluster_device_group size: ",
+             cluster_device_group_size,
+             "\nThread device group ranks size: ",
+             local_thread_device_group_ranks.size());
+    for (size_t i = 0; i < local_thread_device_group_ranks.size(); i++) {
+        LOG_INFO("\nLocal thread device group ranks: ", local_thread_device_group_ranks[i]);
+    }
     // register group slot in global context table, based on communicator id
-    comm_group_t group = group_context::group_by_comm(host_comm);
+    comm_group_t group = group_context::group_by_comm(atl);
 
     // sync existing group: blocking operation - wait for all groups
     LOG_INFO("group thread barrier acquired: ", static_cast<void*>(group.get()));
@@ -51,27 +63,43 @@ group_context::comm_group_t group_context::group_by_kvs(
     return group;
 }
 
-group_context::comm_group_t group_context::group_by_comm(shared_communicator_t host_comm) {
-    group_context::group_unique_key unique_id = host_comm->comm_impl->id();
-    size_t threads_count = host_comm->comm_impl->thread_count();
-    size_t on_process_ranks_count = host_comm->comm_impl->on_process_ranks_count();
+group_context::comm_group_t group_context::group_by_comm(std::shared_ptr<atl_wrapper> atl) {
+    LOG_INFO("\n",
+             "\nATL info:",
+             "\n  threads per process: ",
+             atl->get_threads_per_process(),
+             "\n  ranks per process:   ",
+             atl->get_ranks_per_process(),
+             "\n  atl size:            ",
+             atl->get_size(),
+             "\n  rank:                ",
+             atl->get_rank(),
+             "\n  unique id of atl:     ",
+             atl->get_id(),
+             "\n")
 
     comm_group_t group;
     {
+        // mutex
         std::unique_lock<ccl_spinlock> lock(mutex);
+        size_t threads_per_process = atl->get_threads_per_process();
+        size_t ranks_per_process = atl->get_ranks_per_process();
+        group_context::group_unique_key unique_id = atl->get_id();
+
         auto ctx_it = communicator_group_map.find(unique_id);
         if (ctx_it == communicator_group_map.end()) {
+            std::shared_ptr<host_communicator> host_comm = std::make_shared<host_communicator>(atl);
             group.reset(
-                new ccl::comm_group(host_comm, threads_count, on_process_ranks_count, unique_id));
+                new ccl::comm_group(host_comm, threads_per_process, ranks_per_process, unique_id));
             communicator_group_map.insert({ unique_id, group });
             LOG_INFO("Comm group: ",
                      static_cast<void*>(group.get()),
                      " has been created for unique_id: ",
                      unique_id,
-                     ", expected thread count: ",
-                     threads_count,
-                     ", on process rank count: ",
-                     on_process_ranks_count);
+                     ", threads per process: ",
+                     threads_per_process,
+                     ", ranks per process: ",
+                     ranks_per_process);
         }
         else {
             group = ctx_it->second;
diff --git a/src/common/comm/l0/comm_context_storage.hpp b/src/common/comm/l0/comm_context_storage.hpp
index d95d903b5..37b16e2cf 100644
--- a/src/common/comm/l0/comm_context_storage.hpp
+++ b/src/common/comm/l0/comm_context_storage.hpp
@@ -19,11 +19,15 @@
 
 #include "common/utils/spinlock.hpp"
 #include "common/comm/atl_tag.hpp"
+#include "atl/atl_wrapper.h"
 
 namespace ccl {
+namespace v1 {
+class kvs_interface;
+}
+
 class host_communicator;
 class comm_group;
-class kvs_interface;
 
 struct group_context {
     /* TODO
@@ -41,10 +45,10 @@ struct group_context {
     std::map<group_unique_key, comm_group_t> communicator_group_map;
     ccl_spinlock mutex;
 
-    comm_group_t group_by_kvs(const std::vector<size_t>& local_thread_device_group_ranks,
-                              size_t cluster_device_group_size,
+    comm_group_t group_by_kvs(const std::vector<int>& local_thread_device_group_ranks,
+                              int cluster_device_group_size,
                               std::shared_ptr<kvs_interface> kvs);
-    comm_group_t group_by_comm(std::shared_ptr<host_communicator> host_comm);
+    comm_group_t group_by_comm(std::shared_ptr<atl_wrapper> atl);
     comm_group_t get_existing_group_by_id(const group_unique_key& id);
     static group_context& instance();
 
diff --git a/src/common/comm/l0/communicator/base_communicator.hpp b/src/common/comm/l0/communicator/base_communicator.hpp
index 71818aec5..ef315d4a1 100644
--- a/src/common/comm/l0/communicator/base_communicator.hpp
+++ b/src/common/comm/l0/communicator/base_communicator.hpp
@@ -24,7 +24,7 @@ struct base_communicator : public ccl::communicator_interface {
     //TODO using group_comm_storage = native::specific_indexed_device_storage;
 
     base_communicator(ccl::unified_device_type&& owned_device,
-                      ccl::unified_device_context_type&& owned_ctx,
+                      ccl::unified_context_type&& owned_ctx,
                       size_t thread_idx,
                       size_t process_idx,
                       const ccl::comm_split_attr& attr)
@@ -41,11 +41,11 @@ struct base_communicator : public ccl::communicator_interface {
 
     virtual ~base_communicator() = default;
 
-    size_t rank() const override {
+    int rank() const override {
         return comm_rank;
     }
 
-    size_t size() const override {
+    int size() const override {
         return comm_size;
     }
 
@@ -53,11 +53,11 @@ struct base_communicator : public ccl::communicator_interface {
         return device.get_id();
     }
 
-    ccl::communicator_interface::device_t get_device() override {
+    ccl::communicator_interface::device_t get_device() const override {
         return device.get();
     }
 
-    ccl::communicator_interface::context_t get_context() override {
+    ccl::communicator_interface::context_t get_context() const override {
         return context.get();
     }
 
@@ -84,14 +84,14 @@ struct base_communicator : public ccl::communicator_interface {
     }
 */
     ccl::unified_device_type device;
-    ccl::unified_device_context_type context;
+    ccl::unified_context_type context;
     size_t thread_id;
     size_t process_id;
     const ccl::comm_split_attr comm_attr;
 
     //TODO add context_comm_addr to aggregate device_id,thread_id, process_id & ranks
-    size_t comm_rank;
-    size_t comm_size;
+    int comm_rank;
+    int comm_size;
 
     mutable ccl_spinlock ready_mutex;
 
diff --git a/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp b/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
index e2541aaa5..286ef7802 100644
--- a/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
+++ b/src/common/comm/l0/communicator/device_group/device_a2a_communicator.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
 #include "common/comm/l0/context/thread_group_ctx.hpp"
@@ -22,13 +22,13 @@
 
 using namespace ccl;
 
-device_group_a2a_communicator::device_group_a2a_communicator(
-    ccl::unified_device_type&& device,
-    ccl::unified_device_context_type&& ctx,
-    size_t thread_idx,
-    size_t process_idx,
-    const ccl::comm_split_attr& attr)
-        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx /*, comm_attr*/, attr) {}
+device_group_a2a_communicator::device_group_a2a_communicator(ccl::unified_device_type&& device,
+                                                             ccl::unified_context_type&& ctx,
+                                                             size_t thread_idx,
+                                                             size_t process_idx,
+                                                             const ccl::comm_split_attr& attr)
+        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx /*, comm_attr*/, attr) {
+}
 
 void device_group_a2a_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     auto process_ctx = comm_attr.get_process_context();
@@ -44,15 +44,14 @@ void device_group_a2a_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     this->set_comm_group_id(comm_attr.get_unique_id());
 }
 
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::barrier(
-    const ccl::stream::impl_value_t& stream,
-    const ccl::barrier_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_a2a_communicator::barrier(const ccl::stream::impl_value_t& stream,
+                                                  const ccl::barrier_attr& attr,
+                                                  const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented yet");
 }
 
 /* allgatherv */
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allgatherv_impl(
+ccl::event device_group_a2a_communicator::allgatherv_impl(
     const void* send_buf,
     size_t send_count,
     void* recv_buf,
@@ -64,7 +63,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allgatherv_impl(
+ccl::event device_group_a2a_communicator::allgatherv_impl(
     const void* send_buf,
     size_t send_count,
     const ccl::vector_class<void*>& recv_bufs,
@@ -78,7 +77,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 }
 
 /* allreduce */
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allreduce_impl(
+ccl::event device_group_a2a_communicator::allreduce_impl(
     const void* send_buf,
     void* recv_buf,
     size_t count,
@@ -92,31 +91,29 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 }
 
 /* alltoall */
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoall_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_a2a_communicator::alltoall_impl(const void* send_buf,
+                                                        void* recv_buf,
+                                                        size_t count,
+                                                        ccl::datatype dtype,
+                                                        const ccl::stream::impl_value_t& stream,
+                                                        const ccl::alltoall_attr& attr,
+                                                        const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoall_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<void*>& recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_a2a_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf,
+                                                        const ccl::vector_class<void*>& recv_buf,
+                                                        size_t count,
+                                                        ccl::datatype dtype,
+                                                        const ccl::stream::impl_value_t& stream,
+                                                        const ccl::alltoall_attr& attr,
+                                                        const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoallv */
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoallv_impl(
+ccl::event device_group_a2a_communicator::alltoallv_impl(
     const void* send_buf,
     const ccl::vector_class<size_t>& send_counts,
     void* recv_buf,
@@ -128,7 +125,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoallv_impl(
+ccl::event device_group_a2a_communicator::alltoallv_impl(
     const ccl::vector_class<void*>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     ccl::vector_class<void*> recv_buf,
@@ -142,11 +139,11 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 }
 
 /* bcast */
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::broadcast_impl(
+ccl::event device_group_a2a_communicator::broadcast_impl(
     void* buf,
     size_t count,
     ccl::datatype dtype,
-    size_t root,
+    int root,
     const ccl::stream::impl_value_t& stream,
     const ccl::broadcast_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
@@ -155,22 +152,21 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::bro
 }
 
 /* reduce */
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::reduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_a2a_communicator::reduce_impl(const void* send_buf,
+                                                      void* recv_buf,
+                                                      size_t count,
+                                                      ccl::datatype dtype,
+                                                      ccl::reduction reduction,
+                                                      int root,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::reduce_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce_scatter */
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::reduce_scatter_impl(
+ccl::event device_group_a2a_communicator::reduce_scatter_impl(
     const void* send_buf,
     void* recv_buf,
     size_t recv_count,
@@ -184,7 +180,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::red
 }
 
 /* sparse_allreduce */
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::sparse_allreduce_impl(
+ccl::event device_group_a2a_communicator::sparse_allreduce_impl(
     const void* send_ind_buf,
     size_t send_ind_count,
     const void* send_val_buf,
@@ -203,129 +199,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::spa
     return {};
 }
 
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_a2a_communicator, char);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_a2a_communicator, int);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_a2a_communicator, int64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_a2a_communicator, uint64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_a2a_communicator, float);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_a2a_communicator, double);
-
-#ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_a2a_communicator,
-                                                cl::sycl::buffer<char COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_a2a_communicator,
-                                                cl::sycl::buffer<int COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_a2a_communicator,
-                                                cl::sycl::buffer<int64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_a2a_communicator,
-                                                cl::sycl::buffer<uint64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_a2a_communicator,
-                                                cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_a2a_communicator,
-                                                cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              char,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              char,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              char,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              char,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              char,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              char,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              char,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              int64_t,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              uint64_t);
-
+COMM_INTERFACE_COLL_INSTANTIATION(device_group_a2a_communicator);
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    device_group_a2a_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    device_group_a2a_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    device_group_a2a_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    device_group_a2a_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(device_group_a2a_communicator);
+#endif /* CCL_ENABLE_SYCL */
diff --git a/src/common/comm/l0/communicator/device_group/device_a2a_communicator.hpp b/src/common/comm/l0/communicator/device_group/device_a2a_communicator.hpp
index cc6af49c9..2889124a1 100644
--- a/src/common/comm/l0/communicator/device_group/device_a2a_communicator.hpp
+++ b/src/common/comm/l0/communicator/device_group/device_a2a_communicator.hpp
@@ -20,35 +20,32 @@ namespace native {
 struct device_group_context;
 }
 
-class device_group_a2a_communicator
-        : public typed_base_communicator<device_group_a2a_communicator,
-                                         ccl::group_split_type::thread,
-                                         ccl::device_topology_type::a2a,
-                                         ccl::gpu_communicator_traits> {
+class device_group_a2a_communicator : public typed_base_communicator<device_group_a2a_communicator,
+                                                                     ccl::group_split_type::thread,
+                                                                     ccl::device_topology_type::a2a,
+                                                                     ccl::gpu_communicator_traits> {
 public:
     using base_t = typed_base_communicator<device_group_a2a_communicator,
                                            ccl::group_split_type::thread,
                                            ccl::device_topology_type::a2a,
                                            ccl::gpu_communicator_traits>;
 
-    using coll_request_t = ccl::event;
-
     device_group_a2a_communicator(ccl::unified_device_type&& device,
-                                  ccl::unified_device_context_type&& ctx,
+                                  ccl::unified_context_type&& ctx,
                                   size_t thread_idx,
                                   size_t proces_idx,
                                   const ccl::comm_split_attr& attr);
 
     void visit(ccl::gpu_comm_attr& comm_attr) override;
 
-    coll_request_t barrier(const ccl::stream::impl_value_t& stream,
-                           const ccl::barrier_attr& attr,
-                           const ccl::vector_class<ccl::event>& deps) override;
+    ccl::event barrier(const ccl::stream::impl_value_t& stream,
+                       const ccl::barrier_attr& attr,
+                       const ccl::vector_class<ccl::event>& deps) override;
 
-    DEVICE_COMM_IMPL_DECLARATION
-    DEVICE_COMM_IMPL_CLASS_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION
+    COMM_IMPL_DECLARATION
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
 
 private:
     std::shared_ptr<native::device_group_context> ctx;
diff --git a/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp
index 282db0e4d..a30937aea 100644
--- a/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/device_group/device_a2a_communicator_impl.hpp
@@ -25,7 +25,7 @@
 
 /* allgatherv */
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allgatherv_impl(
+ccl::event device_group_a2a_communicator::allgatherv_impl(
     const buffer_type* send_buf,
     size_t send_count,
     buffer_type* recv_buf,
@@ -37,7 +37,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
     return {};
 }
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allgatherv_impl(
+ccl::event device_group_a2a_communicator::allgatherv_impl(
     const buffer_type* send_buf,
     size_t send_count,
     ccl::vector_class<buffer_type*>& recv_buf,
@@ -50,7 +50,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 }
 
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allgatherv_impl(
+ccl::event device_group_a2a_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     buffer_type& recv_buf,
@@ -63,7 +63,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 }
 
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allgatherv_impl(
+ccl::event device_group_a2a_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -77,7 +77,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 
 /* allreduce */
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allreduce_impl(
+ccl::event device_group_a2a_communicator::allreduce_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t count,
@@ -94,7 +94,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
             "Device communicator for group_id: " + ::to_string(group_id) +
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
 
     //TODO make const!
@@ -157,7 +157,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 }
 
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::allreduce_impl(
+ccl::event device_group_a2a_communicator::allreduce_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t count,
@@ -171,18 +171,17 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 
 /* alltoall */
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoall_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_a2a_communicator::alltoall_impl(const buffer_type* send_buf,
+                                                        buffer_type* recv_buf,
+                                                        size_t count,
+                                                        const ccl::stream::impl_value_t& stream,
+                                                        const ccl::alltoall_attr& attr,
+                                                        const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoall_impl(
+ccl::event device_group_a2a_communicator::alltoall_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<buffer_type*>& recv_buf,
     size_t count,
@@ -194,18 +193,17 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 }
 
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoall_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_a2a_communicator::alltoall_impl(const buffer_type& send_buf,
+                                                        buffer_type& recv_buf,
+                                                        size_t count,
+                                                        const ccl::stream::impl_value_t& stream,
+                                                        const ccl::alltoall_attr& attr,
+                                                        const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoall_impl(
+ccl::event device_group_a2a_communicator::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -218,7 +216,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 
 /* alltoallv */
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoallv_impl(
+ccl::event device_group_a2a_communicator::alltoallv_impl(
     const buffer_type* send_buf,
     const ccl::vector_class<size_t>& send_counts,
     buffer_type* recv_buf,
@@ -230,7 +228,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
     return {};
 }
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoallv_impl(
+ccl::event device_group_a2a_communicator::alltoallv_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<buffer_type*>& recv_buf,
@@ -243,7 +241,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 }
 
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoallv_impl(
+ccl::event device_group_a2a_communicator::alltoallv_impl(
     const buffer_type& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     buffer_type& recv_buf,
@@ -255,7 +253,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
     return {};
 }
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::alltoallv_impl(
+ccl::event device_group_a2a_communicator::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -269,10 +267,10 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::all
 
 /* bcast */
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::broadcast_impl(
+ccl::event device_group_a2a_communicator::broadcast_impl(
     buffer_type* buf,
     size_t count,
-    size_t root,
+    int root,
     const ccl::stream::impl_value_t& stream,
     const ccl::broadcast_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
@@ -281,10 +279,10 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::bro
 }
 
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::broadcast_impl(
+ccl::event device_group_a2a_communicator::broadcast_impl(
     buffer_type& buf,
     size_t count,
-    size_t root,
+    int root,
     const ccl::stream::impl_value_t& stream,
     const ccl::broadcast_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
@@ -294,36 +292,34 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::bro
 
 /* reduce */
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::reduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_a2a_communicator::reduce_impl(const buffer_type* send_buf,
+                                                      buffer_type* recv_buf,
+                                                      size_t count,
+                                                      ccl::reduction reduction,
+                                                      int root,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::reduce_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::reduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_a2a_communicator::reduce_impl(const buffer_type& send_buf,
+                                                      buffer_type& recv_buf,
+                                                      size_t count,
+                                                      ccl::reduction reduction,
+                                                      int root,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::reduce_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce_scatter */
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::reduce_scatter_impl(
+ccl::event device_group_a2a_communicator::reduce_scatter_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t recv_count,
@@ -335,7 +331,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::red
     return {};
 }
 template <class buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::reduce_scatter_impl(
+ccl::event device_group_a2a_communicator::reduce_scatter_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t recv_count,
@@ -349,7 +345,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::red
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::sparse_allreduce_impl(
+ccl::event device_group_a2a_communicator::sparse_allreduce_impl(
     const index_buffer_type* send_ind_buf,
     size_t send_ind_count,
     const value_buffer_type* send_val_buf,
@@ -367,7 +363,7 @@ device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::spa
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-device_group_a2a_communicator::coll_request_t device_group_a2a_communicator::sparse_allreduce_impl(
+ccl::event device_group_a2a_communicator::sparse_allreduce_impl(
     const index_buffer_container_type& send_ind_buf,
     size_t send_ind_count,
     const value_buffer_container_type& send_val_buf,
diff --git a/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp b/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
index ee75203b6..4cff85f0a 100644
--- a/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
+++ b/src/common/comm/l0/communicator/device_group/device_ring_communicator.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
 #include "common/comm/l0/context/thread_group_ctx.hpp"
@@ -22,13 +22,13 @@
 
 using namespace ccl;
 
-device_group_ring_communicator::device_group_ring_communicator(
-    ccl::unified_device_type&& device,
-    ccl::unified_device_context_type&& ctx,
-    size_t thread_idx,
-    size_t process_idx,
-    const ccl::comm_split_attr& attr)
-        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx /*, comm_attr*/, attr) {}
+device_group_ring_communicator::device_group_ring_communicator(ccl::unified_device_type&& device,
+                                                               ccl::unified_context_type&& ctx,
+                                                               size_t thread_idx,
+                                                               size_t process_idx,
+                                                               const ccl::comm_split_attr& attr)
+        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx /*, comm_attr*/, attr) {
+}
 
 void device_group_ring_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     auto process_ctx = comm_attr.get_process_context();
@@ -45,15 +45,14 @@ void device_group_ring_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     this->set_comm_group_id(comm_attr.get_unique_id());
 }
 
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::barrier(
-    const ccl::stream::impl_value_t& stream,
-    const ccl::barrier_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_ring_communicator::barrier(const ccl::stream::impl_value_t& stream,
+                                                   const ccl::barrier_attr& attr,
+                                                   const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented yet");
 }
 
 /* allgatherv */
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allgatherv_impl(
+ccl::event device_group_ring_communicator::allgatherv_impl(
     const void* send_buf,
     size_t send_count,
     void* recv_buf,
@@ -65,7 +64,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allgatherv_impl(
+ccl::event device_group_ring_communicator::allgatherv_impl(
     const void* send_buf,
     size_t send_count,
     const ccl::vector_class<void*>& recv_bufs,
@@ -79,7 +78,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 }
 
 /* allreduce */
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allreduce_impl(
+ccl::event device_group_ring_communicator::allreduce_impl(
     const void* send_buf,
     void* recv_buf,
     size_t count,
@@ -93,7 +92,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 }
 
 /* alltoall */
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoall_impl(
+ccl::event device_group_ring_communicator::alltoall_impl(
     const void* send_buf,
     void* recv_buf,
     size_t count,
@@ -104,7 +103,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoall_impl(
+ccl::event device_group_ring_communicator::alltoall_impl(
     const ccl::vector_class<void*>& send_buf,
     const ccl::vector_class<void*>& recv_buf,
     size_t count,
@@ -117,7 +116,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 }
 
 /* alltoallv */
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoallv_impl(
+ccl::event device_group_ring_communicator::alltoallv_impl(
     const void* send_buf,
     const ccl::vector_class<size_t>& send_counts,
     void* recv_buf,
@@ -129,7 +128,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoallv_impl(
+ccl::event device_group_ring_communicator::alltoallv_impl(
     const ccl::vector_class<void*>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     ccl::vector_class<void*> recv_buf,
@@ -144,11 +143,11 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 }
 
 /* bcast */
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::broadcast_impl(
+ccl::event device_group_ring_communicator::broadcast_impl(
     void* buf,
     size_t count,
     ccl::datatype dtype,
-    size_t root,
+    int root,
     const ccl::stream::impl_value_t& stream,
     const ccl::broadcast_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
@@ -157,22 +156,21 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::b
 }
 
 /* reduce */
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::reduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_ring_communicator::reduce_impl(const void* send_buf,
+                                                       void* recv_buf,
+                                                       size_t count,
+                                                       ccl::datatype dtype,
+                                                       ccl::reduction reduction,
+                                                       int root,
+                                                       const ccl::stream::impl_value_t& stream,
+                                                       const ccl::reduce_attr& attr,
+                                                       const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce_scatter */
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::reduce_scatter_impl(
+ccl::event device_group_ring_communicator::reduce_scatter_impl(
     const void* send_buf,
     void* recv_buf,
     size_t recv_count,
@@ -186,148 +184,26 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::r
 }
 
 /* sparse_allreduce */
-device_group_ring_communicator::coll_request_t
-device_group_ring_communicator::sparse_allreduce_impl(const void* send_ind_buf,
-                                                      size_t send_ind_count,
-                                                      const void* send_val_buf,
-                                                      size_t send_val_count,
-                                                      void* recv_ind_buf,
-                                                      size_t recv_ind_count,
-                                                      void* recv_val_buf,
-                                                      size_t recv_val_count,
-                                                      ccl::datatype index_dtype,
-                                                      ccl::datatype value_dtype,
-                                                      ccl::reduction reduction,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::sparse_allreduce_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_ring_communicator::sparse_allreduce_impl(
+    const void* send_ind_buf,
+    size_t send_ind_count,
+    const void* send_val_buf,
+    size_t send_val_count,
+    void* recv_ind_buf,
+    size_t recv_ind_count,
+    void* recv_val_buf,
+    size_t recv_val_count,
+    ccl::datatype index_dtype,
+    ccl::datatype value_dtype,
+    ccl::reduction reduction,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::sparse_allreduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_ring_communicator, char);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_ring_communicator, int);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_ring_communicator, int64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_ring_communicator, uint64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_ring_communicator, float);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(device_group_ring_communicator, double);
-
+COMM_INTERFACE_COLL_INSTANTIATION(device_group_ring_communicator);
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_ring_communicator,
-                                                cl::sycl::buffer<char COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_ring_communicator,
-                                                cl::sycl::buffer<int COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_ring_communicator,
-                                                cl::sycl::buffer<int64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_ring_communicator,
-                                                cl::sycl::buffer<uint64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_ring_communicator,
-                                                cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(device_group_ring_communicator,
-                                                cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              char,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              char,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              char,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              char,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              char,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              char,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              char,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              int64_t,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              uint64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              uint64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              uint64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              uint64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              uint64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              uint64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(device_group_ring_communicator,
-                                                              uint64_t,
-                                                              uint64_t);
-
-#ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    device_group_ring_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    device_group_ring_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    device_group_ring_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    device_group_ring_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(device_group_ring_communicator);
+#endif /* CCL_ENABLE_SYCL */
diff --git a/src/common/comm/l0/communicator/device_group/device_ring_communicator.hpp b/src/common/comm/l0/communicator/device_group/device_ring_communicator.hpp
index cd50177dd..f9f6f1be2 100644
--- a/src/common/comm/l0/communicator/device_group/device_ring_communicator.hpp
+++ b/src/common/comm/l0/communicator/device_group/device_ring_communicator.hpp
@@ -31,24 +31,22 @@ class device_group_ring_communicator
                                            ccl::device_topology_type::ring,
                                            ccl::gpu_communicator_traits>;
 
-    using coll_request_t = ccl::event;
-
     device_group_ring_communicator(ccl::unified_device_type&& device,
-                                   ccl::unified_device_context_type&& ctx,
+                                   ccl::unified_context_type&& ctx,
                                    size_t thread_idx,
                                    size_t proces_idx,
                                    const ccl::comm_split_attr& attr);
 
     void visit(ccl::gpu_comm_attr& comm_attr) override;
 
-    coll_request_t barrier(const ccl::stream::impl_value_t& stream,
-                           const ccl::barrier_attr& attr,
-                           const ccl::vector_class<ccl::event>& deps) override;
+    ccl::event barrier(const ccl::stream::impl_value_t& stream,
+                       const ccl::barrier_attr& attr,
+                       const ccl::vector_class<ccl::event>& deps) override;
 
-    DEVICE_COMM_IMPL_DECLARATION
-    DEVICE_COMM_IMPL_CLASS_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION
+    COMM_IMPL_DECLARATION
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
 
 private:
     std::shared_ptr<native::device_group_context> ctx;
diff --git a/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp
index 6a116bbe9..328b2b6ce 100644
--- a/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/device_group/device_ring_communicator_impl.hpp
@@ -25,7 +25,7 @@
 
 /* allgatherv */
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allgatherv_impl(
+ccl::event device_group_ring_communicator::allgatherv_impl(
     const buffer_type* send_buf,
     size_t send_count,
     buffer_type* recv_buf,
@@ -37,7 +37,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     return {};
 }
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allgatherv_impl(
+ccl::event device_group_ring_communicator::allgatherv_impl(
     const buffer_type* send_buf,
     size_t send_count,
     ccl::vector_class<buffer_type*>& recv_buf,
@@ -51,7 +51,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 }
 
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allgatherv_impl(
+ccl::event device_group_ring_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     buffer_type& recv_buf,
@@ -63,7 +63,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     return {};
 }
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allgatherv_impl(
+ccl::event device_group_ring_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -78,7 +78,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 
 /* allreduce */
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allreduce_impl(
+ccl::event device_group_ring_communicator::allreduce_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t count,
@@ -99,7 +99,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 
     size_t ring_index = 0;
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     LOG_DEBUG("communicator for device idx: ",
               get_device_path(),
               ", rank idx: , ring_index: ",
@@ -132,6 +132,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
                        ->submit_entry<gpu_allreduce_entry, ccl_sched_add_back, group_id, class_id>(
                            *community,
                            real_device_it->second,
+                           this->get_native_context(),
                            send_entry_buffer,
                            recv_entry_buffer,
                            count,
@@ -150,6 +151,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
                     ->submit_entry<gpu_allreduce_entry, ccl_sched_add_back, group_id, class_id>(
                         *community,
                         virtual_device_it->second,
+                        this->get_native_context(),
                         send_entry_buffer,
                         recv_entry_buffer,
                         count,
@@ -166,7 +168,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 }
 
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::allreduce_impl(
+ccl::event device_group_ring_communicator::allreduce_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t count,
@@ -180,7 +182,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 
 /* alltoall */
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoall_impl(
+ccl::event device_group_ring_communicator::alltoall_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t count,
@@ -191,7 +193,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     return {};
 }
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoall_impl(
+ccl::event device_group_ring_communicator::alltoall_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<buffer_type*>& recv_buf,
     size_t count,
@@ -204,7 +206,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 }
 
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoall_impl(
+ccl::event device_group_ring_communicator::alltoall_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t count,
@@ -215,7 +217,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     return {};
 }
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoall_impl(
+ccl::event device_group_ring_communicator::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -229,7 +231,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 
 /* alltoallv */
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoallv_impl(
+ccl::event device_group_ring_communicator::alltoallv_impl(
     const buffer_type* send_buf,
     const ccl::vector_class<size_t>& send_counts,
     buffer_type* recv_buf,
@@ -241,7 +243,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     return {};
 }
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoallv_impl(
+ccl::event device_group_ring_communicator::alltoallv_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<buffer_type*>& recv_buf,
@@ -255,7 +257,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 }
 
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoallv_impl(
+ccl::event device_group_ring_communicator::alltoallv_impl(
     const buffer_type& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     buffer_type& recv_buf,
@@ -267,7 +269,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
     return {};
 }
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::alltoallv_impl(
+ccl::event device_group_ring_communicator::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -282,10 +284,10 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::a
 
 /* bcast */
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::broadcast_impl(
+ccl::event device_group_ring_communicator::broadcast_impl(
     buffer_type* buf,
     size_t count,
-    size_t root,
+    int root,
     const ccl::stream::impl_value_t& stream,
     const ccl::broadcast_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
@@ -294,10 +296,10 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::b
 }
 
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::broadcast_impl(
+ccl::event device_group_ring_communicator::broadcast_impl(
     buffer_type& buf,
     size_t count,
-    size_t root,
+    int root,
     const ccl::stream::impl_value_t& stream,
     const ccl::broadcast_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
@@ -307,35 +309,33 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::b
 
 /* reduce */
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::reduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_ring_communicator::reduce_impl(const buffer_type* send_buf,
+                                                       buffer_type* recv_buf,
+                                                       size_t count,
+                                                       ccl::reduction reduction,
+                                                       int root,
+                                                       const ccl::stream::impl_value_t& stream,
+                                                       const ccl::reduce_attr& attr,
+                                                       const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::reduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_ring_communicator::reduce_impl(const buffer_type& send_buf,
+                                                       buffer_type& recv_buf,
+                                                       size_t count,
+                                                       ccl::reduction reduction,
+                                                       int root,
+                                                       const ccl::stream::impl_value_t& stream,
+                                                       const ccl::reduce_attr& attr,
+                                                       const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 /* reduce_scatter */
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::reduce_scatter_impl(
+ccl::event device_group_ring_communicator::reduce_scatter_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t recv_count,
@@ -347,7 +347,7 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::r
     return {};
 }
 template <class buffer_type>
-device_group_ring_communicator::coll_request_t device_group_ring_communicator::reduce_scatter_impl(
+ccl::event device_group_ring_communicator::reduce_scatter_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t recv_count,
@@ -361,26 +361,25 @@ device_group_ring_communicator::coll_request_t device_group_ring_communicator::r
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-device_group_ring_communicator::coll_request_t
-device_group_ring_communicator::sparse_allreduce_impl(const index_buffer_type* send_ind_buf,
-                                                      size_t send_ind_count,
-                                                      const value_buffer_type* send_val_buf,
-                                                      size_t send_val_count,
-                                                      index_buffer_type* recv_ind_buf,
-                                                      size_t recv_ind_count,
-                                                      value_buffer_type* recv_val_buf,
-                                                      size_t recv_val_count,
-                                                      ccl::reduction reduction,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::sparse_allreduce_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event device_group_ring_communicator::sparse_allreduce_impl(
+    const index_buffer_type* send_ind_buf,
+    size_t send_ind_count,
+    const value_buffer_type* send_val_buf,
+    size_t send_val_count,
+    index_buffer_type* recv_ind_buf,
+    size_t recv_ind_count,
+    value_buffer_type* recv_val_buf,
+    size_t recv_val_count,
+    ccl::reduction reduction,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::sparse_allreduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-device_group_ring_communicator::coll_request_t
-device_group_ring_communicator::sparse_allreduce_impl(
+ccl::event device_group_ring_communicator::sparse_allreduce_impl(
     const index_buffer_container_type& send_ind_buf,
     size_t send_ind_count,
     const value_buffer_container_type& send_val_buf,
diff --git a/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp b/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
index 38985adae..43bd9fcc0 100644
--- a/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
+++ b/src/common/comm/l0/communicator/process_group/process_a2a_communicator.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
 #include "common/comm/l0/context/process_group_ctx.hpp"
@@ -22,11 +22,12 @@
 using namespace ccl;
 
 process_a2a_communicator::process_a2a_communicator(ccl::unified_device_type&& device,
-                                                   ccl::unified_device_context_type&& ctx,
+                                                   ccl::unified_context_type&& ctx,
                                                    size_t thread_idx,
                                                    size_t process_idx,
                                                    const ccl::comm_split_attr& attr)
-        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx, /*comm_attr, */ attr) {}
+        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx, /*comm_attr, */ attr) {
+}
 
 void process_a2a_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     ctx = comm_attr.get_process_context();
@@ -38,135 +39,125 @@ void process_a2a_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     this->set_comm_group_id(comm_attr.get_unique_id());
 }
 
-process_a2a_communicator::coll_request_t process_a2a_communicator::barrier(
-    const ccl::stream::impl_value_t& stream,
-    const ccl::barrier_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::barrier(const ccl::stream::impl_value_t& stream,
+                                             const ccl::barrier_attr& attr,
+                                             const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented yet");
 }
 
 /* allgatherv */
-process_a2a_communicator::coll_request_t process_a2a_communicator::allgatherv_impl(
-    const void* send_buf,
-    size_t send_count,
-    void* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::allgatherv_impl(const void* send_buf,
+                                                     size_t send_count,
+                                                     void* recv_buf,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     ccl::datatype dtype,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::allgatherv_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-process_a2a_communicator::coll_request_t process_a2a_communicator::allgatherv_impl(
-    const void* send_buf,
-    size_t send_count,
-    const ccl::vector_class<void*>& recv_bufs,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::allgatherv_impl(const void* send_buf,
+                                                     size_t send_count,
+                                                     const ccl::vector_class<void*>& recv_bufs,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     ccl::datatype dtype,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::allgatherv_attr& attr,
+
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* allreduce */
-process_a2a_communicator::coll_request_t process_a2a_communicator::allreduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::allreduce_impl(const void* send_buf,
+                                                    void* recv_buf,
+                                                    size_t count,
+                                                    ccl::datatype dtype,
+                                                    ccl::reduction reduction,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::allreduce_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoall */
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoall_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::alltoall_impl(const void* send_buf,
+                                                   void* recv_buf,
+                                                   size_t count,
+                                                   ccl::datatype dtype,
+                                                   const ccl::stream::impl_value_t& stream,
+                                                   const ccl::alltoall_attr& attr,
+                                                   const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoall_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<void*>& recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf,
+                                                   const ccl::vector_class<void*>& recv_buf,
+                                                   size_t count,
+                                                   ccl::datatype dtype,
+                                                   const ccl::stream::impl_value_t& stream,
+                                                   const ccl::alltoall_attr& attr,
+                                                   const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoallv */
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoallv_impl(
-    const void* send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    void* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::alltoallv_impl(const void* send_buf,
+                                                    const ccl::vector_class<size_t>& send_counts,
+                                                    void* recv_buf,
+                                                    const ccl::vector_class<size_t>& recv_counts,
+                                                    ccl::datatype dtype,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoallv_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoallv_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    ccl::vector_class<void*> recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-
-    const ccl::vector_class<ccl::event>& dep) {
+ccl::event process_a2a_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
+                                                    const ccl::vector_class<size_t>& send_counts,
+                                                    ccl::vector_class<void*> recv_buf,
+                                                    const ccl::vector_class<size_t>& recv_counts,
+                                                    ccl::datatype dtype,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoallv_attr& attr,
+
+                                                    const ccl::vector_class<ccl::event>& dep) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* bcast */
-process_a2a_communicator::coll_request_t process_a2a_communicator::broadcast_impl(
-    void* buf,
-    size_t count,
-    ccl::datatype dtype,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::broadcast_impl(void* buf,
+                                                    size_t count,
+                                                    ccl::datatype dtype,
+                                                    int root,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::broadcast_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce */
-process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::reduce_impl(const void* send_buf,
+                                                 void* recv_buf,
+                                                 size_t count,
+                                                 ccl::datatype dtype,
+                                                 ccl::reduction reduction,
+                                                 int root,
+                                                 const ccl::stream::impl_value_t& stream,
+                                                 const ccl::reduce_attr& attr,
+                                                 const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce_scatter */
-process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_scatter_impl(
+ccl::event process_a2a_communicator::reduce_scatter_impl(
     const void* send_buf,
     void* recv_buf,
     size_t recv_count,
@@ -180,7 +171,7 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_scatte
 }
 
 /* sparse_allreduce */
-process_a2a_communicator::coll_request_t process_a2a_communicator::sparse_allreduce_impl(
+ccl::event process_a2a_communicator::sparse_allreduce_impl(
     const void* send_ind_buf,
     size_t send_ind_count,
     const void* send_val_buf,
@@ -199,119 +190,7 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::sparse_allred
     return {};
 }
 
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_a2a_communicator, char);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_a2a_communicator, int);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_a2a_communicator, int64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_a2a_communicator, uint64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_a2a_communicator, float);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_a2a_communicator, double);
-
-#ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_a2a_communicator,
-                                                cl::sycl::buffer<char COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_a2a_communicator,
-                                                cl::sycl::buffer<int COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_a2a_communicator,
-                                                cl::sycl::buffer<int64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_a2a_communicator,
-                                                cl::sycl::buffer<uint64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_a2a_communicator,
-                                                cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_a2a_communicator,
-                                                cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator, char, char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator, char, int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              char,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              char,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              char,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              char,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              char,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator, int, char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator, int, int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator, int, float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              int64_t,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              uint64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              uint64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              uint64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              uint64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              uint64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              uint64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_a2a_communicator,
-                                                              uint64_t,
-                                                              uint64_t);
-
+COMM_INTERFACE_COLL_INSTANTIATION(process_a2a_communicator);
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    process_a2a_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    process_a2a_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    process_a2a_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    process_a2a_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(process_a2a_communicator);
+#endif /* CCL_ENABLE_SYCL */
diff --git a/src/common/comm/l0/communicator/process_group/process_a2a_communicator.hpp b/src/common/comm/l0/communicator/process_group/process_a2a_communicator.hpp
index 7fc952079..90ff24ad1 100644
--- a/src/common/comm/l0/communicator/process_group/process_a2a_communicator.hpp
+++ b/src/common/comm/l0/communicator/process_group/process_a2a_communicator.hpp
@@ -20,35 +20,32 @@ namespace native {
 struct process_group_context;
 }
 
-class process_a2a_communicator
-        : public typed_base_communicator<process_a2a_communicator,
-                                         ccl::group_split_type::cluster,
-                                         ccl::device_topology_type::a2a,
-                                         ccl::gpu_communicator_traits> {
+class process_a2a_communicator : public typed_base_communicator<process_a2a_communicator,
+                                                                ccl::group_split_type::cluster,
+                                                                ccl::device_topology_type::a2a,
+                                                                ccl::gpu_communicator_traits> {
 public:
     using base_t = typed_base_communicator<process_a2a_communicator,
                                            ccl::group_split_type::cluster,
                                            ccl::device_topology_type::a2a,
                                            ccl::gpu_communicator_traits>;
 
-    using coll_request_t = ccl::event;
-
     process_a2a_communicator(ccl::unified_device_type&& device,
-                             ccl::unified_device_context_type&& ctx,
+                             ccl::unified_context_type&& ctx,
                              size_t thread_idx,
                              size_t proces_idx,
                              const ccl::comm_split_attr& attr);
 
     void visit(ccl::gpu_comm_attr& comm_attr) override;
 
-    coll_request_t barrier(const ccl::stream::impl_value_t& stream,
-                           const ccl::barrier_attr& attr,
-                           const ccl::vector_class<ccl::event>& deps) override;
+    ccl::event barrier(const ccl::stream::impl_value_t& stream,
+                       const ccl::barrier_attr& attr,
+                       const ccl::vector_class<ccl::event>& deps) override;
 
-    DEVICE_COMM_IMPL_DECLARATION
-    DEVICE_COMM_IMPL_CLASS_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION
+    COMM_IMPL_DECLARATION
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
 
 private:
     std::shared_ptr<native::process_group_context> ctx;
diff --git a/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp
index 428a8c581..e5711d36b 100644
--- a/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/process_group/process_a2a_communicator_impl.hpp
@@ -25,45 +25,42 @@
 
 /* allgatherv */
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::allgatherv_impl(
-    const buffer_type* send_buf,
-    size_t send_count,
-    buffer_type* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::allgatherv_impl(const buffer_type* send_buf,
+                                                     size_t send_count,
+                                                     buffer_type* recv_buf,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::allgatherv_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::allgatherv_impl(
-    const buffer_type* send_buf,
-    size_t send_count,
-    ccl::vector_class<buffer_type*>& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::allgatherv_impl(const buffer_type* send_buf,
+                                                     size_t send_count,
+                                                     ccl::vector_class<buffer_type*>& recv_buf,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::allgatherv_attr& attr,
+
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::allgatherv_impl(
-    const buffer_type& send_buf,
-    size_t send_count,
-    buffer_type& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::allgatherv_impl(const buffer_type& send_buf,
+                                                     size_t send_count,
+                                                     buffer_type& recv_buf,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::allgatherv_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::allgatherv_impl(
+ccl::event process_a2a_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -78,14 +75,13 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::allgatherv_im
 
 /* allreduce */
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::allreduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::allreduce_impl(const buffer_type* send_buf,
+                                                    buffer_type* recv_buf,
+                                                    size_t count,
+                                                    ccl::reduction reduction,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::allreduce_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     using namespace native;
 
     static constexpr ccl::group_split_type group_id = base_t::topology_type();
@@ -97,7 +93,7 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::allreduce_imp
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
 
     //TODO make const!
@@ -205,61 +201,56 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::allreduce_imp
     if (schedule) {
         LOG_DEBUG("Device group finalized");
     }
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::allreduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::allreduce_impl(const buffer_type& send_buf,
+                                                    buffer_type& recv_buf,
+                                                    size_t count,
+                                                    ccl::reduction reduction,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::allreduce_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoall */
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoall_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::alltoall_impl(const buffer_type* send_buf,
+                                                   buffer_type* recv_buf,
+                                                   size_t count,
+                                                   const ccl::stream::impl_value_t& stream,
+                                                   const ccl::alltoall_attr& attr,
+                                                   const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoall_impl(
-    const ccl::vector_class<buffer_type*>& send_buf,
-    const ccl::vector_class<buffer_type*>& recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
+ccl::event process_a2a_communicator::alltoall_impl(const ccl::vector_class<buffer_type*>& send_buf,
+                                                   const ccl::vector_class<buffer_type*>& recv_buf,
+                                                   size_t count,
+                                                   const ccl::stream::impl_value_t& stream,
+                                                   const ccl::alltoall_attr& attr,
 
-    const ccl::vector_class<ccl::event>& deps) {
+                                                   const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoall_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::alltoall_impl(const buffer_type& send_buf,
+                                                   buffer_type& recv_buf,
+                                                   size_t count,
+                                                   const ccl::stream::impl_value_t& stream,
+                                                   const ccl::alltoall_attr& attr,
+                                                   const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoall_impl(
+ccl::event process_a2a_communicator::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -273,45 +264,42 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::alltoall_impl
 
 /* alltoallv */
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoallv_impl(
-    const buffer_type* send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    buffer_type* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::alltoallv_impl(const buffer_type* send_buf,
+                                                    const ccl::vector_class<size_t>& send_counts,
+                                                    buffer_type* recv_buf,
+                                                    const ccl::vector_class<size_t>& recv_counts,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoallv_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoallv_impl(
-    const ccl::vector_class<buffer_type*>& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    const ccl::vector_class<buffer_type*>& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-
-    const ccl::vector_class<ccl::event>& dep) {
+ccl::event process_a2a_communicator::alltoallv_impl(const ccl::vector_class<buffer_type*>& send_buf,
+                                                    const ccl::vector_class<size_t>& send_counts,
+                                                    const ccl::vector_class<buffer_type*>& recv_buf,
+                                                    const ccl::vector_class<size_t>& recv_counts,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoallv_attr& attr,
+
+                                                    const ccl::vector_class<ccl::event>& dep) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoallv_impl(
-    const buffer_type& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    buffer_type& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::alltoallv_impl(const buffer_type& send_buf,
+                                                    const ccl::vector_class<size_t>& send_counts,
+                                                    buffer_type& recv_buf,
+                                                    const ccl::vector_class<size_t>& recv_counts,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoallv_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::alltoallv_impl(
+ccl::event process_a2a_communicator::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -326,60 +314,56 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::alltoallv_imp
 
 /* bcast */
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::broadcast_impl(
-    buffer_type* buf,
-    size_t count,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::broadcast_impl(buffer_type* buf,
+                                                    size_t count,
+                                                    int root,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::broadcast_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::broadcast_impl(
-    buffer_type& buf,
-    size_t count,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::broadcast_impl(buffer_type& buf,
+                                                    size_t count,
+                                                    int root,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::broadcast_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce */
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::reduce_impl(const buffer_type* send_buf,
+                                                 buffer_type* recv_buf,
+                                                 size_t count,
+                                                 ccl::reduction reduction,
+                                                 int root,
+                                                 const ccl::stream::impl_value_t& stream,
+                                                 const ccl::reduce_attr& attr,
+                                                 const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_a2a_communicator::reduce_impl(const buffer_type& send_buf,
+                                                 buffer_type& recv_buf,
+                                                 size_t count,
+                                                 ccl::reduction reduction,
+                                                 int root,
+                                                 const ccl::stream::impl_value_t& stream,
+                                                 const ccl::reduce_attr& attr,
+                                                 const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 /* reduce_scatter */
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_scatter_impl(
+ccl::event process_a2a_communicator::reduce_scatter_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t recv_count,
@@ -391,7 +375,7 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_scatte
     return {};
 }
 template <class buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_scatter_impl(
+ccl::event process_a2a_communicator::reduce_scatter_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t recv_count,
@@ -405,7 +389,7 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::reduce_scatte
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::sparse_allreduce_impl(
+ccl::event process_a2a_communicator::sparse_allreduce_impl(
     const index_buffer_type* send_ind_buf,
     size_t send_ind_count,
     const value_buffer_type* send_val_buf,
@@ -423,7 +407,7 @@ process_a2a_communicator::coll_request_t process_a2a_communicator::sparse_allred
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-process_a2a_communicator::coll_request_t process_a2a_communicator::sparse_allreduce_impl(
+ccl::event process_a2a_communicator::sparse_allreduce_impl(
     const index_buffer_container_type& send_ind_buf,
     size_t send_ind_count,
     const value_buffer_container_type& send_val_buf,
diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp
index 8de84372e..5f96958cd 100644
--- a/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp
+++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator.cpp
@@ -21,7 +21,7 @@
 using namespace ccl;
 
 process_ring_communicator::process_ring_communicator(ccl::unified_device_type&& device,
-                                                     ccl::unified_device_context_type&& ctx,
+                                                     ccl::unified_context_type&& ctx,
                                                      size_t thread_idx,
                                                      size_t process_idx,
                                                      const ccl::comm_split_attr& attr)
@@ -47,135 +47,125 @@ size_t process_ring_communicator::group_size() const
 }
 */
 
-process_ring_communicator::coll_request_t process_ring_communicator::barrier(
-    const ccl::stream::impl_value_t& stream,
-    const ccl::barrier_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::barrier(const ccl::stream::impl_value_t& stream,
+                                              const ccl::barrier_attr& attr,
+                                              const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented yet");
 }
 
 /* allgatherv */
-process_ring_communicator::coll_request_t process_ring_communicator::allgatherv_impl(
-    const void* send_buf,
-    size_t send_count,
-    void* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::allgatherv_impl(const void* send_buf,
+                                                      size_t send_count,
+                                                      void* recv_buf,
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      ccl::datatype dtype,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::allgatherv_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-process_ring_communicator::coll_request_t process_ring_communicator::allgatherv_impl(
-    const void* send_buf,
-    size_t send_count,
-    const ccl::vector_class<void*>& recv_bufs,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::allgatherv_impl(const void* send_buf,
+                                                      size_t send_count,
+                                                      const ccl::vector_class<void*>& recv_bufs,
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      ccl::datatype dtype,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::allgatherv_attr& attr,
+
+                                                      const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* allreduce */
-process_ring_communicator::coll_request_t process_ring_communicator::allreduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::allreduce_impl(const void* send_buf,
+                                                     void* recv_buf,
+                                                     size_t count,
+                                                     ccl::datatype dtype,
+                                                     ccl::reduction reduction,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::allreduce_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoall */
-process_ring_communicator::coll_request_t process_ring_communicator::alltoall_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::alltoall_impl(const void* send_buf,
+                                                    void* recv_buf,
+                                                    size_t count,
+                                                    ccl::datatype dtype,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoall_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-process_ring_communicator::coll_request_t process_ring_communicator::alltoall_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<void*>& recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf,
+                                                    const ccl::vector_class<void*>& recv_buf,
+                                                    size_t count,
+                                                    ccl::datatype dtype,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoall_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoallv */
-process_ring_communicator::coll_request_t process_ring_communicator::alltoallv_impl(
-    const void* send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    void* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::alltoallv_impl(const void* send_buf,
+                                                     const ccl::vector_class<size_t>& send_counts,
+                                                     void* recv_buf,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     ccl::datatype dtype,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::alltoallv_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-process_ring_communicator::coll_request_t process_ring_communicator::alltoallv_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    ccl::vector_class<void*> recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-
-    const ccl::vector_class<ccl::event>& dep) {
+ccl::event process_ring_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
+                                                     const ccl::vector_class<size_t>& send_counts,
+                                                     ccl::vector_class<void*> recv_buf,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     ccl::datatype dtype,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::alltoallv_attr& attr,
+
+                                                     const ccl::vector_class<ccl::event>& dep) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* bcast */
-process_ring_communicator::coll_request_t process_ring_communicator::broadcast_impl(
-    void* buf,
-    size_t count,
-    ccl::datatype dtype,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::broadcast_impl(void* buf,
+                                                     size_t count,
+                                                     ccl::datatype dtype,
+                                                     int root,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::broadcast_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce */
-process_ring_communicator::coll_request_t process_ring_communicator::reduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::reduce_impl(const void* send_buf,
+                                                  void* recv_buf,
+                                                  size_t count,
+                                                  ccl::datatype dtype,
+                                                  ccl::reduction reduction,
+                                                  int root,
+                                                  const ccl::stream::impl_value_t& stream,
+                                                  const ccl::reduce_attr& attr,
+                                                  const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce_scatter */
-process_ring_communicator::coll_request_t process_ring_communicator::reduce_scatter_impl(
+ccl::event process_ring_communicator::reduce_scatter_impl(
     const void* send_buf,
     void* recv_buf,
     size_t recv_count,
@@ -189,7 +179,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::reduce_scat
 }
 
 /* sparse_allreduce */
-process_ring_communicator::coll_request_t process_ring_communicator::sparse_allreduce_impl(
+ccl::event process_ring_communicator::sparse_allreduce_impl(
     const void* send_ind_buf,
     size_t send_ind_count,
     const void* send_val_buf,
@@ -208,123 +198,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::sparse_allr
     return {};
 }
 
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_ring_communicator, char);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_ring_communicator, int);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_ring_communicator, int64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_ring_communicator, uint64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_ring_communicator, float);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(process_ring_communicator, double);
-
-#ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_ring_communicator,
-                                                cl::sycl::buffer<char COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_ring_communicator,
-                                                cl::sycl::buffer<int COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_ring_communicator,
-                                                cl::sycl::buffer<int64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_ring_communicator,
-                                                cl::sycl::buffer<uint64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_ring_communicator,
-                                                cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(process_ring_communicator,
-                                                cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              char,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator, char, int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              char,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              char,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              char,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              char,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              char,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator, int, char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator, int, int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              int64_t,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              uint64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              uint64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              uint64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              uint64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              uint64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              uint64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(process_ring_communicator,
-                                                              uint64_t,
-                                                              uint64_t);
-
+COMM_INTERFACE_COLL_INSTANTIATION(process_ring_communicator);
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    process_ring_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    process_ring_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    process_ring_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    process_ring_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(process_ring_communicator);
+#endif /* CCL_ENABLE_SYCL */
diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp
index bac6c903a..f9cf1455a 100644
--- a/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp
+++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator.hpp
@@ -20,35 +20,32 @@ namespace native {
 struct process_group_context;
 }
 
-class process_ring_communicator
-        : public typed_base_communicator<process_ring_communicator,
-                                         ccl::group_split_type::cluster,
-                                         ccl::device_topology_type::ring,
-                                         ccl::gpu_communicator_traits> {
+class process_ring_communicator : public typed_base_communicator<process_ring_communicator,
+                                                                 ccl::group_split_type::cluster,
+                                                                 ccl::device_topology_type::ring,
+                                                                 ccl::gpu_communicator_traits> {
 public:
     using base_t = typed_base_communicator<process_ring_communicator,
                                            ccl::group_split_type::cluster,
                                            ccl::device_topology_type::ring,
                                            ccl::gpu_communicator_traits>;
 
-    using coll_request_t = ccl::event;
-
     process_ring_communicator(ccl::unified_device_type&& device,
-                              ccl::unified_device_context_type&& ctx,
+                              ccl::unified_context_type&& ctx,
                               size_t thread_idx,
                               size_t process_idx,
                               const ccl::comm_split_attr& attr);
 
     void visit(ccl::gpu_comm_attr& comm_attr) override;
 
-    coll_request_t barrier(const ccl::stream::impl_value_t& stream,
-                           const ccl::barrier_attr& attr,
-                           const ccl::vector_class<ccl::event>& deps) override;
+    ccl::event barrier(const ccl::stream::impl_value_t& stream,
+                       const ccl::barrier_attr& attr,
+                       const ccl::vector_class<ccl::event>& deps) override;
 
-    DEVICE_COMM_IMPL_DECLARATION
-    DEVICE_COMM_IMPL_CLASS_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION
+    COMM_IMPL_DECLARATION
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
 
 private:
     std::shared_ptr<native::process_group_context> ctx;
diff --git a/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp
index eec1ef5fd..402c70570 100644
--- a/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/process_group/process_ring_communicator_impl.hpp
@@ -25,45 +25,42 @@
 
 /* allgatherv */
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::allgatherv_impl(
-    const buffer_type* send_buf,
-    size_t send_count,
-    buffer_type* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::allgatherv_impl(const buffer_type* send_buf,
+                                                      size_t send_count,
+                                                      buffer_type* recv_buf,
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::allgatherv_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::allgatherv_impl(
-    const buffer_type* send_buf,
-    size_t send_count,
-    ccl::vector_class<buffer_type*>& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::allgatherv_impl(const buffer_type* send_buf,
+                                                      size_t send_count,
+                                                      ccl::vector_class<buffer_type*>& recv_buf,
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::allgatherv_attr& attr,
+
+                                                      const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::allgatherv_impl(
-    const buffer_type& send_buf,
-    size_t send_count,
-    buffer_type& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::allgatherv_impl(const buffer_type& send_buf,
+                                                      size_t send_count,
+                                                      buffer_type& recv_buf,
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::allgatherv_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::allgatherv_impl(
+ccl::event process_ring_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -78,14 +75,13 @@ process_ring_communicator::coll_request_t process_ring_communicator::allgatherv_
 
 /* allreduce */
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::allreduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::allreduce_impl(const buffer_type* send_buf,
+                                                     buffer_type* recv_buf,
+                                                     size_t count,
+                                                     ccl::reduction reduction,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::allreduce_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     using namespace native;
 
     static constexpr ccl::group_split_type group_id = base_t::topology_type();
@@ -97,7 +93,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::allreduce_i
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     size_t ring_index = 0;
     LOG_DEBUG("communicator for device idx: ",
               get_device_path(),
@@ -143,6 +139,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::allreduce_i
                     thread_id,
                     *community,
                     ipc_src_real_it->second,
+                    this->get_native_context(),
                     send_entry_buffer,
                     recv_entry_buffer,
                     count,
@@ -166,6 +163,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::allreduce_i
                         thread_id,
                         *community,
                         ipc_src_virt_it->second,
+                        this->get_native_context(),
                         send_entry_buffer,
                         recv_entry_buffer,
                         count,
@@ -180,18 +178,20 @@ process_ring_communicator::coll_request_t process_ring_communicator::allreduce_i
                 using gpu_allreduce_entry =
                     l0_allreduce_typed_entry<buffer_type, ccl_gpu_comm, group_id>;
 
-                schedule = ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry,
-                                                                 ccl_sched_add_back,
-                                                                 group_id,
-                                                                 class_id>(process_id,
-                                                                           thread_id,
-                                                                           *community,
-                                                                           real_device_it->second,
-                                                                           send_entry_buffer,
-                                                                           recv_entry_buffer,
-                                                                           count,
-                                                                           reduction,
-                                                                           stream);
+                schedule =
+                    ctx->scheduler_impl->submit_entry_ipc<gpu_allreduce_entry,
+                                                          ccl_sched_add_back,
+                                                          group_id,
+                                                          class_id>(process_id,
+                                                                    thread_id,
+                                                                    *community,
+                                                                    real_device_it->second,
+                                                                    this->get_native_context(),
+                                                                    send_entry_buffer,
+                                                                    recv_entry_buffer,
+                                                                    count,
+                                                                    reduction,
+                                                                    stream);
             }
             else {
                 auto virtual_device_it = virtual_process_gpu_storage.find(comm_rank);
@@ -208,6 +208,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::allreduce_i
                                                                         thread_id,
                                                                         *community,
                                                                         virtual_device_it->second,
+                                                                        this->get_native_context(),
                                                                         send_entry_buffer,
                                                                         recv_entry_buffer,
                                                                         count,
@@ -222,61 +223,56 @@ process_ring_communicator::coll_request_t process_ring_communicator::allreduce_i
     if (schedule) {
         LOG_DEBUG("Device group finalized");
     }
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::allreduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::allreduce_impl(const buffer_type& send_buf,
+                                                     buffer_type& recv_buf,
+                                                     size_t count,
+                                                     ccl::reduction reduction,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::allreduce_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoall */
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::alltoall_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::alltoall_impl(const buffer_type* send_buf,
+                                                    buffer_type* recv_buf,
+                                                    size_t count,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoall_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::alltoall_impl(
-    const ccl::vector_class<buffer_type*>& send_buf,
-    const ccl::vector_class<buffer_type*>& recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
+ccl::event process_ring_communicator::alltoall_impl(const ccl::vector_class<buffer_type*>& send_buf,
+                                                    const ccl::vector_class<buffer_type*>& recv_buf,
+                                                    size_t count,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoall_attr& attr,
 
-    const ccl::vector_class<ccl::event>& deps) {
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::alltoall_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::alltoall_impl(const buffer_type& send_buf,
+                                                    buffer_type& recv_buf,
+                                                    size_t count,
+                                                    const ccl::stream::impl_value_t& stream,
+                                                    const ccl::alltoall_attr& attr,
+                                                    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::alltoall_impl(
+ccl::event process_ring_communicator::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -290,19 +286,18 @@ process_ring_communicator::coll_request_t process_ring_communicator::alltoall_im
 
 /* alltoallv */
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::alltoallv_impl(
-    const buffer_type* send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    buffer_type* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::alltoallv_impl(const buffer_type* send_buf,
+                                                     const ccl::vector_class<size_t>& send_counts,
+                                                     buffer_type* recv_buf,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::alltoallv_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::alltoallv_impl(
+ccl::event process_ring_communicator::alltoallv_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<buffer_type*>& recv_buf,
@@ -316,19 +311,18 @@ process_ring_communicator::coll_request_t process_ring_communicator::alltoallv_i
 }
 
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::alltoallv_impl(
-    const buffer_type& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    buffer_type& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::alltoallv_impl(const buffer_type& send_buf,
+                                                     const ccl::vector_class<size_t>& send_counts,
+                                                     buffer_type& recv_buf,
+                                                     const ccl::vector_class<size_t>& recv_counts,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::alltoallv_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::alltoallv_impl(
+ccl::event process_ring_communicator::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -343,60 +337,56 @@ process_ring_communicator::coll_request_t process_ring_communicator::alltoallv_i
 
 /* bcast */
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::broadcast_impl(
-    buffer_type* buf,
-    size_t count,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::broadcast_impl(buffer_type* buf,
+                                                     size_t count,
+                                                     int root,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::broadcast_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::broadcast_impl(
-    buffer_type& buf,
-    size_t count,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::broadcast_impl(buffer_type& buf,
+                                                     size_t count,
+                                                     int root,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::broadcast_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce */
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::reduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::reduce_impl(const buffer_type* send_buf,
+                                                  buffer_type* recv_buf,
+                                                  size_t count,
+                                                  ccl::reduction reduction,
+                                                  int root,
+                                                  const ccl::stream::impl_value_t& stream,
+                                                  const ccl::reduce_attr& attr,
+                                                  const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::reduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event process_ring_communicator::reduce_impl(const buffer_type& send_buf,
+                                                  buffer_type& recv_buf,
+                                                  size_t count,
+                                                  ccl::reduction reduction,
+                                                  int root,
+                                                  const ccl::stream::impl_value_t& stream,
+                                                  const ccl::reduce_attr& attr,
+                                                  const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 /* reduce_scatter */
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::reduce_scatter_impl(
+ccl::event process_ring_communicator::reduce_scatter_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t recv_count,
@@ -408,7 +398,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::reduce_scat
     return {};
 }
 template <class buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::reduce_scatter_impl(
+ccl::event process_ring_communicator::reduce_scatter_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t recv_count,
@@ -422,7 +412,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::reduce_scat
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-process_ring_communicator::coll_request_t process_ring_communicator::sparse_allreduce_impl(
+ccl::event process_ring_communicator::sparse_allreduce_impl(
     const index_buffer_type* send_ind_buf,
     size_t send_ind_count,
     const value_buffer_type* send_val_buf,
@@ -440,7 +430,7 @@ process_ring_communicator::coll_request_t process_ring_communicator::sparse_allr
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-process_ring_communicator::coll_request_t process_ring_communicator::sparse_allreduce_impl(
+ccl::event process_ring_communicator::sparse_allreduce_impl(
     const index_buffer_container_type& send_ind_buf,
     size_t send_ind_count,
     const value_buffer_container_type& send_val_buf,
diff --git a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp
index 11cc362f2..416d23ecc 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
 #include "common/comm/l0/context/process_group_ctx.hpp"
@@ -23,11 +23,12 @@ using namespace ccl;
 
 thread_device_group_a2a_communicator::thread_device_group_a2a_communicator(
     ccl::unified_device_type&& device,
-    ccl::unified_device_context_type&& ctx,
+    ccl::unified_context_type&& ctx,
     size_t thread_idx,
     size_t process_idx,
     const ccl::comm_split_attr& attr)
-        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx, /*comm_attr, */ attr) {}
+        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx, /*comm_attr, */ attr) {
+}
 
 void thread_device_group_a2a_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     auto process_ctx = comm_attr.get_process_context();
@@ -52,7 +53,7 @@ size_t thread_device_group_ring_communicator::group_size() const
 
 }
 */
-thread_device_group_a2a_communicator::coll_request_t thread_device_group_a2a_communicator::barrier(
+ccl::event thread_device_group_a2a_communicator::barrier(
     const ccl::stream::impl_value_t& stream,
     const ccl::barrier_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
@@ -60,128 +61,127 @@ thread_device_group_a2a_communicator::coll_request_t thread_device_group_a2a_com
 }
 
 /* allgatherv */
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allgatherv_impl(const void* send_buf,
-                                                      size_t send_count,
-                                                      void* recv_buf,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      ccl::datatype dtype,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::allgatherv_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::allgatherv_impl(
+    const void* send_buf,
+    size_t send_count,
+    void* recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allgatherv_impl(const void* send_buf,
-                                                      size_t send_count,
-                                                      const ccl::vector_class<void*>& recv_bufs,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      ccl::datatype dtype,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::allgatherv_attr& attr,
+ccl::event thread_device_group_a2a_communicator::allgatherv_impl(
+    const void* send_buf,
+    size_t send_count,
+    const ccl::vector_class<void*>& recv_bufs,
+    const ccl::vector_class<size_t>& recv_counts,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
 
-                                                      const ccl::vector_class<ccl::event>& deps) {
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* allreduce */
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allreduce_impl(const void* send_buf,
-                                                     void* recv_buf,
-                                                     size_t count,
-                                                     ccl::datatype dtype,
-                                                     ccl::reduction reduction,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::allreduce_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::allreduce_impl(
+    const void* send_buf,
+    void* recv_buf,
+    size_t count,
+    ccl::datatype dtype,
+    ccl::reduction reduction,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allreduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoall */
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoall_impl(const void* send_buf,
-                                                    void* recv_buf,
-                                                    size_t count,
-                                                    ccl::datatype dtype,
-                                                    const ccl::stream::impl_value_t& stream,
-                                                    const ccl::alltoall_attr& attr,
-                                                    const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::alltoall_impl(
+    const void* send_buf,
+    void* recv_buf,
+    size_t count,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf,
-                                                    const ccl::vector_class<void*>& recv_buf,
-                                                    size_t count,
-                                                    ccl::datatype dtype,
-                                                    const ccl::stream::impl_value_t& stream,
-                                                    const ccl::alltoall_attr& attr,
-                                                    const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::alltoall_impl(
+    const ccl::vector_class<void*>& send_buf,
+    const ccl::vector_class<void*>& recv_buf,
+    size_t count,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoallv */
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoallv_impl(const void* send_buf,
-                                                     const ccl::vector_class<size_t>& send_counts,
-                                                     void* recv_buf,
-                                                     const ccl::vector_class<size_t>& recv_counts,
-                                                     ccl::datatype dtype,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::alltoallv_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::alltoallv_impl(
+    const void* send_buf,
+    const ccl::vector_class<size_t>& send_counts,
+    void* recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoallv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
-                                                     const ccl::vector_class<size_t>& send_counts,
-                                                     ccl::vector_class<void*> recv_buf,
-                                                     const ccl::vector_class<size_t>& recv_counts,
-                                                     ccl::datatype dtype,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::alltoallv_attr& attr,
+ccl::event thread_device_group_a2a_communicator::alltoallv_impl(
+    const ccl::vector_class<void*>& send_buf,
+    const ccl::vector_class<size_t>& send_counts,
+    ccl::vector_class<void*> recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoallv_attr& attr,
 
-                                                     const ccl::vector_class<ccl::event>& dep) {
+    const ccl::vector_class<ccl::event>& dep) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* bcast */
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::broadcast_impl(void* buf,
-                                                     size_t count,
-                                                     ccl::datatype dtype,
-                                                     size_t root,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::broadcast_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::broadcast_impl(
+    void* buf,
+    size_t count,
+    ccl::datatype dtype,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::broadcast_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce */
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::reduce_impl(const void* send_buf,
-                                                  void* recv_buf,
-                                                  size_t count,
-                                                  ccl::datatype dtype,
-                                                  ccl::reduction reduction,
-                                                  size_t root,
-                                                  const ccl::stream::impl_value_t& stream,
-                                                  const ccl::reduce_attr& attr,
-                                                  const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::reduce_impl(
+    const void* send_buf,
+    void* recv_buf,
+    size_t count,
+    ccl::datatype dtype,
+    ccl::reduction reduction,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::reduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce_scatter */
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::reduce_scatter_impl(
+ccl::event thread_device_group_a2a_communicator::reduce_scatter_impl(
     const void* send_buf,
     void* recv_buf,
     size_t recv_count,
@@ -195,8 +195,7 @@ thread_device_group_a2a_communicator::reduce_scatter_impl(
 }
 
 /* sparse_allreduce */
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::sparse_allreduce_impl(
+ccl::event thread_device_group_a2a_communicator::sparse_allreduce_impl(
     const void* send_ind_buf,
     size_t send_ind_count,
     const void* send_val_buf,
@@ -215,129 +214,7 @@ thread_device_group_a2a_communicator::sparse_allreduce_impl(
     return {};
 }
 
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_a2a_communicator, char);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_a2a_communicator, int);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_a2a_communicator, int64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_a2a_communicator, uint64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_a2a_communicator, float);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_a2a_communicator, double);
-
+COMM_INTERFACE_COLL_INSTANTIATION(thread_device_group_a2a_communicator);
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_a2a_communicator,
-                                                cl::sycl::buffer<char COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_a2a_communicator,
-                                                cl::sycl::buffer<int COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_a2a_communicator,
-                                                cl::sycl::buffer<int64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_a2a_communicator,
-                                                cl::sycl::buffer<uint64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_a2a_communicator,
-                                                cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_a2a_communicator,
-                                                cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              char,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              char,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              char,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              char,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              char,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              char,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              char,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              int64_t,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_a2a_communicator,
-                                                              uint64_t,
-                                                              uint64_t);
-
-#ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    thread_device_group_a2a_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    thread_device_group_a2a_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    thread_device_group_a2a_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    thread_device_group_a2a_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(thread_device_group_a2a_communicator);
+#endif /* CCL_ENABLE_SYCL */
diff --git a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.hpp b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.hpp
index bb1e5b6b7..c18efe03b 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.hpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator.hpp
@@ -31,24 +31,22 @@ class thread_device_group_a2a_communicator
                                            ccl::device_topology_type::a2a,
                                            ccl::gpu_communicator_traits>;
 
-    using coll_request_t = ccl::event;
-
     thread_device_group_a2a_communicator(ccl::unified_device_type&& device,
-                                         ccl::unified_device_context_type&& ctx,
+                                         ccl::unified_context_type&& ctx,
                                          size_t thread_idx,
                                          size_t proces_idx,
                                          const ccl::comm_split_attr& attr);
 
     void visit(ccl::gpu_comm_attr& comm_attr) override;
 
-    coll_request_t barrier(const ccl::stream::impl_value_t& stream,
-                           const ccl::barrier_attr& attr,
-                           const ccl::vector_class<ccl::event>& deps) override;
+    ccl::event barrier(const ccl::stream::impl_value_t& stream,
+                       const ccl::barrier_attr& attr,
+                       const ccl::vector_class<ccl::event>& deps) override;
 
-    DEVICE_COMM_IMPL_DECLARATION
-    DEVICE_COMM_IMPL_CLASS_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION
+    COMM_IMPL_DECLARATION
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
 
 private:
     std::shared_ptr<native::thread_group_context> ctx;
diff --git a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp
index 8d2db211a..8479bd052 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_a2a_communicator_impl.hpp
@@ -25,46 +25,45 @@
 
 /* allgatherv */
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allgatherv_impl(const buffer_type& send_buf,
-                                                      size_t send_count,
-                                                      buffer_type& recv_buf,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::allgatherv_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::allgatherv_impl(
+    const buffer_type& send_buf,
+    size_t send_count,
+    buffer_type& recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allgatherv_impl(const buffer_type* send_buf,
-                                                      size_t send_count,
-                                                      ccl::vector_class<buffer_type*>& recv_buf,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::allgatherv_attr& attr,
-
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::allgatherv_impl(
+    const buffer_type* send_buf,
+    size_t send_count,
+    ccl::vector_class<buffer_type*>& recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
+
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allgatherv_impl(const buffer_type* send_buf,
-                                                      size_t send_count,
-                                                      buffer_type* recv_buf,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::allgatherv_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::allgatherv_impl(
+    const buffer_type* send_buf,
+    size_t send_count,
+    buffer_type* recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allgatherv_impl(
+ccl::event thread_device_group_a2a_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -79,27 +78,27 @@ thread_device_group_a2a_communicator::allgatherv_impl(
 
 /* allreduce */
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allreduce_impl(const buffer_type& send_buf,
-                                                     buffer_type& recv_buf,
-                                                     size_t count,
-                                                     ccl::reduction reduction,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::allreduce_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::allreduce_impl(
+    const buffer_type& send_buf,
+    buffer_type& recv_buf,
+    size_t count,
+    ccl::reduction reduction,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allreduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::allreduce_impl(const buffer_type* send_buf,
-                                                     buffer_type* recv_buf,
-                                                     size_t count,
-                                                     ccl::reduction reduction,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::allreduce_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::allreduce_impl(
+    const buffer_type* send_buf,
+    buffer_type* recv_buf,
+    size_t count,
+    ccl::reduction reduction,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allreduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     using namespace native;
 
     static constexpr ccl::group_split_type group_id = base_t::topology_type();
@@ -111,7 +110,7 @@ thread_device_group_a2a_communicator::allreduce_impl(const buffer_type* send_buf
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     LOG_DEBUG("communicator for device idx: ", get_device_path(), ", rank idx: ", comm_rank);
 
     //TODO make const!
@@ -171,49 +170,47 @@ thread_device_group_a2a_communicator::allreduce_impl(const buffer_type* send_buf
     if (schedule) {
         LOG_DEBUG("Device group finalized");
     }
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 /* alltoall */
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoall_impl(const buffer_type* send_buf,
-                                                    buffer_type* recv_buf,
-                                                    size_t count,
-                                                    const ccl::stream::impl_value_t& stream,
-                                                    const ccl::alltoall_attr& attr,
-                                                    const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::alltoall_impl(
+    const buffer_type* send_buf,
+    buffer_type* recv_buf,
+    size_t count,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoall_impl(const ccl::vector_class<buffer_type*>& send_buf,
-                                                    const ccl::vector_class<buffer_type*>& recv_buf,
-                                                    size_t count,
-                                                    const ccl::stream::impl_value_t& stream,
-                                                    const ccl::alltoall_attr& attr,
-
-                                                    const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::alltoall_impl(
+    const ccl::vector_class<buffer_type*>& send_buf,
+    const ccl::vector_class<buffer_type*>& recv_buf,
+    size_t count,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoall_impl(const buffer_type& send_buf,
-                                                    buffer_type& recv_buf,
-                                                    size_t count,
-                                                    const ccl::stream::impl_value_t& stream,
-                                                    const ccl::alltoall_attr& attr,
-                                                    const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::alltoall_impl(
+    const buffer_type& send_buf,
+    buffer_type& recv_buf,
+    size_t count,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoall_impl(
+ccl::event thread_device_group_a2a_communicator::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -227,20 +224,19 @@ thread_device_group_a2a_communicator::alltoall_impl(
 
 /* alltoallv */
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoallv_impl(const buffer_type* send_buf,
-                                                     const ccl::vector_class<size_t>& send_counts,
-                                                     buffer_type* recv_buf,
-                                                     const ccl::vector_class<size_t>& recv_counts,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::alltoallv_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::alltoallv_impl(
+    const buffer_type* send_buf,
+    const ccl::vector_class<size_t>& send_counts,
+    buffer_type* recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoallv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoallv_impl(
+ccl::event thread_device_group_a2a_communicator::alltoallv_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<buffer_type*>& recv_buf,
@@ -254,20 +250,19 @@ thread_device_group_a2a_communicator::alltoallv_impl(
 }
 
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoallv_impl(const buffer_type& send_buf,
-                                                     const ccl::vector_class<size_t>& send_counts,
-                                                     buffer_type& recv_buf,
-                                                     const ccl::vector_class<size_t>& recv_counts,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::alltoallv_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::alltoallv_impl(
+    const buffer_type& send_buf,
+    const ccl::vector_class<size_t>& send_counts,
+    buffer_type& recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoallv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::alltoallv_impl(
+ccl::event thread_device_group_a2a_communicator::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -282,61 +277,60 @@ thread_device_group_a2a_communicator::alltoallv_impl(
 
 /* bcast */
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::broadcast_impl(buffer_type* buf,
-                                                     size_t count,
-                                                     size_t root,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::broadcast_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::broadcast_impl(
+    buffer_type* buf,
+    size_t count,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::broadcast_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::broadcast_impl(buffer_type& buf,
-                                                     size_t count,
-                                                     size_t root,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::broadcast_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::broadcast_impl(
+    buffer_type& buf,
+    size_t count,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::broadcast_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce */
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::reduce_impl(const buffer_type* send_buf,
-                                                  buffer_type* recv_buf,
-                                                  size_t count,
-                                                  ccl::reduction reduction,
-                                                  size_t root,
-                                                  const ccl::stream::impl_value_t& stream,
-                                                  const ccl::reduce_attr& attr,
-                                                  const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::reduce_impl(
+    const buffer_type* send_buf,
+    buffer_type* recv_buf,
+    size_t count,
+    ccl::reduction reduction,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::reduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::reduce_impl(const buffer_type& send_buf,
-                                                  buffer_type& recv_buf,
-                                                  size_t count,
-                                                  ccl::reduction reduction,
-                                                  size_t root,
-                                                  const ccl::stream::impl_value_t& stream,
-                                                  const ccl::reduce_attr& attr,
-                                                  const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_a2a_communicator::reduce_impl(
+    const buffer_type& send_buf,
+    buffer_type& recv_buf,
+    size_t count,
+    ccl::reduction reduction,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::reduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 /* reduce_scatter */
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::reduce_scatter_impl(
+ccl::event thread_device_group_a2a_communicator::reduce_scatter_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t recv_count,
@@ -348,8 +342,7 @@ thread_device_group_a2a_communicator::reduce_scatter_impl(
     return {};
 }
 template <class buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::reduce_scatter_impl(
+ccl::event thread_device_group_a2a_communicator::reduce_scatter_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t recv_count,
@@ -363,8 +356,7 @@ thread_device_group_a2a_communicator::reduce_scatter_impl(
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::sparse_allreduce_impl(
+ccl::event thread_device_group_a2a_communicator::sparse_allreduce_impl(
     const index_buffer_type* send_ind_buf,
     size_t send_ind_count,
     const value_buffer_type* send_val_buf,
@@ -382,8 +374,7 @@ thread_device_group_a2a_communicator::sparse_allreduce_impl(
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-thread_device_group_a2a_communicator::coll_request_t
-thread_device_group_a2a_communicator::sparse_allreduce_impl(
+ccl::event thread_device_group_a2a_communicator::sparse_allreduce_impl(
     const index_buffer_container_type& send_ind_buf,
     size_t send_ind_count,
     const value_buffer_container_type& send_val_buf,
diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp
index 9f9d65146..09b9dcd31 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
 #include "common/comm/l0/context/process_group_ctx.hpp"
@@ -23,11 +23,12 @@ using namespace ccl;
 
 thread_device_group_ring_communicator::thread_device_group_ring_communicator(
     ccl::unified_device_type&& device,
-    ccl::unified_device_context_type&& ctx,
+    ccl::unified_context_type&& ctx,
     size_t thread_idx,
     size_t process_idx,
     const ccl::comm_split_attr& attr)
-        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx, /*comm_attr, */ attr) {}
+        : base_t(std::move(device), std::move(ctx), thread_idx, process_idx, /*comm_attr, */ attr) {
+}
 
 void thread_device_group_ring_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     auto process_ctx = comm_attr.get_process_context();
@@ -53,136 +54,135 @@ size_t thread_device_group_ring_communicator::group_size() const
 
 }
 */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::barrier(const ccl::stream::impl_value_t& stream,
-                                               const ccl::barrier_attr& attr,
-                                               const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::barrier(
+    const ccl::stream::impl_value_t& stream,
+    const ccl::barrier_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented yet");
 }
 
 /* allgatherv */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allgatherv_impl(const void* send_buf,
-                                                       size_t send_count,
-                                                       void* recv_buf,
-                                                       const ccl::vector_class<size_t>& recv_counts,
-                                                       ccl::datatype dtype,
-                                                       const ccl::stream::impl_value_t& stream,
-                                                       const ccl::allgatherv_attr& attr,
-                                                       const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::allgatherv_impl(
+    const void* send_buf,
+    size_t send_count,
+    void* recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allgatherv_impl(const void* send_buf,
-                                                       size_t send_count,
-                                                       const ccl::vector_class<void*>& recv_bufs,
-                                                       const ccl::vector_class<size_t>& recv_counts,
-                                                       ccl::datatype dtype,
-                                                       const ccl::stream::impl_value_t& stream,
-                                                       const ccl::allgatherv_attr& attr,
+ccl::event thread_device_group_ring_communicator::allgatherv_impl(
+    const void* send_buf,
+    size_t send_count,
+    const ccl::vector_class<void*>& recv_bufs,
+    const ccl::vector_class<size_t>& recv_counts,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
 
-                                                       const ccl::vector_class<ccl::event>& deps) {
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* allreduce */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allreduce_impl(const void* send_buf,
-                                                      void* recv_buf,
-                                                      size_t count,
-                                                      ccl::datatype dtype,
-                                                      ccl::reduction reduction,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::allreduce_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::allreduce_impl(
+    const void* send_buf,
+    void* recv_buf,
+    size_t count,
+    ccl::datatype dtype,
+    ccl::reduction reduction,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allreduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoall */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoall_impl(const void* send_buf,
-                                                     void* recv_buf,
-                                                     size_t count,
-                                                     ccl::datatype dtype,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::alltoall_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::alltoall_impl(
+    const void* send_buf,
+    void* recv_buf,
+    size_t count,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf,
-                                                     const ccl::vector_class<void*>& recv_buf,
-                                                     size_t count,
-                                                     ccl::datatype dtype,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::alltoall_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::alltoall_impl(
+    const ccl::vector_class<void*>& send_buf,
+    const ccl::vector_class<void*>& recv_buf,
+    size_t count,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoallv */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoallv_impl(const void* send_buf,
-                                                      const ccl::vector_class<size_t>& send_counts,
-                                                      void* recv_buf,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      ccl::datatype dtype,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::alltoallv_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::alltoallv_impl(
+    const void* send_buf,
+    const ccl::vector_class<size_t>& send_counts,
+    void* recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoallv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf,
-                                                      const ccl::vector_class<size_t>& send_counts,
-                                                      ccl::vector_class<void*> recv_buf,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      ccl::datatype dtype,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::alltoallv_attr& attr,
+ccl::event thread_device_group_ring_communicator::alltoallv_impl(
+    const ccl::vector_class<void*>& send_buf,
+    const ccl::vector_class<size_t>& send_counts,
+    ccl::vector_class<void*> recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    ccl::datatype dtype,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoallv_attr& attr,
 
-                                                      const ccl::vector_class<ccl::event>& dep) {
+    const ccl::vector_class<ccl::event>& dep) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* bcast */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::broadcast_impl(void* buf,
-                                                      size_t count,
-                                                      ccl::datatype dtype,
-                                                      size_t root,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::broadcast_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::broadcast_impl(
+    void* buf,
+    size_t count,
+    ccl::datatype dtype,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::broadcast_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::reduce_impl(const void* send_buf,
-                                                   void* recv_buf,
-                                                   size_t count,
-                                                   ccl::datatype dtype,
-                                                   ccl::reduction reduction,
-                                                   size_t root,
-                                                   const ccl::stream::impl_value_t& stream,
-                                                   const ccl::reduce_attr& attr,
-                                                   const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::reduce_impl(
+    const void* send_buf,
+    void* recv_buf,
+    size_t count,
+    ccl::datatype dtype,
+    ccl::reduction reduction,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::reduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce_scatter */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::reduce_scatter_impl(
+ccl::event thread_device_group_ring_communicator::reduce_scatter_impl(
     const void* send_buf,
     void* recv_buf,
     size_t recv_count,
@@ -196,8 +196,7 @@ thread_device_group_ring_communicator::reduce_scatter_impl(
 }
 
 /* sparse_allreduce */
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::sparse_allreduce_impl(
+ccl::event thread_device_group_ring_communicator::sparse_allreduce_impl(
     const void* send_ind_buf,
     size_t send_ind_count,
     const void* send_val_buf,
@@ -216,129 +215,7 @@ thread_device_group_ring_communicator::sparse_allreduce_impl(
     return {};
 }
 
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_ring_communicator, char);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_ring_communicator, int);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_ring_communicator, int64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_ring_communicator, uint64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_ring_communicator, float);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(thread_device_group_ring_communicator, double);
-
+COMM_INTERFACE_COLL_INSTANTIATION(thread_device_group_ring_communicator);
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_ring_communicator,
-                                                cl::sycl::buffer<char COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_ring_communicator,
-                                                cl::sycl::buffer<int COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_ring_communicator,
-                                                cl::sycl::buffer<int64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_ring_communicator,
-                                                cl::sycl::buffer<uint64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_ring_communicator,
-                                                cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(thread_device_group_ring_communicator,
-                                                cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              char,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              char,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              char,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              char,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              char,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              char,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              char,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              int64_t,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              uint64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              uint64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              uint64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              uint64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              uint64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              uint64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(thread_device_group_ring_communicator,
-                                                              uint64_t,
-                                                              uint64_t);
-
-#ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    thread_device_group_ring_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    thread_device_group_ring_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    thread_device_group_ring_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    thread_device_group_ring_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(thread_device_group_ring_communicator);
+#endif /* CCL_ENABLE_SYCL */
diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp
index 9b0e408af..12a68505d 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator.hpp
@@ -31,24 +31,22 @@ class thread_device_group_ring_communicator
                                            ccl::device_topology_type::ring,
                                            ccl::gpu_communicator_traits>;
 
-    using coll_request_t = ccl::event;
-
     thread_device_group_ring_communicator(ccl::unified_device_type&& device,
-                                          ccl::unified_device_context_type&& ctx,
+                                          ccl::unified_context_type&& ctx,
                                           size_t thread_idx,
                                           size_t process_idx,
                                           const ccl::comm_split_attr& attr);
 
     void visit(ccl::gpu_comm_attr& comm_attr) override;
 
-    coll_request_t barrier(const ccl::stream::impl_value_t& stream,
-                           const ccl::barrier_attr& attr,
-                           const ccl::vector_class<ccl::event>& deps) override;
+    ccl::event barrier(const ccl::stream::impl_value_t& stream,
+                       const ccl::barrier_attr& attr,
+                       const ccl::vector_class<ccl::event>& deps) override;
 
-    DEVICE_COMM_IMPL_DECLARATION
-    DEVICE_COMM_IMPL_CLASS_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION
+    COMM_IMPL_DECLARATION
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
 
 private:
     std::shared_ptr<native::thread_group_context> ctx;
diff --git a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp
index cf4117119..765cea1d2 100644
--- a/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/thread_group/thread_ring_communicator_impl.hpp
@@ -25,34 +25,33 @@
 
 /* allgatherv */
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allgatherv_impl(const buffer_type& send_buf,
-                                                       size_t send_count,
-                                                       buffer_type& recv_buf,
-                                                       const ccl::vector_class<size_t>& recv_counts,
-                                                       const ccl::stream::impl_value_t& stream,
-                                                       const ccl::allgatherv_attr& attr,
-                                                       const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::allgatherv_impl(
+    const buffer_type& send_buf,
+    size_t send_count,
+    buffer_type& recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allgatherv_impl(const buffer_type* send_buf,
-                                                       size_t send_count,
-                                                       ccl::vector_class<buffer_type*>& recv_buf,
-                                                       const ccl::vector_class<size_t>& recv_counts,
-                                                       const ccl::stream::impl_value_t& stream,
-                                                       const ccl::allgatherv_attr& attr,
-
-                                                       const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::allgatherv_impl(
+    const buffer_type* send_buf,
+    size_t send_count,
+    ccl::vector_class<buffer_type*>& recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
+
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allgatherv_impl(
+ccl::event thread_device_group_ring_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -66,14 +65,14 @@ thread_device_group_ring_communicator::allgatherv_impl(
 }
 
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allgatherv_impl(const buffer_type* send_buf,
-                                                       size_t send_count,
-                                                       buffer_type* recv_buf,
-                                                       const ccl::vector_class<size_t>& recv_counts,
-                                                       const ccl::stream::impl_value_t& stream,
-                                                       const ccl::allgatherv_attr& attr,
-                                                       const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::allgatherv_impl(
+    const buffer_type* send_buf,
+    size_t send_count,
+    buffer_type* recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allgatherv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     using namespace native;
 
     static constexpr ccl::group_split_type group_id = base_t::topology_type();
@@ -85,7 +84,7 @@ thread_device_group_ring_communicator::allgatherv_impl(const buffer_type* send_b
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     size_t ring_index = 0;
     LOG_DEBUG("communicator for device idx: ",
               get_device_path(),
@@ -124,6 +123,7 @@ thread_device_group_ring_communicator::allgatherv_impl(const buffer_type* send_b
                            thread_id,
                            *community,
                            real_device_it->second,
+                           this->get_native_context(),
                            send_entry_buffer,
                            send_count,
                            recv_entry_buffer,
@@ -143,6 +143,7 @@ thread_device_group_ring_communicator::allgatherv_impl(const buffer_type* send_b
                         thread_id,
                         *community,
                         virtual_device_it->second,
+                        this->get_native_context(),
                         send_entry_buffer,
                         send_count,
                         recv_entry_buffer,
@@ -155,20 +156,19 @@ thread_device_group_ring_communicator::allgatherv_impl(const buffer_type* send_b
     if (schedule) {
         LOG_DEBUG("Device group finalized");
     }
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 /* allreduce */
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allreduce_impl(const buffer_type* send_buf,
-                                                      buffer_type* recv_buf,
-                                                      size_t count,
-                                                      ccl::reduction reduction,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::allreduce_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::allreduce_impl(
+    const buffer_type* send_buf,
+    buffer_type* recv_buf,
+    size_t count,
+    ccl::reduction reduction,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allreduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     using namespace native;
 
     static constexpr ccl::group_split_type group_id = base_t::topology_type();
@@ -180,7 +180,7 @@ thread_device_group_ring_communicator::allreduce_impl(const buffer_type* send_bu
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     size_t ring_index = 0;
     LOG_DEBUG("communicator for device idx: ",
               get_device_path(),
@@ -219,6 +219,7 @@ thread_device_group_ring_communicator::allreduce_impl(const buffer_type* send_bu
                            thread_id,
                            *community,
                            real_device_it->second,
+                           this->get_native_context(),
                            send_entry_buffer,
                            recv_entry_buffer,
                            count,
@@ -238,6 +239,7 @@ thread_device_group_ring_communicator::allreduce_impl(const buffer_type* send_bu
                         thread_id,
                         *community,
                         virtual_device_it->second,
+                        this->get_native_context(),
                         send_entry_buffer,
                         recv_entry_buffer,
                         count,
@@ -250,38 +252,36 @@ thread_device_group_ring_communicator::allreduce_impl(const buffer_type* send_bu
     if (schedule) {
         LOG_DEBUG("Device group finalized");
     }
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::allreduce_impl(const buffer_type& send_buf,
-                                                      buffer_type& recv_buf,
-                                                      size_t count,
-                                                      ccl::reduction reduction,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::allreduce_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::allreduce_impl(
+    const buffer_type& send_buf,
+    buffer_type& recv_buf,
+    size_t count,
+    ccl::reduction reduction,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::allreduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoall */
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoall_impl(const buffer_type* send_buf,
-                                                     buffer_type* recv_buf,
-                                                     size_t count,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::alltoall_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::alltoall_impl(
+    const buffer_type* send_buf,
+    buffer_type* recv_buf,
+    size_t count,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoall_impl(
+ccl::event thread_device_group_ring_communicator::alltoall_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<buffer_type*>& recv_buf,
     size_t count,
@@ -293,19 +293,18 @@ thread_device_group_ring_communicator::alltoall_impl(
 }
 
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoall_impl(const buffer_type& send_buf,
-                                                     buffer_type& recv_buf,
-                                                     size_t count,
-                                                     const ccl::stream::impl_value_t& stream,
-                                                     const ccl::alltoall_attr& attr,
-                                                     const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::alltoall_impl(
+    const buffer_type& send_buf,
+    buffer_type& recv_buf,
+    size_t count,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoall_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoall_impl(
+ccl::event thread_device_group_ring_communicator::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -318,8 +317,7 @@ thread_device_group_ring_communicator::alltoall_impl(
 
 /* alltoallv */
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoallv_impl(
+ccl::event thread_device_group_ring_communicator::alltoallv_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<buffer_type*>& recv_buf,
@@ -332,20 +330,19 @@ thread_device_group_ring_communicator::alltoallv_impl(
 }
 
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoallv_impl(const buffer_type& send_buf,
-                                                      const ccl::vector_class<size_t>& send_counts,
-                                                      buffer_type& recv_buf,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::alltoallv_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::alltoallv_impl(
+    const buffer_type& send_buf,
+    const ccl::vector_class<size_t>& send_counts,
+    buffer_type& recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoallv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoallv_impl(
+ccl::event thread_device_group_ring_communicator::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -359,14 +356,14 @@ thread_device_group_ring_communicator::alltoallv_impl(
 }
 
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::alltoallv_impl(const buffer_type* send_buf,
-                                                      const ccl::vector_class<size_t>& send_counts,
-                                                      buffer_type* recv_buf,
-                                                      const ccl::vector_class<size_t>& recv_counts,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::alltoallv_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::alltoallv_impl(
+    const buffer_type* send_buf,
+    const ccl::vector_class<size_t>& send_counts,
+    buffer_type* recv_buf,
+    const ccl::vector_class<size_t>& recv_counts,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::alltoallv_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     using namespace native;
     static constexpr ccl::group_split_type group_id = base_t::topology_type();
     static constexpr ccl::device_topology_type class_id = base_t::topology_class();
@@ -377,7 +374,7 @@ thread_device_group_ring_communicator::alltoallv_impl(const buffer_type* send_bu
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     size_t ring_index = 0;
     LOG_DEBUG("communicator for device idx: ",
               get_device_path(),
@@ -416,6 +413,7 @@ thread_device_group_ring_communicator::alltoallv_impl(const buffer_type* send_bu
                            thread_id,
                            *community,
                            real_device_it->second,
+                           this->get_native_context(),
                            send_entry_buffer,
                            send_counts.data(),
                            recv_entry_buffer,
@@ -435,6 +433,7 @@ thread_device_group_ring_communicator::alltoallv_impl(const buffer_type* send_bu
                         thread_id,
                         *community,
                         virtual_device_it->second,
+                        this->get_native_context(),
                         send_entry_buffer,
                         send_counts.data(),
                         recv_entry_buffer,
@@ -447,19 +446,18 @@ thread_device_group_ring_communicator::alltoallv_impl(const buffer_type* send_bu
     if (schedule) {
         LOG_DEBUG("Device group finalized");
     }
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 /* bcast */
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::broadcast_impl(buffer_type* buf,
-                                                      size_t count,
-                                                      size_t root,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::broadcast_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::broadcast_impl(
+    buffer_type* buf,
+    size_t count,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::broadcast_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     using namespace native;
 
     static constexpr ccl::group_split_type group_id = base_t::topology_type();
@@ -471,7 +469,7 @@ thread_device_group_ring_communicator::broadcast_impl(buffer_type* buf,
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     size_t ring_index = 0;
     LOG_DEBUG("communicator for device idx: ",
               get_device_path(),
@@ -505,6 +503,7 @@ thread_device_group_ring_communicator::broadcast_impl(buffer_type* buf,
                            thread_id,
                            *community,
                            real_device_it->second,
+                           this->get_native_context(),
                            entry_buffer,
                            count,
                            root,
@@ -522,6 +521,7 @@ thread_device_group_ring_communicator::broadcast_impl(buffer_type* buf,
                                thread_id,
                                *community,
                                virtual_device_it->second,
+                               this->get_native_context(),
                                entry_buffer,
                                count,
                                root,
@@ -533,33 +533,32 @@ thread_device_group_ring_communicator::broadcast_impl(buffer_type* buf,
     if (schedule) {
         LOG_DEBUG("Device group finalized");
     }
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::broadcast_impl(buffer_type& buf,
-                                                      size_t count,
-                                                      size_t root,
-                                                      const ccl::stream::impl_value_t& stream,
-                                                      const ccl::broadcast_attr& attr,
-                                                      const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::broadcast_impl(
+    buffer_type& buf,
+    size_t count,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::broadcast_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce */
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::reduce_impl(const buffer_type* send_buf,
-                                                   buffer_type* recv_buf,
-                                                   size_t count,
-                                                   ccl::reduction reduction,
-                                                   size_t root,
-                                                   const ccl::stream::impl_value_t& stream,
-                                                   const ccl::reduce_attr& attr,
-                                                   const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::reduce_impl(
+    const buffer_type* send_buf,
+    buffer_type* recv_buf,
+    size_t count,
+    ccl::reduction reduction,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::reduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     using namespace native;
 
     static constexpr ccl::group_split_type group_id = base_t::topology_type();
@@ -571,7 +570,7 @@ thread_device_group_ring_communicator::reduce_impl(const buffer_type* send_buf,
             " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
     }
 
-    size_t comm_rank = rank();
+    int comm_rank = rank();
     size_t ring_index = 0;
     LOG_DEBUG("communicator for device idx: ",
               get_device_path(),
@@ -610,6 +609,7 @@ thread_device_group_ring_communicator::reduce_impl(const buffer_type* send_buf,
                            thread_id,
                            *community,
                            real_device_it->second,
+                           this->get_native_context(),
                            send_entry_buffer,
                            recv_entry_buffer,
                            count,
@@ -629,6 +629,7 @@ thread_device_group_ring_communicator::reduce_impl(const buffer_type* send_buf,
                                thread_id,
                                *community,
                                virtual_device_it->second,
+                               this->get_native_context(),
                                send_entry_buffer,
                                recv_entry_buffer,
                                count,
@@ -643,28 +644,26 @@ thread_device_group_ring_communicator::reduce_impl(const buffer_type* send_buf,
         LOG_DEBUG("Device group finalized");
     }
 
-    return std::unique_ptr<ccl::event_impl>(
-        new ccl::gpu_shared_event_impl(std::move(schedule)));
+    return std::unique_ptr<ccl::event_impl>(new ccl::gpu_shared_event_impl(std::move(schedule)));
 }
 
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::reduce_impl(const buffer_type& send_buf,
-                                                   buffer_type& recv_buf,
-                                                   size_t count,
-                                                   ccl::reduction reduction,
-                                                   size_t root,
-                                                   const ccl::stream::impl_value_t& stream,
-                                                   const ccl::reduce_attr& attr,
-                                                   const ccl::vector_class<ccl::event>& deps) {
+ccl::event thread_device_group_ring_communicator::reduce_impl(
+    const buffer_type& send_buf,
+    buffer_type& recv_buf,
+    size_t count,
+    ccl::reduction reduction,
+    int root,
+    const ccl::stream::impl_value_t& stream,
+    const ccl::reduce_attr& attr,
+    const ccl::vector_class<ccl::event>& deps) {
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* reduce_scatter */
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::reduce_scatter_impl(
+ccl::event thread_device_group_ring_communicator::reduce_scatter_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t recv_count,
@@ -676,8 +675,7 @@ thread_device_group_ring_communicator::reduce_scatter_impl(
     return {};
 }
 template <class buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::reduce_scatter_impl(
+ccl::event thread_device_group_ring_communicator::reduce_scatter_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t recv_count,
@@ -691,8 +689,7 @@ thread_device_group_ring_communicator::reduce_scatter_impl(
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::sparse_allreduce_impl(
+ccl::event thread_device_group_ring_communicator::sparse_allreduce_impl(
     const index_buffer_type* send_ind_buf,
     size_t send_ind_count,
     const value_buffer_type* send_val_buf,
@@ -710,8 +707,7 @@ thread_device_group_ring_communicator::sparse_allreduce_impl(
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-thread_device_group_ring_communicator::coll_request_t
-thread_device_group_ring_communicator::sparse_allreduce_impl(
+ccl::event thread_device_group_ring_communicator::sparse_allreduce_impl(
     const index_buffer_container_type& send_ind_buf,
     size_t send_ind_count,
     const value_buffer_container_type& send_val_buf,
diff --git a/src/common/comm/l0/communicator/typed_base_communicator.hpp b/src/common/comm/l0/communicator/typed_base_communicator.hpp
index 0b903e175..95043c18e 100644
--- a/src/common/comm/l0/communicator/typed_base_communicator.hpp
+++ b/src/common/comm/l0/communicator/typed_base_communicator.hpp
@@ -17,6 +17,7 @@
 
 #include "common/comm/l0/communicator/base_communicator.hpp"
 #include "common/comm/l0/device_community_holder.hpp"
+#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
 
 /*
 namespace native
@@ -68,7 +69,7 @@ class typed_base_communicator : public base_communicator {
     ccl::communicator_interface_ptr split(const ccl::comm_split_attr& attr) override;
 
     typed_base_communicator(ccl::unified_device_type&& device,
-                            ccl::unified_device_context_type&& ctx,
+                            ccl::unified_context_type&& ctx,
                             size_t thread_idx,
                             size_t process_idx,
                             const ccl::comm_split_attr& attr);
@@ -81,65 +82,14 @@ class typed_base_communicator : public base_communicator {
 
     bool is_ready() const override;
 
-    // communicator interfaces implementation
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION__VOID;
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(char);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(int);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(int64_t);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(uint64_t);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(float);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(double);
-
-#ifdef CCL_ENABLE_SYCL
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<char COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<uint64_t COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION__VOID;
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, uint64_t);
+    native::ccl_driver_context_ptr get_native_context() {
+        return native::get_runtime_context(context.get());
+    }
 
+    COMM_INTERFACE_COLL_METHODS(DEFINITION);
 #ifdef CCL_ENABLE_SYCL
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>,
-                                                  cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>,
-                                                  cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                  cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                  cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+    SYCL_COMM_INTERFACE_COLL_METHODS(DEFINITION);
+#endif /* CCL_ENABLE_SYCL */
 
     // Device community interface
     /*    template<class device_t>
diff --git a/src/common/comm/l0/communicator/typed_base_communicator_impl.hpp b/src/common/comm/l0/communicator/typed_base_communicator_impl.hpp
index 7b1558bae..53d0d23fc 100644
--- a/src/common/comm/l0/communicator/typed_base_communicator_impl.hpp
+++ b/src/common/comm/l0/communicator/typed_base_communicator_impl.hpp
@@ -14,8 +14,8 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/comm/l0/communicator/typed_base_communicator.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
 #include "common/comm/l0/context/thread_group_ctx.hpp"
@@ -31,11 +31,12 @@
 template <TEMPLATE_DECL_ARG>
 typed_base_communicator<TEMPLATE_DEF_ARG>::typed_base_communicator(
     ccl::unified_device_type&& owned_device,
-    ccl::unified_device_context_type&& owned_ctx,
+    ccl::unified_context_type&& owned_ctx,
     size_t thread_idx,
     size_t process_idx,
     const ccl::comm_split_attr& attr)
-        : base_communicator(std::move(owned_device), std::move(owned_ctx),
+        : base_communicator(std::move(owned_device),
+                            std::move(owned_ctx),
                             thread_idx,
                             process_idx /*, comm_attr*/,
                             attr) {
@@ -130,7 +131,7 @@ native::indexed_device_container<device_t>& typed_base_communicator<TEMPLATE_DEF
 */
 template <TEMPLATE_DECL_ARG>
 std::string typed_base_communicator<TEMPLATE_DEF_ARG>::to_string() const {
-    native::details::printer<self_t::topology_type(), self_t::topology_class()> p;
+    native::detail::printer<self_t::topology_type(), self_t::topology_class()> p;
     ccl_tuple_for_each(device_community_impl->get_device_storage(), p);
     return std::string("Rank (") + std::to_string(rank()) + "/" + std::to_string(size()) +
            "\nGroup id: " + ::to_string(self_t::topology_type()) +
@@ -138,30 +139,30 @@ std::string typed_base_communicator<TEMPLATE_DEF_ARG>::to_string() const {
 }
 
 template <TEMPLATE_DECL_ARG>
-ccl::communicator_interface_ptr
-typed_base_communicator<TEMPLATE_DEF_ARG>::split(const ccl::comm_split_attr& attr) {
+ccl::communicator_interface_ptr typed_base_communicator<TEMPLATE_DEF_ARG>::split(
+    const ccl::comm_split_attr& attr) {
     if (!attr.is_valid<ccl::comm_split_attr_id::group>()) {
         throw ccl::exception(std::string(__FUNCTION__) +
-                        " - TODO `comm_split_attr`: supports `group` only");
+                             " - TODO `comm_split_attr`: supports `group` only");
     }
-    //TODO
-    #ifdef MULTI_GPU_SUPPORT
-        auto id = get_impl()->get_comm_group_id();
-        ccl::group_context::comm_group_t my_group =
-            ccl::group_context::instance().get_existing_group_by_id(id);
-        #ifdef CCL_ENABLE_SYCL
-            auto ctx = get_impl()->get_context();
-            return my_group->create_communicator_from_group<cl::sycl::device>(get_device(), ctx, attr);
-        #else
-            #ifdef MULTI_GPU_SUPPORT
-                auto ctx = get_impl()->get_context();
-                return my_group->create_communicator_from_group(get_impl()->get_device_path(), ctx, attr);
-            #endif
-        #endif
-    #else
-        throw ccl::exception(std::string(__FUNCTION__) + " - TODO `comm_split_attr`: unsupported");
-        return this;
-    #endif
+//TODO
+#ifdef MULTI_GPU_SUPPORT
+    auto id = get_impl()->get_comm_group_id();
+    ccl::group_context::comm_group_t my_group =
+        ccl::group_context::instance().get_existing_group_by_id(id);
+#ifdef CCL_ENABLE_SYCL
+    auto ctx = get_impl()->get_context();
+    return my_group->create_communicator_from_group<cl::sycl::device>(get_device(), ctx, attr);
+#else
+#ifdef MULTI_GPU_SUPPORT
+    auto ctx = get_impl()->get_context();
+    return my_group->create_communicator_from_group(get_impl()->get_device_path(), ctx, attr);
+#endif
+#endif
+#else
+    throw ccl::exception(std::string(__FUNCTION__) + " - TODO `comm_split_attr`: unsupported");
+    return this;
+#endif
 }
 
 #undef TEMPLATE_DECL_ARG
diff --git a/src/common/comm/l0/context/base_scaling_ctx.hpp b/src/common/comm/l0/context/base_scaling_ctx.hpp
index 95ff7f92d..2a31c32d2 100644
--- a/src/common/comm/l0/context/base_scaling_ctx.hpp
+++ b/src/common/comm/l0/context/base_scaling_ctx.hpp
@@ -53,9 +53,7 @@ class base_scaling_ctx {
         return static_cast<const ctx_impl_t*>(this);
     }
 
-    template <ccl::group_split_type group_id,
-              ccl::device_topology_type class_id,
-              class device_t>
+    template <ccl::group_split_type group_id, ccl::device_topology_type class_id, class device_t>
     void attach(device_t* obj) {
         static_assert(std::is_base_of<proxy_observer<device_t>, device_t>::value,
                       "Only `proxy_observer` derived class can be attached to context");
@@ -69,10 +67,9 @@ class base_scaling_ctx {
         static_assert(std::is_base_of<proxy_observer<device_t>, device_t>::value,
                       "Only `proxy_observer` derived class can invoke context");
 
-        get_this()->invoke_ctx_observer(
-            obj,
-            std::integral_constant<ccl::group_split_type, group_id>{},
-            std::forward<Args>(args)...);
+        get_this()->invoke_ctx_observer(obj,
+                                        std::integral_constant<ccl::group_split_type, group_id>{},
+                                        std::forward<Args>(args)...);
     }
 
     // helpers
diff --git a/src/common/comm/l0/context/device_group_ctx.cpp b/src/common/comm/l0/context/device_group_ctx.cpp
index 3d14a7a68..7e11f6cdd 100644
--- a/src/common/comm/l0/context/device_group_ctx.cpp
+++ b/src/common/comm/l0/context/device_group_ctx.cpp
@@ -29,7 +29,7 @@ namespace native {
 
 std::shared_ptr<device_group_context> device_group_context::create(
     const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_t& group_device_ids,
+    const ccl::device_indices_type& group_device_ids,
     device_storage& devices) {
     std::shared_ptr<device_group_context> ret(
         new device_group_context(comm_addr, group_device_ids));
@@ -54,7 +54,7 @@ std::shared_ptr<device_group_context> device_group_context::create(
                   " build RING topology. Log:\n ",
                   ss.str());
 
-        /*        native::details::printer<device_group_ring_topology::type()> p;
+        /*        native::detail::printer<device_group_ring_topology::type()> p;
         ccl_tuple_for_each(ring_device_topology->get_device_storage(), p);
         LOG_INFO("Device Group ", context_addr.to_string(), " RING topology:\n", p.to_string());
 */
@@ -74,7 +74,7 @@ std::shared_ptr<device_group_context> device_group_context::create(
             abort();
         }
         LOG_DEBUG("Device Group Context for ", context_addr.to_string(), " build RING topology. Log:\n ", ss.str());
-        native::details::printer<device_group_ring_topology::type()> p;
+        native::detail::printer<device_group_ring_topology::type()> p;
         ccl_tuple_for_each(ring_device_topology->get_device_storage(), p);
         LOG_INFO("Device Group ", context_addr.to_string(), " RING topology:\n", p.to_string());
         LOG_INFO("Device Group ", context_addr.to_string(), " A2A topology:\nTODO!");
@@ -87,7 +87,7 @@ std::shared_ptr<device_group_context> device_group_context::create(
 }
 
 device_group_context::device_group_context(const ccl::context_comm_addr& comm_addr,
-                                           const ccl::device_indices_t& group_device_ids)
+                                           const ccl::device_indices_type& group_device_ids)
         : scaling_context_base(),
           device_indices(group_device_ids),
           context_addr(comm_addr) {
@@ -97,7 +97,7 @@ device_group_context::device_group_context(const ccl::context_comm_addr& comm_ad
 
 device_group_context::~device_group_context() {}
 
-const ccl::device_indices_t& device_group_context::get_group_device_indices() const {
+const ccl::device_indices_type& device_group_context::get_group_device_indices() const {
     return device_indices;
 }
 
diff --git a/src/common/comm/l0/context/device_group_ctx.hpp b/src/common/comm/l0/context/device_group_ctx.hpp
index 14c8be84e..9c814ee2c 100644
--- a/src/common/comm/l0/context/device_group_ctx.hpp
+++ b/src/common/comm/l0/context/device_group_ctx.hpp
@@ -19,7 +19,7 @@
 #include <memory>
 #include <mutex>
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "supported_topologies.hpp"
 #include "common/comm/l0/gpu_comm_attr.hpp"
 #include "common/comm/l0/context/scaling_ctx/numa_ctx.hpp"
@@ -57,7 +57,7 @@ struct device_group_context : numa_ctx<device_group_context, SUPPORTED_TOPOLOGY_
     using topologies = device_group_community_holder<ccl::group_split_type::thread,
                                                      SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST>;
 
-    ccl::device_indices_t device_indices;
+    ccl::device_indices_type device_indices;
     topologies device_topology;
 
     template <ccl::device_topology_type class_id>
@@ -70,9 +70,9 @@ struct device_group_context : numa_ctx<device_group_context, SUPPORTED_TOPOLOGY_
 
     static std::shared_ptr<device_group_context> create(
         const ccl::context_comm_addr& comm_addr,
-        const ccl::device_indices_t& group_device_ids,
+        const ccl::device_indices_type& group_device_ids,
         device_storage& devices);
-    const ccl::device_indices_t& get_group_device_indices() const;
+    const ccl::device_indices_type& get_group_device_indices() const;
 
     ccl::context_comm_addr context_addr;
     std::unique_ptr<device_group_scheduler> scheduler_impl;
@@ -82,6 +82,6 @@ struct device_group_context : numa_ctx<device_group_context, SUPPORTED_TOPOLOGY_
 
 private:
     device_group_context(const ccl::context_comm_addr& comm_addr,
-                         const ccl::device_indices_t& device_mask);
+                         const ccl::device_indices_type& device_mask);
 };
 } // namespace native
diff --git a/src/common/comm/l0/context/device_storage.cpp b/src/common/comm/l0/context/device_storage.cpp
index 5753acad8..de7b214bc 100644
--- a/src/common/comm/l0/context/device_storage.cpp
+++ b/src/common/comm/l0/context/device_storage.cpp
@@ -20,7 +20,7 @@ namespace native {
 
 std::shared_ptr<specific_plain_device_storage> device_storage::create_devices_by_indices(
     size_t thread_id,
-    const ccl::device_indices_t& indices) {
+    const ccl::device_indices_type& indices) {
     std::shared_ptr<specific_plain_device_storage> out_devices =
         std::make_shared<specific_plain_device_storage>();
     size_t index_in_group = 0;
@@ -67,7 +67,7 @@ std::shared_ptr<specific_plain_device_storage> device_storage::create_devices_by
 }
 
 size_t device_storage::get_storage_size() const {
-    return details::get_aggregated_size<specific_device_storage, SUPPORTED_DEVICES_DECL_LIST>(
+    return detail::get_aggregated_size<specific_device_storage, SUPPORTED_DEVICES_DECL_LIST>(
         gpu_device_storage); /*
         return get_size<ccl_gpu_comm>() +
                get_size<ccl_ipc_gpu_comm>() +
@@ -83,7 +83,7 @@ size_t device_storage::get_storage_size() const {
     template<class DeviceType, class ...Types>
     size_t get_aggregated_size() const
     {
-        return get_size<DeviceType>() + details::get_aggregated_size_helper<Types...>(gpu_device_storage);
+        return get_size<DeviceType>() + detail::get_aggregated_size_helper<Types...>(gpu_device_storage);
     }
 */
 } // namespace native
diff --git a/src/common/comm/l0/context/device_storage.hpp b/src/common/comm/l0/context/device_storage.hpp
index 356aab9d9..3e45515f3 100644
--- a/src/common/comm/l0/context/device_storage.hpp
+++ b/src/common/comm/l0/context/device_storage.hpp
@@ -42,7 +42,7 @@ struct device_storage {
     // Result is a shared vector, which is remembered in per-thread storage
     std::shared_ptr<specific_plain_device_storage> create_devices_by_indices(
         size_t thread_id,
-        const ccl::device_indices_t& indices);
+        const ccl::device_indices_type& indices);
 
     // creation specific device type, determined from 'create_devices_by_indices'
     template <class device_t, class... Args>
diff --git a/src/common/comm/l0/context/process_group_ctx.cpp b/src/common/comm/l0/context/process_group_ctx.cpp
index c0e410c07..c8e1aeb98 100644
--- a/src/common/comm/l0/context/process_group_ctx.cpp
+++ b/src/common/comm/l0/context/process_group_ctx.cpp
@@ -64,7 +64,7 @@ process_group_context::process_group_context(std::shared_ptr<ccl::host_communica
 
 process_group_context::~process_group_context() {}
 
-bool process_group_context::delegate_sync(const ccl::device_indices_t& thread_device_indices,
+bool process_group_context::delegate_sync(const ccl::device_indices_type& thread_device_indices,
                                           ccl::context_comm_addr& comm_addr) {
     // set thread id sequencially
     //comm_addr.thread_idx = process_device_topology.size();
@@ -88,7 +88,7 @@ bool process_group_context::sync_barrier(const ccl::device_mask_t& thread_device
     return sync_barrier(ccl_device_driver::get_device_indices(thread_device_mask), comm_addr);
 }
 
-bool process_group_context::sync_barrier(const ccl::device_indices_t& thread_device_indices,
+bool process_group_context::sync_barrier(const ccl::device_indices_type& thread_device_indices,
                                          ccl::context_comm_addr& comm_addr) {
     // sync all threads at first - blocking operation
     if (!delegate_sync(thread_device_indices, comm_addr)) {
@@ -96,7 +96,7 @@ bool process_group_context::sync_barrier(const ccl::device_indices_t& thread_dev
     }
 
     //barrie mutex is locked by MASTER thread
-    const ccl::process_device_indices_t& thread_indices =
+    const ccl::process_device_indices_type& thread_indices =
         thread_group_ctx->get_thread_group_device_indices();
 
     LOG_INFO("Process (",
@@ -105,12 +105,12 @@ bool process_group_context::sync_barrier(const ccl::device_indices_t& thread_dev
              process_count,
              ") reached process group communicator barrier");
 
-    ccl::device_indices_t process_aggregated_device_indices =
+    ccl::device_indices_type process_aggregated_device_indices =
         std::accumulate(thread_indices.begin(),
                         thread_indices.end(),
-                        ccl::device_indices_t(),
-                        [](ccl::device_indices_t& partial_indices,
-                           const typename ccl::process_device_indices_t::value_type& val) {
+                        ccl::device_indices_type(),
+                        [](ccl::device_indices_type& partial_indices,
+                           const typename ccl::process_device_indices_type::value_type& val) {
                             partial_indices.insert(val.second.begin(), val.second.end());
                             return partial_indices;
                         });
@@ -124,9 +124,9 @@ bool process_group_context::sync_barrier(const ccl::device_indices_t& thread_dev
         process_idx, process_count, *this, *gpu_device_storage);
 
     {
-        const ccl::process_device_indices_t& node_mask = get_node_afinity_indices(get_host_id());
+        const ccl::process_device_indices_type& node_mask = get_node_afinity_indices(get_host_id());
         std::stringstream ss;
-        details::adjacency_matrix p2p_dependency_graph =
+        detail::adjacency_matrix p2p_dependency_graph =
             ally_process_topology.build_p2p_capability_matrix(ss, node_mask);
         ss << "\nMatrix\n" << p2p_dependency_graph << std::endl;
         if (!ally_process_topology.build_all(ss,
@@ -182,7 +182,7 @@ std::shared_ptr<ccl::host_communicator> process_group_context::get_communicator(
 }
 
 bool process_group_context::build_cluster_affinity_table(
-    const ccl::device_indices_t& process_aggregated_device_indices) {
+    const ccl::device_indices_type& process_aggregated_device_indices) {
     LOG_INFO("Node: ", my_host_name, " start build affinity table for process idx: ", process_idx);
 
     //create cluster mask affinity
@@ -200,9 +200,13 @@ bool process_group_context::build_cluster_affinity_table(
     requests.reserve(hostname_indices_requests_count);
     {
         ccl::stream::impl_value_t empty_stream{};
-        requests.push_back(ccl_communicator->allgatherv_impl(
-            &send_hostname_size, 1, receive_hostname_sizes.data(), recv_counts,
-            empty_stream, ccl::default_allgatherv_attr, {}));
+        requests.push_back(ccl_communicator->allgatherv_impl(&send_hostname_size,
+                                                             1,
+                                                             receive_hostname_sizes.data(),
+                                                             recv_counts,
+                                                             empty_stream,
+                                                             ccl::default_allgatherv_attr,
+                                                             {}));
         LOG_TRACE("Request hostname sizes, process (",
                   ccl_communicator->rank(),
                   "/",
@@ -241,14 +245,14 @@ bool process_group_context::build_cluster_affinity_table(
     LOG_DEBUG("Memory required for device indices size: ", total_device_indices_count, " count");
 
     //Serialize own devices path data
-    auto serialized_indices = details::serialize::device_path_serializer::serialize_indices(
+    auto serialized_indices = detail::serialize::device_path_serializer::serialize_indices(
         process_aggregated_device_indices);
     // TODO assert(serialized_indices.size() == receive_process_indices_sizes[process_idx] && "Indices unexpected count");
 
     decltype(serialized_indices) affinity_indices;
     std::vector<char> hostnames;
     auto indices_count_to_bytes_converter = [](size_t elements) -> size_t {
-        return elements * details::serialize::device_path_serializable::device_index_size();
+        return elements * detail::serialize::device_path_serializable::device_index_size();
     };
 
     try {
@@ -256,9 +260,13 @@ bool process_group_context::build_cluster_affinity_table(
         hostnames.resize(total_hostname_size);
 
         ccl::stream::impl_value_t empty_stream{};
-        requests.push_back(ccl_communicator->allgatherv_impl(
-            my_host_name.data(), send_hostname_size, hostnames.data(), receive_hostname_sizes,
-            empty_stream, ccl::default_allgatherv_attr, {}));
+        requests.push_back(ccl_communicator->allgatherv_impl((int8_t*)my_host_name.data(),
+                                                             send_hostname_size,
+                                                             (int8_t*)hostnames.data(),
+                                                             receive_hostname_sizes,
+                                                             empty_stream,
+                                                             ccl::default_allgatherv_attr,
+                                                             {}));
         LOG_TRACE("Submit request for hostnames. Process (",
                   ccl_communicator->rank(),
                   "/",
@@ -276,9 +284,9 @@ bool process_group_context::build_cluster_affinity_table(
                        receive_process_indices_sizes.begin(),
                        indices_count_to_bytes_converter);
         requests.push_back(ccl_communicator->allgatherv_impl(
-            reinterpret_cast<const char*>(serialized_indices.data()),
+            reinterpret_cast<const int8_t*>(serialized_indices.data()),
             serialized_indices.size(),
-            reinterpret_cast<char*>(affinity_indices.data()),
+            reinterpret_cast<int8_t*>(affinity_indices.data()),
             receive_process_indices_sizes,
             empty_stream,
             ccl::default_allgatherv_attr,
@@ -335,7 +343,7 @@ bool process_group_context::build_cluster_affinity_table(
         }
 
         //get affinity
-        ccl::device_indices_t rank_indices = details::serialize::device_path_deserializer::
+        ccl::device_indices_type rank_indices = detail::serialize::device_path_deserializer::
             deserialize_indices<std::multiset, ccl::device_index_type>(
                 affinity_mask_from_iterator,
                 affinity_mask_from_iterator + receive_process_indices_sizes[rank_index]);
@@ -373,7 +381,7 @@ const ccl::host_id process_group_context::get_host_id() const {
 const ccl::cluster_aggregated_device_mask_t& process_group_context::get_afinity_mask() const {
     return global_mask;
 }
-const ccl::cluster_device_indices_t& process_group_context::get_affinity_indices() const {
+const ccl::cluster_device_indices_type& process_group_context::get_affinity_indices() const {
     return cluster_gpu_indices;
 }
 
@@ -388,20 +396,20 @@ const ccl::process_aggregated_device_mask_t& process_group_context::get_node_afi
     return it->second;
 }
 
-const ccl::process_device_indices_t& process_group_context::get_node_afinity_indices(
+const ccl::process_device_indices_type& process_group_context::get_node_afinity_indices(
     const ccl::host_id& host) const {
     auto it = cluster_gpu_indices.find(host);
     if (it == cluster_gpu_indices.end()) {
         LOG_ERROR("Cannot get affinity indices for node: ", host);
-        static const ccl::process_device_indices_t empty;
+        static const ccl::process_device_indices_type empty;
         return empty;
     }
     return it->second;
 }
 
 void process_group_context::set_node_afinity_indices(const ccl::host_id& host,
-                                                     size_t rank_id,
-                                                     const ccl::device_indices_t& indices) {
+                                                     int rank_id,
+                                                     const ccl::device_indices_type& indices) {
     /*
     ccl::device_mask_t rank_mask = ccl_device_driver::get_device_mask(indices);
     auto& per_host_mask = global_mask[host];
@@ -475,7 +483,7 @@ std::tuple<bool, std::string> process_group_context::check_device_mask_validity_
 */
 
 void process_group_context::dump_cluster_affinity_indices(
-    const ccl::cluster_device_indices_t& indices,
+    const ccl::cluster_device_indices_type& indices,
     std::ostream& out) {
     out << "Cluster nodes: " << indices.size() << "\n";
     for (const auto& node_indices : indices) {
@@ -496,7 +504,7 @@ void process_group_context::dump_node_aggregated_mask(
 }
 void process_group_context::dump_node_aggregated_indices(
     const std::string& node_name,
-    const ccl::process_device_indices_t& indices,
+    const ccl::process_device_indices_type& indices,
     std::ostream& out) {
     if (!node_name.empty()) {
         out << "Node: " << node_name << ", processes: " << indices.size() << "\n";
@@ -518,7 +526,7 @@ void process_group_context::dump_process_mask(size_t process_id,
 }
 
 void process_group_context::dump_process_indices(size_t process_id,
-                                                 const ccl::device_indices_t& indices,
+                                                 const ccl::device_indices_type& indices,
                                                  std::ostream& out) {
     out << "Process idx: " << process_id << ", affinity: ";
     for (const auto& path : indices) {
@@ -556,9 +564,9 @@ void process_group_context::dump_process_topologies(std::ostream& out) const {
     }
 }
 
-std::vector<ccl::device_indices_t> process_group_context::get_ipc_device_indices() const {
+std::vector<ccl::device_indices_type> process_group_context::get_ipc_device_indices() const {
     std::stringstream ss;
-    ccl::process_device_indices_t node_mask_to_reorder = get_node_afinity_indices(get_host_id());
+    ccl::process_device_indices_type node_mask_to_reorder = get_node_afinity_indices(get_host_id());
     if (node_mask_to_reorder.empty()) {
         ss << "process_group_context::get_ipc_device_indices failed: empty process affinities for hostname: "
            << get_host_id() << ", cluster topology:\n";
@@ -568,7 +576,7 @@ std::vector<ccl::device_indices_t> process_group_context::get_ipc_device_indices
         throw std::runtime_error(err);
     }
 
-    std::vector<ccl::device_indices_t> ipc_device_indices;
+    std::vector<ccl::device_indices_type> ipc_device_indices;
     try {
         ipc_device_indices =
             process_group_context::get_ipc_device_indices_for_id(process_idx, node_mask_to_reorder);
@@ -583,9 +591,9 @@ std::vector<ccl::device_indices_t> process_group_context::get_ipc_device_indices
     return ipc_device_indices;
 }
 
-std::vector<ccl::device_indices_t> process_group_context::get_ipc_device_indices_for_id(
+std::vector<ccl::device_indices_type> process_group_context::get_ipc_device_indices_for_id(
     size_t process_idx,
-    ccl::process_device_indices_t node_indices) {
+    ccl::process_device_indices_type node_indices) {
     std::stringstream ss;
     auto my_process_it = node_indices.find(process_idx);
     if (my_process_it == node_indices.end()) {
@@ -598,7 +606,7 @@ std::vector<ccl::device_indices_t> process_group_context::get_ipc_device_indices
 
     node_indices.erase(my_process_it); //self indices erase, other are ipc
 
-    std::vector<ccl::device_indices_t> ipc_device_indices;
+    std::vector<ccl::device_indices_type> ipc_device_indices;
     for (const auto& mask : node_indices) {
         ipc_device_indices.push_back(mask.second);
     }
@@ -606,9 +614,9 @@ std::vector<ccl::device_indices_t> process_group_context::get_ipc_device_indices
 }
 
 void process_group_context::collect_cluster_colored_plain_graphs(
-    const details::colored_plain_graph_list& send_graph,
-    details::global_sorted_colored_plain_graphs& received_graphs) {
-    using namespace details::serialize;
+    const detail::colored_plain_graph_list& send_graph,
+    detail::global_sorted_colored_plain_graphs& received_graphs) {
+    using namespace detail::serialize;
 
     LOG_DEBUG("Collect cluster colored plain graphs, my process index: ",
               process_idx,
@@ -631,8 +639,13 @@ void process_group_context::collect_cluster_colored_plain_graphs(
                   send_count);
         ccl::stream::impl_value_t empty_stream{};
         ccl_communicator
-            ->allgatherv_impl(&send_count, 1, recv_counts_process_graph_sizes.data(), recv_counts,
-            empty_stream, ccl::default_allgatherv_attr, {})
+            ->allgatherv_impl(&send_count,
+                              1,
+                              recv_counts_process_graph_sizes.data(),
+                              recv_counts,
+                              empty_stream,
+                              ccl::default_allgatherv_attr,
+                              {})
             .wait();
     }
 
@@ -648,9 +661,9 @@ void process_group_context::collect_cluster_colored_plain_graphs(
         recv_cluster_graphs.resize(global_graph_data_size);
         ccl::stream::impl_value_t empty_stream{};
         ccl_communicator
-            ->allgatherv_impl(reinterpret_cast<char*>(my_serialized_graph.data()),
+            ->allgatherv_impl(reinterpret_cast<int8_t*>(my_serialized_graph.data()),
                               send_count,
-                              reinterpret_cast<char*>(recv_cluster_graphs.data()),
+                              reinterpret_cast<int8_t*>(recv_cluster_graphs.data()),
                               recv_counts_process_graph_sizes,
                               empty_stream,
                               ccl::default_allgatherv_attr,
@@ -674,7 +687,7 @@ void process_group_context::collect_cluster_colored_plain_graphs(
     LOG_DEBUG("Deserialize recv_cluster_graphs");
     try {
         for (process_num = 0; process_num < ccl_communicator->size(); process_num++) {
-            details::colored_plain_graph_list graph =
+            detail::colored_plain_graph_list graph =
                 device_path_deserializer::deserialize_colored_graph_list_indices(
                     recv_cluster_graphs, deserialized_bytes, offset_bytes);
             LOG_DEBUG("Process index: ",
diff --git a/src/common/comm/l0/context/process_group_ctx.hpp b/src/common/comm/l0/context/process_group_ctx.hpp
index 2d839a123..9c08d9399 100644
--- a/src/common/comm/l0/context/process_group_ctx.hpp
+++ b/src/common/comm/l0/context/process_group_ctx.hpp
@@ -56,7 +56,7 @@ struct process_group_context
     virtual //TODO use stub
         ~process_group_context();
 
-    bool sync_barrier(const ccl::device_indices_t& thread_device_indices,
+    bool sync_barrier(const ccl::device_indices_type& thread_device_indices,
                       ccl::context_comm_addr& comm_addr);
     bool sync_barrier(const ccl::device_mask_t& thread_device_mask,
                       ccl::context_comm_addr& comm_addr);
@@ -82,24 +82,25 @@ struct process_group_context
     }
 
     const ccl::cluster_aggregated_device_mask_t& get_afinity_mask() const;
-    const ccl::cluster_device_indices_t& get_affinity_indices() const;
+    const ccl::cluster_device_indices_type& get_affinity_indices() const;
 
     const ccl::process_aggregated_device_mask_t& get_node_afinity_mask(
         const ccl::host_id& host) const;
-    const ccl::process_device_indices_t& get_node_afinity_indices(const ccl::host_id& host) const;
+    const ccl::process_device_indices_type& get_node_afinity_indices(
+        const ccl::host_id& host) const;
 
     void set_node_afinity_indices(const ccl::host_id& host,
-                                  size_t rank_id,
-                                  const ccl::device_indices_t& indices);
+                                  int rank_id,
+                                  const ccl::device_indices_type& indices);
 
     const ccl::host_id get_host_id() const;
 
     std::string to_string() const;
     device_storage& get_device_storage();
-    std::vector<ccl::device_indices_t> get_ipc_device_indices() const;
-    static std::vector<ccl::device_indices_t> get_ipc_device_indices_for_id(
+    std::vector<ccl::device_indices_type> get_ipc_device_indices() const;
+    static std::vector<ccl::device_indices_type> get_ipc_device_indices_for_id(
         size_t process_idx,
-        ccl::process_device_indices_t node_indices);
+        ccl::process_device_indices_type node_indices);
 
     static void dump_cluster_affinity_mask(const ccl::cluster_aggregated_device_mask_t& mask,
                                            std::ostream& out);
@@ -110,13 +111,13 @@ struct process_group_context
                                   const ccl::device_mask_t& mask,
                                   std::ostream& out);
 
-    static void dump_cluster_affinity_indices(const ccl::cluster_device_indices_t& mask,
+    static void dump_cluster_affinity_indices(const ccl::cluster_device_indices_type& mask,
                                               std::ostream& out);
     static void dump_node_aggregated_indices(const std::string& node_name,
-                                             const ccl::process_device_indices_t& mask,
+                                             const ccl::process_device_indices_type& mask,
                                              std::ostream& out);
     static void dump_process_indices(size_t process_id,
-                                     const ccl::device_indices_t& mask,
+                                     const ccl::device_indices_type& mask,
                                      std::ostream& out);
 
     void dump_process_topologies(std::ostream& out) const;
@@ -132,14 +133,14 @@ struct process_group_context
     virtual /*TODO use stub*/
         void
         collect_cluster_colored_plain_graphs(
-            const details::colored_plain_graph_list& send_graph,
-            details::global_sorted_colored_plain_graphs& received_graphs);
+            const detail::colored_plain_graph_list& send_graph,
+            detail::global_sorted_colored_plain_graphs& received_graphs);
 
 private:
-    bool delegate_sync(const ccl::device_indices_t& thread_device_indices,
+    bool delegate_sync(const ccl::device_indices_type& thread_device_indices,
                        ccl::context_comm_addr& comm_addr);
     bool build_cluster_affinity_table(
-        const ccl::device_indices_t& process_aggregated_device_indices);
+        const ccl::device_indices_type& process_aggregated_device_indices);
 
     std::shared_ptr<ccl::host_communicator> get_communicator();
 
@@ -147,7 +148,7 @@ struct process_group_context
     std::shared_ptr<thread_group_context> thread_group_ctx;
     ccl::host_id my_host_name;
     ccl::cluster_aggregated_device_mask_t global_mask;
-    ccl::cluster_device_indices_t cluster_gpu_indices;
+    ccl::cluster_device_indices_type cluster_gpu_indices;
 
     std::unique_ptr<device_storage> gpu_device_storage;
     topologies_storage process_device_topology;
diff --git a/src/common/comm/l0/context/thread_group_ctx.cpp b/src/common/comm/l0/context/thread_group_ctx.cpp
index 43f4c4070..fe231b076 100644
--- a/src/common/comm/l0/context/thread_group_ctx.cpp
+++ b/src/common/comm/l0/context/thread_group_ctx.cpp
@@ -25,7 +25,7 @@ namespace native {
 
 thread_group_context::~thread_group_context() {}
 
-bool thread_group_context::sync_barrier(const ccl::device_indices_t& device_indices_t,
+bool thread_group_context::sync_barrier(const ccl::device_indices_type& device_indices_t,
                                         ccl::context_comm_addr& comm_addr,
                                         device_storage& devices) {
     std::shared_ptr<specific_plain_device_storage> thread_device_list;
@@ -35,12 +35,23 @@ bool thread_group_context::sync_barrier(const ccl::device_indices_t& device_indi
     //comm_addr.thread_idx = thread_device_group_ctx.size();
     aggregate_device_indices(comm_addr.thread_idx, device_indices_t);
 
-    //check on group creation final condition
-    device_group_ctx_ptr group_ctx =
-        device_group_context::create(comm_addr, device_indices_t, devices);
-    if (false == thread_device_group_ctx.insert({ comm_addr.thread_idx, group_ctx }).second) {
-        LOG_ERROR("cannot register devices group ctx for thread idx: ", comm_addr.thread_idx);
-        abort();
+    //TODO refactor device_group_creation...(Each Device Group should have REAL device independently)
+    {
+        thread_local device_storage tls_device_storage;
+        //check on group creation final condition
+        device_group_ctx_ptr group_ctx =
+            device_group_context::create(comm_addr, device_indices_t, tls_device_storage);
+        if (false == thread_device_group_ctx.insert({ comm_addr.thread_idx, group_ctx }).second) {
+            LOG_ERROR("cannot register devices group ctx for thread idx: ", comm_addr.thread_idx);
+            abort();
+        }
+    }
+    //TODO refactor device_group_creation(Each Thread Group should have unique REAL device)
+    {
+        //check on group creation final condition
+        device_group_ctx_ptr group_ctx =
+            device_group_context::create(comm_addr, device_indices_t, devices);
+        (void)group_ctx;
     }
 
     LOG_DEBUG("Thread ", comm_addr.to_string(), " reached thread group communicator barrier");
@@ -95,20 +106,21 @@ bool thread_group_context::sync_barrier(const ccl::device_indices_t& device_indi
 }
 
 void thread_group_context::aggregate_device_indices(size_t thread_id,
-                                                    const ccl::device_indices_t& new_indices) {
+                                                    const ccl::device_indices_type& new_indices) {
     per_thread_indices.insert({ thread_id, new_indices });
 }
 
-const ccl::process_device_indices_t& thread_group_context::get_thread_group_device_indices() const {
+const ccl::process_device_indices_type& thread_group_context::get_thread_group_device_indices()
+    const {
     return per_thread_indices;
 }
 
-const ccl::device_indices_t& thread_group_context::get_device_group_indices(
+const ccl::device_indices_type& thread_group_context::get_device_group_indices(
     size_t thread_id) const {
     auto it = per_thread_indices.find(thread_id);
     if (it == per_thread_indices.end()) {
         LOG_ERROR("Cannot find device group for thread: ", thread_id, ". Empty indices");
-        static const ccl::device_indices_t empty;
+        static const ccl::device_indices_type empty;
         return empty;
     }
     return it->second;
diff --git a/src/common/comm/l0/context/thread_group_ctx.hpp b/src/common/comm/l0/context/thread_group_ctx.hpp
index a460718d4..4cfd0acf7 100644
--- a/src/common/comm/l0/context/thread_group_ctx.hpp
+++ b/src/common/comm/l0/context/thread_group_ctx.hpp
@@ -42,12 +42,12 @@ struct thread_group_context : numa_ctx<thread_group_context, SUPPORTED_TOPOLOGY_
     using device_group_ctx_storage = std::map<size_t, device_group_ctx_ptr>;
 
     ~thread_group_context();
-    bool sync_barrier(const ccl::device_indices_t& thread_device_mask,
+    bool sync_barrier(const ccl::device_indices_type& thread_device_mask,
                       ccl::context_comm_addr& comm_addr,
                       device_storage& devices);
 
-    const ccl::process_device_indices_t& get_thread_group_device_indices() const;
-    const ccl::device_indices_t& get_device_group_indices(size_t thread_id) const;
+    const ccl::process_device_indices_type& get_thread_group_device_indices() const;
+    const ccl::device_indices_type& get_device_group_indices(size_t thread_id) const;
 
     template <ccl::device_topology_type class_id>
     typename std::tuple_element<class_id, typename topologies::device_topologies_t>::type&
@@ -73,10 +73,10 @@ struct thread_group_context : numa_ctx<thread_group_context, SUPPORTED_TOPOLOGY_
     const scaling_context_base& get_numa_ctx() const;
 
 private:
-    ccl::process_device_indices_t per_thread_indices;
+    ccl::process_device_indices_type per_thread_indices;
     device_group_ctx_storage thread_device_group_ctx;
     topologies_storage thread_device_topology;
 
-    void aggregate_device_indices(size_t thread_id, const ccl::device_indices_t& new_indices);
+    void aggregate_device_indices(size_t thread_id, const ccl::device_indices_type& new_indices);
 };
 } // namespace native
diff --git a/src/common/comm/l0/device_community.hpp b/src/common/comm/l0/device_community.hpp
index 4c46cedf6..00bcfa2d9 100644
--- a/src/common/comm/l0/device_community.hpp
+++ b/src/common/comm/l0/device_community.hpp
@@ -14,8 +14,8 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
 #include "common/comm/l0/device_group_routing_schema.hpp"
 #include "common/comm/l0/devices/devices_declaration.hpp"
 #include "common/comm/l0/gpu_device_types.hpp"
@@ -93,7 +93,7 @@ struct device_community {
         }
 
         // find device in topology and obtain its rank/sie
-        details::rank_getter<group_id, schema_id> initializer(device_id, registered_device_id);
+        detail::rank_getter<group_id, schema_id> initializer(device_id, registered_device_id);
         ccl_tuple_for_each(get_device_storage(), initializer);
 
         // copy shared data from community addr
@@ -111,7 +111,7 @@ struct device_community {
     std::string to_string() const {
         std::stringstream result;
         result << "Topology: " << ::to_string(schema_id) << "\n";
-        native::details::printer<group_id, schema_id> p;
+        native::detail::printer<group_id, schema_id> p;
         if (devices) {
             ccl_tuple_for_each(*devices, p);
             result << p.to_string();
diff --git a/src/common/comm/l0/device_community_holder.hpp b/src/common/comm/l0/device_community_holder.hpp
index f91727d28..f10e038df 100644
--- a/src/common/comm/l0/device_community_holder.hpp
+++ b/src/common/comm/l0/device_community_holder.hpp
@@ -18,7 +18,7 @@
 #include <tuple>
 #include <vector>
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/comm/l0/device_community.hpp"
 
 namespace native {
diff --git a/src/common/comm/l0/device_community_holder_impl.hpp b/src/common/comm/l0/device_community_holder_impl.hpp
index bc383cb61..c7d44d834 100644
--- a/src/common/comm/l0/device_community_holder_impl.hpp
+++ b/src/common/comm/l0/device_community_holder_impl.hpp
@@ -19,9 +19,8 @@
 #include "common/comm/l0/device_community_holder_utils.hpp"
 
 namespace native {
-#define TEMPLATE_DECL_ARG \
-    ccl::group_split_type group_id, ccl::device_topology_type... class_id
-#define TEMPLATE_DEF_ARG group_id, class_id...
+#define TEMPLATE_DECL_ARG ccl::group_split_type group_id, ccl::device_topology_type... class_id
+#define TEMPLATE_DEF_ARG  group_id, class_id...
 
 // community impl
 template <ccl::device_topology_type class_id>
@@ -63,7 +62,7 @@ device_group_community_holder<TEMPLATE_DEF_ARG>::get_community() {
 template <TEMPLATE_DECL_ARG>
 std::string device_group_community_holder<TEMPLATE_DEF_ARG>::to_string() const {
     std::stringstream ss;
-    details::device_community_container_print_helper<group_id> p(ss);
+    detail::device_community_container_print_helper<group_id> p(ss);
     ccl_tuple_for_each(typed_communities, p);
     return ss.str();
 }
diff --git a/src/common/comm/l0/device_community_holder_utils.hpp b/src/common/comm/l0/device_community_holder_utils.hpp
index 924642086..db9f2a007 100644
--- a/src/common/comm/l0/device_community_holder_utils.hpp
+++ b/src/common/comm/l0/device_community_holder_utils.hpp
@@ -24,7 +24,7 @@ namespace native {
  * Declarations
  *
  */
-namespace details {
+namespace detail {
 /**
  * class for pretty topology printing
  */
@@ -42,14 +42,14 @@ struct device_community_container_print_helper {
 private:
     std::ostream& output;
 };
-} // namespace details
+} // namespace detail
 
 /**
  *
  * Definitions
  *
  */
-namespace details {
+namespace detail {
 
 /**
  * class for pretty topology printing definition
@@ -82,5 +82,5 @@ void device_community_container_print_helper<group_id>::operator()(
         output << "\t\t" << topology_container.torn_apart_rings[i]->template to_string<group_id>();
     }
 }
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/device_community_utils.hpp b/src/common/comm/l0/device_community_utils.hpp
index d28fd657d..f3f27debc 100644
--- a/src/common/comm/l0/device_community_utils.hpp
+++ b/src/common/comm/l0/device_community_utils.hpp
@@ -16,7 +16,7 @@
 #pragma once
 
 namespace native {
-namespace details {
+namespace detail {
 
 /**
  *
@@ -32,14 +32,14 @@ struct rank_getter {
     template <class device_t>
     void operator()(const native::plain_device_container<device_t>& container);
 
-    size_t get_assigned_rank() const;
-    size_t get_assigned_size() const;
+    int get_assigned_rank() const;
+    int get_assigned_size() const;
 
 private:
     ccl::device_index_type device_id;
     std::multiset<ccl::device_index_type>& registered_device_id;
-    size_t rank = 0;
-    size_t size = 0;
+    int rank = 0;
+    int size = 0;
     bool find = false;
     size_t enumerator = 0;
 };
@@ -109,7 +109,7 @@ void rank_getter<group_id, class_id>::operator()(
 }
 
 template <ccl::group_split_type group_id, ccl::device_topology_type class_id>
-size_t rank_getter<group_id, class_id>::get_assigned_rank() const {
+int rank_getter<group_id, class_id>::get_assigned_rank() const {
     if (!find) {
         throw std::runtime_error(
             std::string(__FUNCTION__) +
@@ -117,5 +117,5 @@ size_t rank_getter<group_id, class_id>::get_assigned_rank() const {
     }
     return rank;
 }
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/device_containers.hpp b/src/common/comm/l0/device_containers.hpp
index 6df9021bb..8bc4f22d1 100644
--- a/src/common/comm/l0/device_containers.hpp
+++ b/src/common/comm/l0/device_containers.hpp
@@ -50,7 +50,7 @@ struct indexed_device_container<ccl_thread_comm<ccl_virtual_gpu_comm>> : std::ma
 template <class... device_t>
 using indexed_device_storage = std::tuple<indexed_device_container<device_t>...>;
 
-namespace details {
+namespace detail {
 //TODO - use traits
 template <class device_t, class... total_devices_t>
 inline size_t get_size(const native::device_storage_t<total_devices_t...>& gpu_device_storage) {
@@ -79,5 +79,5 @@ inline size_t get_aggregated_size(const Container& gpu_device_storage) {
            get_aggregated_size<Container, Types...>(gpu_device_storage);
 }
 
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/device_containers_utils.hpp b/src/common/comm/l0/device_containers_utils.hpp
index 55b46c142..440c48b11 100644
--- a/src/common/comm/l0/device_containers_utils.hpp
+++ b/src/common/comm/l0/device_containers_utils.hpp
@@ -16,13 +16,13 @@
 #pragma once
 #include <sstream>
 
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
 #include "common/comm/l0/device_containers.hpp"
 
 namespace native {
 
-namespace details {
+namespace detail {
 /*
 struct splice_devices
 {
@@ -89,6 +89,6 @@ struct printer {
     }
     std::map<size_t, std::string> device_rank_descr;
 };
-} // namespace details
+} // namespace detail
 
 } // namespace native
diff --git a/src/common/comm/l0/device_group_routing_schema.hpp b/src/common/comm/l0/device_group_routing_schema.hpp
index bc4784b55..ddf036c5b 100644
--- a/src/common/comm/l0/device_group_routing_schema.hpp
+++ b/src/common/comm/l0/device_group_routing_schema.hpp
@@ -17,7 +17,7 @@
 #include <cassert>
 #include <memory>
 #include <sstream>
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/utils/enums.hpp"
 #include "common/utils/tuple.hpp"
 #include "supported_topologies.hpp"
@@ -50,11 +50,11 @@ using topology_addr_ptr = std::unique_ptr<topology_addr<schema_id, class_id>>;
 template <ccl::group_split_type group_id, ccl::device_topology_type... class_ids>
 using topology_addr_pointers_tuple_t = std::tuple<topology_addr_ptr<group_id, class_ids>...>;
 
-namespace details {
+namespace detail {
 struct topology_printer {
     template <ccl::group_split_type type, ccl::device_topology_type... class_ids>
     void operator()(const topology_addr_pointers_tuple_t<type, class_ids...>& topology) {
-        details::topology_printer p;
+        detail::topology_printer p;
         ccl_tuple_for_each(topology, p);
         result << ::to_string(type) << "\n\t{ ";
         result << p.result.str() << " }";
@@ -74,7 +74,7 @@ struct topology_printer {
 
     std::stringstream result;
 };
-} // namespace details
+} // namespace detail
 
 struct aggregated_topology_addr {
     template <ccl::group_split_type schema_id,
@@ -105,7 +105,7 @@ struct aggregated_topology_addr {
 
     template <ccl::group_split_type schema_id, ccl::device_topology_type class_id>
     std::string to_string() const {
-        details::topology_printer p;
+        detail::topology_printer p;
         p(std::get<utils::enum_to_underlying(schema_id)>(web));
         return p.result.str();
     }
@@ -117,7 +117,7 @@ struct aggregated_topology_addr {
     }
 
     std::string to_string() const {
-        details::topology_printer p;
+        detail::topology_printer p;
         ccl_tuple_for_each(web, p);
         return p.result.str();
     }
diff --git a/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp
index 7cc36da55..38a3a3ef1 100644
--- a/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_concurrent_gpu_comm.hpp
@@ -34,9 +34,7 @@ class ccl_thread_comm : public ccl_gpu_base_comm<ccl_thread_comm<device_t>,
     using typename base::comm_rank_t;
     using impl_t = device_t;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_module_t =
         typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
 
diff --git a/src/common/comm/l0/devices/ccl_gpu_base_comm.hpp b/src/common/comm/l0/devices/ccl_gpu_base_comm.hpp
index 6bda85a5f..59071357b 100644
--- a/src/common/comm/l0/devices/ccl_gpu_base_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_gpu_base_comm.hpp
@@ -43,7 +43,7 @@ namespace native {
 template <class gpu_impl, gpu_types type>
 class ccl_gpu_base_comm {
 public:
-    using comm_rank_t = size_t;
+    using comm_rank_t = int;
     using type_idx_t = typename std::underlying_type<gpu_types>::type;
     ccl_gpu_base_comm(ccl_device& assigned_device, comm_rank_t idx)
             : index_in_group(idx),
@@ -124,8 +124,8 @@ class ccl_gpu_base_comm {
     aggregated_topology_addr device_routing_web;
     ccl_device& device;
 
-    mutable size_t rank; //TODO
-    mutable size_t size; //TODO
+    mutable int rank; //TODO
+    mutable int size; //TODO
 };
 
 } // namespace native
diff --git a/src/common/comm/l0/devices/ccl_gpu_comm.cpp b/src/common/comm/l0/devices/ccl_gpu_comm.cpp
index 1fa88ef10..3b59d2861 100644
--- a/src/common/comm/l0/devices/ccl_gpu_comm.cpp
+++ b/src/common/comm/l0/devices/ccl_gpu_comm.cpp
@@ -23,7 +23,8 @@
 
 namespace native {
 
-ccl_gpu_comm::ccl_gpu_comm(ccl_device& assigned_device, size_t idx) : base(assigned_device, idx) {
+ccl_gpu_comm::ccl_gpu_comm(ccl_device& assigned_device, comm_rank_t idx)
+        : base(assigned_device, idx) {
     auto queue_prop = ccl_device::get_default_queue_desc();
     queue_prop.ordinal = 0;
     std::shared_ptr<ccl_context> ctx;
@@ -35,7 +36,7 @@ ccl_gpu_comm::ccl_gpu_comm(ccl_device& assigned_device, size_t idx) : base(assig
 
 std::string ccl_gpu_comm::to_string_impl() const {
     std::string ret(name());
-    ret = ret + ", comm: " + comm_to_str() +
+    ret = ret + ", comm:\n" + comm_to_str() +
           ", virtual count: " + std::to_string(get_virtual_gpu_count());
     return ret;
 }
diff --git a/src/common/comm/l0/devices/ccl_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_gpu_comm.hpp
index a615246b2..7c698f915 100644
--- a/src/common/comm/l0/devices/ccl_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_gpu_comm.hpp
@@ -26,9 +26,7 @@ class ccl_gpu_comm : public ccl_gpu_base_comm<ccl_gpu_comm, gpu_types::REAL_GPU>
     using base::comm_rank_t;
     using impl_t = ccl_gpu_comm;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_module_t = device_coll_module<algo_type, group, mode>;
 
     template <ccl_coll_type algo_type,
@@ -74,9 +72,6 @@ class ccl_gpu_comm : public ccl_gpu_base_comm<ccl_gpu_comm, gpu_types::REAL_GPU>
               class native_data_type>
     gpu_kernel_t<module_type, group_id, class_id, native_data_type>& get_gpu_kernel() {
         auto& ptr = get_gpu_module<module_type, group_id, class_id>();
-        if (not std::is_same<native_data_type, float>::value) {
-            throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + "Only float is supported");
-        }
         return ptr.template get_main_function<native_data_type>();
     }
 
diff --git a/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp b/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp
index 79d2af4d9..55357dc61 100644
--- a/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp
+++ b/src/common/comm/l0/devices/ccl_gpu_scaleup_proxy.hpp
@@ -40,9 +40,7 @@ class ccl_gpu_scaleup_proxy
 
     using proxy_base = proxy_observer_specific<ccl_gpu_scaleup_proxy<device_t>>;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_module_t =
         typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
 
@@ -89,12 +87,11 @@ class ccl_gpu_scaleup_proxy
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <
-        class native_data_type,
-        ccl::group_split_type group_id,
-        ccl::device_topology_type class_id,
-        class gpu_entry,
-        class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
+    template <class native_data_type,
+              ccl::group_split_type group_id,
+              ccl::device_topology_type class_id,
+              class gpu_entry,
+              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
     gpu_kernel_t<gpu_entry::type(), group_id, class_id, native_data_type>& register_entry(
         gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
@@ -180,12 +177,11 @@ class ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>
             .template get_gpu_kernel<module_type, group_id, class_id, native_data_type>();
     }
 
-    template <
-        class native_data_type,
-        ccl::group_split_type group_id,
-        ccl::device_topology_type class_id,
-        class gpu_entry,
-        class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
+    template <class native_data_type,
+              ccl::group_split_type group_id,
+              ccl::device_topology_type class_id,
+              class gpu_entry,
+              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
     gpu_kernel_t<gpu_entry::type(), group_id, class_id, native_data_type>& register_entry(
         gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
diff --git a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.cpp b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.cpp
index 8b74bdb39..62f41ff33 100644
--- a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.cpp
+++ b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.cpp
@@ -24,8 +24,8 @@
 namespace native {
 
 ccl_ipc_gpu_comm::ccl_ipc_gpu_comm(ccl_device& assigned_device,
-                                   size_t idx,
-                                   size_t size,
+                                   int idx,
+                                   int size,
                                    ccl::group_split_type topology_type,
                                    ccl::device_topology_type class_id)
         : base(assigned_device, idx) {
@@ -40,13 +40,13 @@ ccl_ipc_gpu_comm::ccl_ipc_gpu_comm(ccl_device& assigned_device,
         case ccl::group_split_type::cluster: {
             switch (class_id) {
                 case ccl::device_topology_type::ring: {
-                    reset_rank<ccl::group_split_type::cluster,
-                               ccl::device_topology_type::ring>(idx, size);
+                    reset_rank<ccl::group_split_type::cluster, ccl::device_topology_type::ring>(
+                        idx, size);
                     break;
                 }
                 case ccl::device_topology_type::a2a: {
-                    reset_rank<ccl::group_split_type::cluster,
-                               ccl::device_topology_type::a2a>(idx, size);
+                    reset_rank<ccl::group_split_type::cluster, ccl::device_topology_type::a2a>(
+                        idx, size);
                     break;
                 }
                 default: {
@@ -67,7 +67,7 @@ ccl_ipc_gpu_comm::ccl_ipc_gpu_comm(ccl_device& assigned_device,
 
 std::string ccl_ipc_gpu_comm::to_string_impl() const {
     std::string ret(name_impl());
-    ret = ret + ", comm: " + comm_to_str();
+    ret = ret + ", comm:\n" + comm_to_str();
     return ret;
 }
 } // namespace native
diff --git a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp
index 3d42f042d..5dfbdb9b2 100644
--- a/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_ipc_gpu_comm.hpp
@@ -31,9 +31,7 @@ class ccl_ipc_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::I
     using base = ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::IPC_DESTINATION_GPU>;
     using base::comm_rank_t;
     using impl_t = ccl_ipc_gpu_comm;
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_module_t = ipc_dst_device_coll_module<algo_type, group, mode>;
 
     template <ccl_coll_type algo_type,
@@ -51,7 +49,7 @@ class ccl_ipc_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::I
 
     ccl_ipc_gpu_comm(ccl_device& assigned_device,
                      comm_rank_t idx,
-                     size_t size,
+                     int size,
                      ccl::group_split_type group_id,
                      ccl::device_topology_type class_id);
     ~ccl_ipc_gpu_comm() = default;
@@ -67,9 +65,6 @@ class ccl_ipc_gpu_comm : public ccl_gpu_base_comm<ccl_ipc_gpu_comm, gpu_types::I
             base::template get_gpu_module_unsafe<module_type, group_id, class_id, gpu_module_t>(
                 registered_modules);
         assert(ptr);
-        if (not std::is_same<native_data_type, float>::value) {
-            throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + "Only float is supported");
-        }
         return ptr->template get_main_function<native_data_type>();
     }
 
diff --git a/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp
index f5d2cf652..256c9c62c 100644
--- a/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_ipc_source_gpu_comm.hpp
@@ -34,9 +34,7 @@ class ccl_ipc_source_gpu_comm
                                    gpu_types::IPC_GPU + device_t::type_idx()>;
     using typename base::comm_rank_t;
     using impl_t = device_t;
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_module_t =
         typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
 
@@ -95,8 +93,7 @@ class ccl_ipc_source_gpu_comm
                     std::string("ccl_ipc_source_gpu_comm must be created") +
                     "for process-based topology, but requested: " +
                     std::to_string(
-                        static_cast<
-                            typename std::underlying_type<ccl::group_split_type>::type>(
+                        static_cast<typename std::underlying_type<ccl::group_split_type>::type>(
                             group_id)));
             }
         }
@@ -130,12 +127,11 @@ class ccl_ipc_source_gpu_comm
             .template get_gpu_kernel<module_type, group_id, class_id, native_data_type>();
     }
 
-    template <
-        class native_data_type,
-        ccl::group_split_type group_id,
-        ccl::device_topology_type class_id,
-        class gpu_entry,
-        class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
+    template <class native_data_type,
+              ccl::group_split_type group_id,
+              ccl::device_topology_type class_id,
+              class gpu_entry,
+              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
     gpu_kernel_t<gpu_entry::type(), group_id, class_id, native_data_type>& register_entry(
         gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr =
diff --git a/src/common/comm/l0/devices/ccl_numa_proxy.hpp b/src/common/comm/l0/devices/ccl_numa_proxy.hpp
index 84a9892a9..1ebbdf87e 100644
--- a/src/common/comm/l0/devices/ccl_numa_proxy.hpp
+++ b/src/common/comm/l0/devices/ccl_numa_proxy.hpp
@@ -38,9 +38,7 @@ class ccl_numa_proxy
     using impl_t = device_t;
     using proxy_base = proxy_observer_specific<ccl_numa_proxy<device_t>>;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_module_t =
         typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
 
@@ -82,12 +80,11 @@ class ccl_numa_proxy
             .template get_gpu_kernel<module_type, group_id, class_id, native_data_type>();
     }
 
-    template <
-        class native_data_type,
-        ccl::group_split_type group_id,
-        ccl::device_topology_type class_id,
-        class gpu_entry,
-        class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
+    template <class native_data_type,
+              ccl::group_split_type group_id,
+              ccl::device_topology_type class_id,
+              class gpu_entry,
+              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
     gpu_kernel_t<gpu_entry::type(), group_id, class_id, native_data_type>& register_entry(
         gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr =
diff --git a/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp b/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp
index 127e28336..5d04ad56a 100644
--- a/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp
+++ b/src/common/comm/l0/devices/ccl_scaleout_proxy.hpp
@@ -89,12 +89,11 @@ class ccl_scaleout_proxy
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <
-        class native_data_type,
-        ccl::group_split_type group_id,
-        ccl::device_topology_type class_id,
-        class gpu_entry,
-        class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
+    template <class native_data_type,
+              ccl::group_split_type group_id,
+              ccl::device_topology_type class_id,
+              class gpu_entry,
+              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
     gpu_kernel_t<gpu_entry::type(), group_id, class_id, native_data_type>& register_entry(
         gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
@@ -182,12 +181,11 @@ class ccl_scaleout_proxy<ccl_numa_proxy<device_t>>
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <
-        class native_data_type,
-        ccl::group_split_type group_id,
-        ccl::device_topology_type class_id,
-        class gpu_entry,
-        class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
+    template <class native_data_type,
+              ccl::group_split_type group_id,
+              ccl::device_topology_type class_id,
+              class gpu_entry,
+              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
     gpu_kernel_t<gpu_entry::type(), group_id, class_id, native_data_type>& register_entry(
         gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
@@ -273,12 +271,11 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<device_t>>
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <
-        class native_data_type,
-        ccl::group_split_type group_id,
-        ccl::device_topology_type class_id,
-        class gpu_entry,
-        class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
+    template <class native_data_type,
+              ccl::group_split_type group_id,
+              ccl::device_topology_type class_id,
+              class gpu_entry,
+              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
     gpu_kernel_t<gpu_entry::type(), group_id, class_id, native_data_type>& register_entry(
         gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
@@ -316,9 +313,7 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>
     using proxy_base = proxy_observer_specific<
         ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>>;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_module_t =
         typename device_t::template gpu_module_t<algo_type, group, mode>; //same as in-process GPU
 
@@ -366,12 +361,11 @@ class ccl_scaleout_proxy<ccl_gpu_scaleup_proxy<ccl_numa_proxy<device_t>>>
         return wrapped_gpu_comm.template get_comm_data<group_id, class_id>();
     }
 
-    template <
-        class native_data_type,
-        ccl::group_split_type group_id,
-        ccl::device_topology_type class_id,
-        class gpu_entry,
-        class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
+    template <class native_data_type,
+              ccl::group_split_type group_id,
+              ccl::device_topology_type class_id,
+              class gpu_entry,
+              class = typename std::enable_if<group_id == ccl::group_split_type::cluster>::type>
     gpu_kernel_t<gpu_entry::type(), group_id, class_id, native_data_type>& register_entry(
         gpu_entry& entry) {
         const topology_addr<group_id, class_id>& comm_addr = get_comm_data<group_id, class_id>();
diff --git a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.cpp b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.cpp
index f3149fd85..25bfcc1f6 100644
--- a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.cpp
+++ b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.cpp
@@ -31,7 +31,7 @@ ccl_virtual_gpu_comm::ccl_virtual_gpu_comm(ccl_device& device,
 
 std::string ccl_virtual_gpu_comm::to_string_impl() const {
     std::string ret(name_impl());
-    ret = ret + ", comm: " + comm_to_str();
+    ret = ret + ", comm:\n" + comm_to_str();
     return ret;
 }
 
diff --git a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp
index 5890cdc3e..e5fb54dce 100644
--- a/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp
+++ b/src/common/comm/l0/devices/ccl_virtual_gpu_comm.hpp
@@ -27,9 +27,7 @@ class ccl_virtual_gpu_comm : public ccl_gpu_base_comm<ccl_virtual_gpu_comm, gpu_
 
     using impl_t = ccl_virtual_gpu_comm;
 
-    template <ccl_coll_type algo_type,
-              ccl::group_split_type group,
-              ccl::device_topology_type mode>
+    template <ccl_coll_type algo_type, ccl::group_split_type group, ccl::device_topology_type mode>
     using gpu_module_t = virtual_device_coll_module<algo_type, group, mode>;
 
     template <ccl_coll_type algo_type,
@@ -64,9 +62,6 @@ class ccl_virtual_gpu_comm : public ccl_gpu_base_comm<ccl_virtual_gpu_comm, gpu_
             base::template get_gpu_module_unsafe<module_type, group_id, class_id, gpu_module_t>(
                 registered_modules);
         assert(ptr);
-        if (not std::is_same<native_data_type, float>::value) {
-            throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + "Only float is supported");
-        }
         return ptr->template get_main_function<native_data_type>();
     }
 
diff --git a/src/common/comm/l0/devices/communication_structs/communication_data_holder.hpp b/src/common/comm/l0/devices/communication_structs/communication_data_holder.hpp
new file mode 100644
index 000000000..e3af787ef
--- /dev/null
+++ b/src/common/comm/l0/devices/communication_structs/communication_data_holder.hpp
@@ -0,0 +1,38 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include <tuple>
+
+#include "oneapi/ccl/types.hpp"
+#include "coll/algorithm/algorithms_enum.hpp"
+
+namespace native {
+
+// Groups the helper templates used to keep one piece of data per collective type.
+// The parameter pack `T` is not referenced by anything declared inside the struct.
+template <class... T>
+struct communiaction_data_holder {
+    // Holds one `U`; `type` is unused inside and only makes each instantiation a distinct type.
+    template <class U, ccl_coll_type type>
+    struct data_for_algo_t {
+        U data;
+    };
+
+    // Tuple with one `data_for_algo_t<Data, type>` element for every collective in `types`.
+    template <class Data, ccl_coll_type... types>
+    using data_storage_t = std::tuple<data_for_algo_t<Data, types>...>;
+};
+} // namespace native
diff --git a/src/common/comm/l0/devices/communication_structs/communication_stream.cpp b/src/common/comm/l0/devices/communication_structs/communication_stream.cpp
new file mode 100644
index 000000000..8748ac8fd
--- /dev/null
+++ b/src/common/comm/l0/devices/communication_structs/communication_stream.cpp
@@ -0,0 +1,24 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "common/comm/l0/devices/communication_structs/communication_stream.hpp"
+
+namespace native {
+// Creates the command queue on `device` for `ctx` and starts the reference counter at zero.
+communication_stream::communication_stream(ccl_device& device, std::shared_ptr<ccl_context> ctx)
+        : device_queue(std::make_shared<ccl_device::device_queue>(device.create_cmd_queue(ctx))),
+          referenced_communication_device_count(0) {}
+
+} // namespace native
diff --git a/src/common/comm/l0/devices/communication_structs/communication_stream.hpp b/src/common/comm/l0/devices/communication_structs/communication_stream.hpp
new file mode 100644
index 000000000..b71c434ed
--- /dev/null
+++ b/src/common/comm/l0/devices/communication_structs/communication_stream.hpp
@@ -0,0 +1,36 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include <atomic>
+#include <cstddef>
+#include <memory>
+
+#include "oneapi/ccl/native_device_api/export_api.hpp"
+
+namespace native {
+// Bundles a shared device command queue with an atomic counter.
+struct communication_stream {
+    // Creates the command queue on `device` for context `ctx` (see communication_stream.cpp).
+    communication_stream(ccl_device& device, std::shared_ptr<ccl_context> ctx);
+
+    // Command queue created by the constructor.
+    std::shared_ptr<ccl_device::device_queue> device_queue;
+    // Set to zero by the constructor. NOTE(review): presumably the number of communication
+    // devices currently attached to this stream - confirm against the call sites.
+    std::atomic<size_t> referenced_communication_device_count;
+};
+
+} // namespace native
diff --git a/src/common/comm/l0/gpu_comm_attr.cpp b/src/common/comm/l0/gpu_comm_attr.cpp
index 732d6c0b2..8db19cd26 100644
--- a/src/common/comm/l0/gpu_comm_attr.cpp
+++ b/src/common/comm/l0/gpu_comm_attr.cpp
@@ -92,7 +92,7 @@ bool gpu_comm_attr::sync_register_communicator(std::shared_ptr<communicator_inte
 
 bool gpu_comm_attr::delegate_sync_register_communicator(
     std::shared_ptr<communicator_interface> comm) {
-    ccl::device_indices_t device_group_indices;
+    ccl::device_indices_type device_group_indices;
 
     std::unique_lock<std::mutex> lock(barrier.thread_group_mutex);
 
diff --git a/src/common/comm/l0/gpu_comm_attr.hpp b/src/common/comm/l0/gpu_comm_attr.hpp
index 9afcdffbc..0a8477ed8 100644
--- a/src/common/comm/l0/gpu_comm_attr.hpp
+++ b/src/common/comm/l0/gpu_comm_attr.hpp
@@ -19,7 +19,7 @@
 #include <mutex>
 #include <condition_variable>
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/comm/l0/device_group_routing_schema.hpp"
 #include "common/comm/l0/context/context_barrier.hpp"
 
@@ -31,14 +31,16 @@ struct thread_group_context;
 } // namespace native
 
 namespace ccl {
-class host_communicator;
+namespace v1 {
 class communicator;
+}
+class host_communicator;
 struct communicator_interface;
 struct context_comm_addr {
     size_t thread_idx = 0;
     size_t thread_count = 0;
-    size_t comm_rank = 0;
-    size_t comm_size = 0;
+    int comm_rank = 0;
+    int comm_size = 0;
 
     std::string to_string() const;
 };
diff --git a/src/common/comm/l0/modules/base_entry_module.cpp b/src/common/comm/l0/modules/base_entry_module.cpp
index 736edd81f..2723dc635 100644
--- a/src/common/comm/l0/modules/base_entry_module.cpp
+++ b/src/common/comm/l0/modules/base_entry_module.cpp
@@ -38,6 +38,7 @@ void gpu_module_base::release() {
 
     if (module) {
         zeModuleDestroy(module);
+        module = nullptr;
     }
     functions.clear();
 }
diff --git a/src/common/comm/l0/modules/base_entry_module.hpp b/src/common/comm/l0/modules/base_entry_module.hpp
index ebd5377dd..2bb35ecc4 100644
--- a/src/common/comm/l0/modules/base_entry_module.hpp
+++ b/src/common/comm/l0/modules/base_entry_module.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 #include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "supported_topologies.hpp"
 #include "common/comm/l0/modules/kernel_functions.hpp"
 
@@ -90,9 +90,7 @@ struct virtual_device_coll_module {
     std::shared_ptr<device_coll_module<type, topology, mode>> real_module_ref;
 };
 
-template <ccl_coll_type type,
-          ccl::group_split_type group_id,
-          ccl::device_topology_type class_id>
+template <ccl_coll_type type, ccl::group_split_type group_id, ccl::device_topology_type class_id>
 struct coll_module_traits {
     static constexpr ccl_coll_type coll_type() {
         return type;
diff --git a/src/common/comm/l0/modules/gpu_typed_module.hpp b/src/common/comm/l0/modules/gpu_typed_module.hpp
index 62824f92b..b1af29ecf 100644
--- a/src/common/comm/l0/modules/gpu_typed_module.hpp
+++ b/src/common/comm/l0/modules/gpu_typed_module.hpp
@@ -46,13 +46,13 @@ struct real_gpu_typed_module : private gpu_module_base {
                   ", modules handle: ",
                   (void*)module);
         ccl_tuple_for_each(kernel_main_functions,
-                           detail::kernel_entry_initializer(
+                           detail::kernel_entry_initializer<type>(
                                [this](const std::string& name) -> gpu_module_base::kernel_handle {
                                    return this->import_kernel(name);
                                }));
 
         ccl_tuple_for_each(kernel_numa_functions,
-                           detail::kernel_entry_initializer(
+                           detail::kernel_entry_initializer<type>(
                                [this](const std::string& name) -> gpu_module_base::kernel_handle {
                                    return this->import_kernel(name);
                                }));
@@ -110,7 +110,7 @@ struct ipc_gpu_typed_module : private gpu_module_base {
     ipc_gpu_typed_module(handle module_handle) : gpu_module_base(nullptr) {
         LOG_DEBUG("Remote gpu module created: ", ccl_coll_type_to_str(type));
         ccl_tuple_for_each(kernel_main_functions,
-                           detail::kernel_entry_initializer(
+                           detail::kernel_entry_initializer<type>(
                                [](const std::string& name) -> gpu_module_base::kernel_handle {
                                    return nullptr;
                                }));
@@ -164,12 +164,12 @@ struct virtual_gpu_typed_module : private gpu_module_base {
               real_module_ref(real_module) {
         LOG_DEBUG("Virtual gpu module created:", ccl_coll_type_to_str(type));
         ccl_tuple_for_each(kernel_main_functions,
-                           detail::kernel_entry_initializer(
+                           detail::kernel_entry_initializer<type>(
                                [this](const std::string& name) -> gpu_module_base::kernel_handle {
                                    return this->import_kernel(name);
                                }));
         ccl_tuple_for_each(kernel_numa_functions,
-                           detail::kernel_entry_initializer(
+                           detail::kernel_entry_initializer<type>(
                                [this](const std::string& name) -> gpu_module_base::kernel_handle {
                                    return this->import_kernel(name);
                                }));
diff --git a/src/common/comm/l0/modules/kernel_argument_policies.hpp b/src/common/comm/l0/modules/kernel_argument_policies.hpp
index 3212bb4fb..2548395d4 100644
--- a/src/common/comm/l0/modules/kernel_argument_policies.hpp
+++ b/src/common/comm/l0/modules/kernel_argument_policies.hpp
@@ -88,6 +88,13 @@ struct arg_access_policy_atomic {
     std::atomic<bool> charged{ false };
 };
 
+template <size_t pos, class ArgType, bool must_exist = true> // distinct type, inherited behavior
+struct arg_access_policy_atomic_uncached : arg_access_policy_atomic<pos, ArgType, must_exist> {
+    using base_t = arg_access_policy_atomic<pos, ArgType, must_exist>; // adds no members itself
+    using arg_type = typename base_t::arg_type; // same as the base policy
+    using return_t = typename base_t::return_t; // same as the base policy
+};
+
 template <size_t pos, class ArgType, bool must_exist = true>
 struct arg_access_policy_atomic_move {
     using arg_type = ArgType;
diff --git a/src/common/comm/l0/modules/kernel_argument_types.hpp b/src/common/comm/l0/modules/kernel_argument_types.hpp
index a479b84db..bb05542d7 100644
--- a/src/common/comm/l0/modules/kernel_argument_types.hpp
+++ b/src/common/comm/l0/modules/kernel_argument_types.hpp
@@ -30,6 +30,11 @@ struct kernel_arg : public policy_impl {
 template <size_t pos, class type>
 using thread_safe_arg = kernel_arg<pos, arg_access_policy_atomic<pos, type, false>>;
 
+// thread-safe uncached argument: used for concurrent read/write applications
+template <size_t pos, class type>
+using thread_safe_uncached_arg =
+    kernel_arg<pos, arg_access_policy_atomic_uncached<pos, type, false>>;
+
 // thread-safe destructive-copying argument (rechargable): used for concurrent read/write applications, where reader take-away exising value
 template <size_t pos, class type>
 using thread_exchangable_arg = kernel_arg<pos, arg_access_policy_atomic_move<pos, type, false>>;
@@ -43,7 +48,7 @@ template <size_t pos>
 using stub_arg = kernel_arg<pos, arg_no_access_policy<pos>>;
 
 // utilities
-namespace details {
+namespace detail {
 struct args_printer {
     args_printer(std::stringstream& ss) : out(ss) {}
 
@@ -88,5 +93,5 @@ struct args_printer {
         }
     }
 };
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/modules/kernel_functions.hpp b/src/common/comm/l0/modules/kernel_functions.hpp
index 93ed8c987..1c91deae5 100644
--- a/src/common/comm/l0/modules/kernel_functions.hpp
+++ b/src/common/comm/l0/modules/kernel_functions.hpp
@@ -60,7 +60,7 @@ struct kernel_data_storage {
     std::string to_string() const {
         std::stringstream ss;
         ss << "handle: " << handle << "\n{\n";
-        details::args_printer func(ss);
+        detail::args_printer func(ss);
         ccl_tuple_for_each(args, func);
         ss << "}" << std::endl;
         return ss.str();
@@ -72,17 +72,17 @@ enum main_kernel_args { rank_index = 0, size_index = 1, args_start_index };
 
 //main kernel - used for GPU program execution
 template <class Impl, class... arguments>
-struct execution_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, size_t>,
-                                                     arg<main_kernel_args::size_index, size_t>,
+struct execution_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>,
+                                                     arg<main_kernel_args::size_index, int>,
                                                      arguments...> {
-    using base = kernel_data_storage<arg<main_kernel_args::rank_index, size_t>,
-                                     arg<main_kernel_args::size_index, size_t>,
+    using base = kernel_data_storage<arg<main_kernel_args::rank_index, int>,
+                                     arg<main_kernel_args::size_index, int>,
                                      arguments...>;
     using base::args;
     using base::handle;
 
-    using rank_type = size_t;
-    using size_type = size_t;
+    using rank_type = int;
+    using size_type = int;
 
     static constexpr const char* name() {
         return Impl::specific_name();
@@ -173,21 +173,19 @@ struct execution_kernel : public kernel_data_storage<arg<main_kernel_args::rank_
 
     template <class... kernel_argument>
     void set_args(typename kernel_argument::arg_type... new_val) {
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-value"
-        std::array<bool, sizeof...(kernel_argument)>{ (
+        std::array<bool, sizeof...(kernel_argument)> expander{ (
             this->template set_arg<kernel_argument>(new_val), true)... };
-#pragma clang diagnostic pop
+        (void)expander;
     }
 };
 
 // ipc_kernel - used for GPU data synchronization only
 template <class Impl, class... arguments>
-struct ipc_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, size_t>,
-                                               arg<main_kernel_args::size_index, size_t>,
+struct ipc_kernel : public kernel_data_storage<arg<main_kernel_args::rank_index, int>,
+                                               arg<main_kernel_args::size_index, int>,
                                                arguments...> {
-    using base = kernel_data_storage<arg<main_kernel_args::rank_index, size_t>,
-                                     arg<main_kernel_args::size_index, size_t>,
+    using base = kernel_data_storage<arg<main_kernel_args::rank_index, int>,
+                                     arg<main_kernel_args::size_index, int>,
                                      arguments...>;
     using base::args;
     using base::handle;
diff --git a/src/common/comm/l0/modules/modules_utils.hpp b/src/common/comm/l0/modules/modules_utils.hpp
index 524dba8c1..13c39cedd 100644
--- a/src/common/comm/l0/modules/modules_utils.hpp
+++ b/src/common/comm/l0/modules/modules_utils.hpp
@@ -14,12 +14,14 @@
  limitations under the License.
 */
 #pragma once
+
 #include "common/comm/l0/modules/base_entry_module.hpp"
 #include "common/utils/tuple.hpp"
 
 namespace native {
 namespace detail {
 
+template <ccl_coll_type type>
 struct kernel_entry_initializer {
     using loader_t =
         std::function<gpu_module_base::kernel_handle(const std::string& function_name)>;
@@ -36,5 +38,42 @@ struct kernel_entry_initializer {
 private:
     loader_t functor;
 };
+
+template <> // kernel handle initializer specialized for allreduce modules
+struct kernel_entry_initializer<ccl_coll_allreduce> {
+    using loader_t =
+        std::function<gpu_module_base::kernel_handle(const std::string& function_name)>;
+    // Stores the loader callback that maps a kernel function name to its handle.
+    kernel_entry_initializer(loader_t&& f) : functor(std::move(f)) {}
+    // Requests "<kernel name>_<native type name>_add" from the loader and stores the handle.
+    template <class typed_kernel>
+    void operator()(typed_kernel& kernel) {
+        kernel.handle =
+            functor(std::string(typed_kernel::name()) + "_" +
+                    ccl::native_type_info<typename typed_kernel::processing_type>::name() + "_add");
+    }
+    // NOTE(review): the "_add" suffix is hard-coded, so only "*_add" kernel names are requested.
+private:
+    loader_t functor;
+};
+
+template <> // kernel handle initializer specialized for reduce modules
+struct kernel_entry_initializer<ccl_coll_reduce> {
+    using loader_t =
+        std::function<gpu_module_base::kernel_handle(const std::string& function_name)>;
+    // Stores the loader callback that maps a kernel function name to its handle.
+    kernel_entry_initializer(loader_t&& f) : functor(std::move(f)) {}
+    // Requests "<kernel name>_<native type name>_add" from the loader and stores the handle.
+    template <class typed_kernel>
+    void operator()(typed_kernel& kernel) {
+        kernel.handle =
+            functor(std::string(typed_kernel::name()) + "_" +
+                    ccl::native_type_info<typename typed_kernel::processing_type>::name() + "_add");
+    }
+    // NOTE(review): the "_add" suffix is hard-coded, so only "*_add" kernel names are requested.
+private:
+    loader_t functor;
+};
+
 } // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp b/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
index bd15e451a..b454abe92 100644
--- a/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
+++ b/src/common/comm/l0/modules/ring/allreduce_export_functions.hpp
@@ -24,9 +24,9 @@ struct ring_allreduce_kernel
               arg<main_kernel_args::args_start_index, size_t>,
               arg<main_kernel_args::args_start_index + 1, native_type*>,
               arg<main_kernel_args::args_start_index + 2, native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 3, native_type*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 4, int*>,
-              thread_safe_arg<main_kernel_args::args_start_index + 5, int*>,
+              thread_safe_uncached_arg<main_kernel_args::args_start_index + 3, native_type*>,
+              thread_safe_uncached_arg<main_kernel_args::args_start_index + 4, int*>,
+              thread_safe_uncached_arg<main_kernel_args::args_start_index + 5, int*>,
               arg<main_kernel_args::args_start_index + 6, int*>,
               thread_safe_arg<main_kernel_args::args_start_index + 7, native_type*>,
               thread_safe_arg<main_kernel_args::args_start_index + 8, int*>,
@@ -50,13 +50,15 @@ struct ring_allreduce_kernel
     using recv_buf_arg_type = typename recv_buf_arg::arg_type;
 
     using tmp_recv_buf_arg =
-        thread_safe_arg<main_kernel_args::args_start_index + 3, processing_type*>;
+        thread_safe_uncached_arg<main_kernel_args::args_start_index + 3, processing_type*>;
     using tmp_recv_buf_arg_type = typename tmp_recv_buf_arg::arg_type;
 
-    using income_data_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 4, int*>;
+    using income_data_flag_arg =
+        thread_safe_uncached_arg<main_kernel_args::args_start_index + 4, int*>;
     using income_data_flag_arg_type = typename income_data_flag_arg::arg_type;
 
-    using ready_to_recv_flag_arg = thread_safe_arg<main_kernel_args::args_start_index + 5, int*>;
+    using ready_to_recv_flag_arg =
+        thread_safe_uncached_arg<main_kernel_args::args_start_index + 5, int*>;
     using ready_to_recv_flag_arg_type = typename ready_to_recv_flag_arg::arg_type;
 
     using local_barrier_flag_arg = arg<main_kernel_args::args_start_index + 6, int*>;
diff --git a/src/common/comm/l0/scheduler/allied_process_group_scheduler.hpp b/src/common/comm/l0/scheduler/allied_process_group_scheduler.hpp
index 7a288f044..2a2b119c2 100644
--- a/src/common/comm/l0/scheduler/allied_process_group_scheduler.hpp
+++ b/src/common/comm/l0/scheduler/allied_process_group_scheduler.hpp
@@ -55,6 +55,7 @@ struct allied_process_group_scheduler : public thread_group_scheduler {
                                      size_t thread_id,
                                      device_community<class_id>& device_topology,
                                      device_t& device,
+                                     native::ccl_driver_context_ptr ctx,
                                      Arguments&&... args) {
         const topology_addr<group_id, class_id>& comm_data =
             device->template get_comm_data<group_id, class_id>();
@@ -88,6 +89,7 @@ struct allied_process_group_scheduler : public thread_group_scheduler {
             entry_factory::make_ordered_entry<EntryType, mode>(current_thread_schedule.get(),
                                                                device,
                                                                device_topology.get_device_storage(),
+                                                               ctx,
                                                                std::forward<Arguments>(args)...);
         LOG_DEBUG("do initial entry progress");
         created_entry->start();
@@ -119,6 +121,7 @@ struct allied_process_group_scheduler : public thread_group_scheduler {
                                          size_t thread_id,
                                          device_community<class_id>& device_topology,
                                          device_t& device,
+                                         native::ccl_driver_context_ptr ctx,
                                          Arguments&&... args) {
         const topology_addr<group_id, class_id>& comm_data =
             device->template get_comm_data<group_id, class_id>();
@@ -155,6 +158,7 @@ struct allied_process_group_scheduler : public thread_group_scheduler {
             entry_factory::make_ordered_entry<EntryType, mode>(current_thread_schedule.get(),
                                                                device,
                                                                device_topology.get_device_storage(),
+                                                               ctx,
                                                                std::forward<Arguments>(args)...);
 
         auto ipc_allgather_entry =
@@ -163,6 +167,7 @@ struct allied_process_group_scheduler : public thread_group_scheduler {
                                                                    device,
                                                                    ccl_communicator,
                                                                    node_total_devices,
+                                                                   ctx,
                                                                    created_entry->get_ipc_data());
 
         LOG_DEBUG("do initial entry progress");
@@ -187,7 +192,7 @@ struct allied_process_group_scheduler : public thread_group_scheduler {
         //if sched is not ready - send NULL
         return thread_schedule_ptr();
         auto req = submit_entry<EntryType, mode, group_id, class_id>(
-            process_id, thread_id, device_topology, device, std::forward<Arguments>(args)...);
+            process_id, thread_id, device_topology, device, ctx, std::forward<Arguments>(args)...);
         return req;
     }
 
diff --git a/src/common/comm/l0/scheduler/device_group_scheduler.hpp b/src/common/comm/l0/scheduler/device_group_scheduler.hpp
index e0da9a281..c63852a15 100644
--- a/src/common/comm/l0/scheduler/device_group_scheduler.hpp
+++ b/src/common/comm/l0/scheduler/device_group_scheduler.hpp
@@ -40,6 +40,7 @@ struct device_group_scheduler {
               class... Arguments>
     schedule_ptr submit_entry(device_community<class_id>& device_topology,
                               device_t& device,
+                              native::ccl_driver_context_ptr ctx,
                               Arguments&&... args) {
         //create schedule
         size_t group_size =
@@ -56,6 +57,7 @@ struct device_group_scheduler {
             entry_factory::make_ordered_entry<EntryType, mode>(current_schedule.get(),
                                                                device,
                                                                device_topology.get_device_storage(),
+                                                               ctx,
                                                                std::forward<Arguments>(args)...);
         LOG_DEBUG("do initial progress");
 
diff --git a/src/common/comm/l0/scheduler/thread_group_scheduler.hpp b/src/common/comm/l0/scheduler/thread_group_scheduler.hpp
index 614e85e39..c895522cd 100644
--- a/src/common/comm/l0/scheduler/thread_group_scheduler.hpp
+++ b/src/common/comm/l0/scheduler/thread_group_scheduler.hpp
@@ -55,6 +55,7 @@ struct thread_group_scheduler {
     thread_schedule_ptr submit_entry(size_t thread_id,
                                      device_community<class_id>& device_topology,
                                      device_t& device,
+                                     native::ccl_driver_context_ptr ctx,
                                      Arguments&&... args) {
         const topology_addr<group_id, class_id>& comm_data =
             device->template get_comm_data<group_id, class_id>();
@@ -80,6 +81,7 @@ struct thread_group_scheduler {
             entry_factory::make_ordered_entry<EntryType, mode>(current_thread_schedule.get(),
                                                                device,
                                                                device_topology.get_device_storage(),
+                                                               ctx,
                                                                std::forward<Arguments>(args)...);
         LOG_DEBUG("do initial entry progress");
         created_entry->start();
diff --git a/src/common/comm/l0/topology/cluster_device_utils.hpp b/src/common/comm/l0/topology/cluster_device_utils.hpp
index 29973b696..edb16fab5 100644
--- a/src/common/comm/l0/topology/cluster_device_utils.hpp
+++ b/src/common/comm/l0/topology/cluster_device_utils.hpp
@@ -16,15 +16,15 @@
 #pragma once
 #include "common/comm/l0/topology/topology_construction_utils.hpp"
 namespace native {
-namespace details {
+namespace detail {
 namespace cluster_utils {
 
 inline global_sorted_plain_graphs extract_full_node_plain_graphs(
     std::ostream& out,
-    const ccl::cluster_device_indices_t& cluster_indices,
+    const ccl::cluster_device_indices_type& cluster_indices,
     const std::string& hostname,
-    const details::global_sorted_plain_graphs& cluster_graphs) {
-    details::global_sorted_plain_graphs ret;
+    const detail::global_sorted_plain_graphs& cluster_graphs) {
+    detail::global_sorted_plain_graphs ret;
 
     out << "Find host: " << hostname << " in cluster size: " << cluster_indices.size() << std::endl;
     auto node_it = cluster_indices.find(hostname);
@@ -34,7 +34,7 @@ inline global_sorted_plain_graphs extract_full_node_plain_graphs(
     }
 
     //iterate over all allied processes on the same host
-    const ccl::process_device_indices_t& processes = node_it->second;
+    const ccl::process_device_indices_type& processes = node_it->second;
     out << "Find processes count: " << processes.size() << " on node: " << hostname << std::endl;
     for (const auto& process_val : processes) {
         auto process_id = process_val.first;
@@ -53,5 +53,5 @@ inline global_sorted_plain_graphs extract_full_node_plain_graphs(
     return ret;
 }
 } // namespace cluster_utils
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/topology/ring/cluster_group_device_creator.hpp b/src/common/comm/l0/topology/ring/cluster_group_device_creator.hpp
index f4f6d425c..bc9836e61 100644
--- a/src/common/comm/l0/topology/ring/cluster_group_device_creator.hpp
+++ b/src/common/comm/l0/topology/ring/cluster_group_device_creator.hpp
@@ -45,26 +45,26 @@ class cluster_group_device_creator {
     static size_t default_property_p2p_rating_calculator(const ccl_device& lhs,
                                                          const ccl_device& rhs);
 
-    static details::adjacency_matrix build_p2p_capability_matrix(
+    static detail::adjacency_matrix build_p2p_capability_matrix(
         std::ostream& out,
-        const ccl::process_device_indices_t& single_node_device_indices,
-        details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+        const ccl::process_device_indices_type& single_node_device_indices,
+        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
     bool build_all(std::ostream& out,
                    const ccl::context_comm_addr& comm_addr,
-                   const ccl::process_device_indices_t& cur_process_per_thread_device_indices,
-                   const details::adjacency_matrix& single_node_matrix,
-                   details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+                   const ccl::process_device_indices_type& cur_process_per_thread_device_indices,
+                   const detail::adjacency_matrix& single_node_matrix,
+                   detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 
     template <ccl::device_topology_type class_id>
     bool build_impl(
         std::ostream& out,
         const ccl::context_comm_addr& comm_addr,
-        const ccl::process_device_indices_t& cur_process_per_thread_device_indices,
-        const details::adjacency_matrix& single_node_matrix,
-        const std::vector<std::vector<details::colored_indexed_data<size_t>>>& syntetic_devices,
-        details::colored_plain_graph_list& graph_list,
+        const ccl::process_device_indices_type& cur_process_per_thread_device_indices,
+        const detail::adjacency_matrix& single_node_matrix,
+        const std::vector<std::vector<detail::colored_indexed_data<size_t>>>& syntetic_devices,
+        detail::colored_plain_graph_list& graph_list,
         std::map<size_t, size_t> process_device_rank_offset,
         size_t cluster_device_total_size,
-        details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 };
 } // namespace native
diff --git a/src/common/comm/l0/topology/ring/cluster_group_device_creator_impl.hpp b/src/common/comm/l0/topology/ring/cluster_group_device_creator_impl.hpp
index 24320fea5..09fa6bbef 100644
--- a/src/common/comm/l0/topology/ring/cluster_group_device_creator_impl.hpp
+++ b/src/common/comm/l0/topology/ring/cluster_group_device_creator_impl.hpp
@@ -36,19 +36,19 @@ inline cluster_group_device_creator::cluster_group_device_creator(size_t process
 inline size_t cluster_group_device_creator::default_property_p2p_rating_calculator(
     const ccl_device& lhs,
     const ccl_device& rhs) {
-    return details::property_p2p_rating_calculator(lhs, rhs, PROCESS_GROUP_WEIGHT);
+    return detail::property_p2p_rating_calculator(lhs, rhs, PROCESS_GROUP_WEIGHT);
 }
 
-inline details::adjacency_matrix cluster_group_device_creator::build_p2p_capability_matrix(
+inline detail::adjacency_matrix cluster_group_device_creator::build_p2p_capability_matrix(
     std::ostream& out,
-    const ccl::process_device_indices_t& single_node_device_indices,
-    details::p2p_rating_function ping) {
+    const ccl::process_device_indices_type& single_node_device_indices,
+    detail::p2p_rating_function ping) {
     // Build adjacency matrix with P2P capability:
     // Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
     // element values - is a weight of P2P activity: 0 means - devices are not connected
     // If values is not 0 - than two devies can be combined together
 
-    details::adjacency_matrix ring_p2p_matrix;
+    detail::adjacency_matrix ring_p2p_matrix;
     if (single_node_device_indices.empty()) {
         out << "No indices nothing to build" << std::endl;
         return ring_p2p_matrix;
@@ -64,14 +64,14 @@ inline details::adjacency_matrix cluster_group_device_creator::build_p2p_capabil
 inline bool cluster_group_device_creator::build_all(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_t& cur_process_per_thread_device_indices,
-    const details::adjacency_matrix& single_node_matrix,
-    details::p2p_rating_function ping) {
+    const ccl::process_device_indices_type& cur_process_per_thread_device_indices,
+    const detail::adjacency_matrix& single_node_matrix,
+    detail::p2p_rating_function ping) {
     out << "\n/************* \"" << cluster_group_device_creator::name()
         << "\" for threads: " << context.process_device_topology.size() << "*************/\n"
         << std::endl;
 
-    details::plain_graph_list my_device_graphs = details::graph_list_resolver(
+    detail::plain_graph_list my_device_graphs = detail::graph_list_resolver(
         single_node_matrix, cur_process_per_thread_device_indices, ping);
 
     size_t size = my_device_graphs.size();
@@ -82,26 +82,26 @@ inline bool cluster_group_device_creator::build_all(
     }
 
     out << "Transform graph to colored with process color: " << process_index << "\n";
-    details::colored_plain_graph_list my_colored_graphs =
-        details::create_colored(my_device_graphs, process_index);
+    detail::colored_plain_graph_list my_colored_graphs =
+        detail::create_colored(my_device_graphs, process_index);
 
-    out << "Process graphs:\n" << details::to_string(my_colored_graphs) << std::endl;
+    out << "Process graphs:\n" << detail::to_string(my_colored_graphs) << std::endl;
 
-    details::global_sorted_colored_plain_graphs global_graphs;
+    detail::global_sorted_colored_plain_graphs global_graphs;
     context.collect_cluster_colored_plain_graphs(my_colored_graphs, global_graphs);
 
     //calculate my devicses offset (rank) from cluster devices
     std::map<size_t, size_t> process_device_rank_offset;
     size_t accumulated_offset = 0;
-    for (typename details::global_sorted_colored_plain_graphs::value_type& process_graphs :
+    for (typename detail::global_sorted_colored_plain_graphs::value_type& process_graphs :
          global_graphs) {
         size_t process_num = process_graphs.first;
-        const details::colored_plain_graph_list& proc_graphs = process_graphs.second;
+        const detail::colored_plain_graph_list& proc_graphs = process_graphs.second;
 
         process_device_rank_offset[process_num] = accumulated_offset; //offset for iter process
         out << "Process idx: " << process_num << ", rank_offset: " << accumulated_offset
             << std::endl;
-        for (const details::colored_plain_graph& graph : proc_graphs) {
+        for (const detail::colored_plain_graph& graph : proc_graphs) {
             accumulated_offset += graph.size();
         }
     }
@@ -118,12 +118,12 @@ inline bool cluster_group_device_creator::build_all(
     ipc_devices_on_node.reserve(context.cluster_gpu_indices.size());
     processes_on_node.reserve(context.cluster_gpu_indices.size());
 
-    ccl::device_indices_t ipc_devices_candidates;
+    ccl::device_indices_type ipc_devices_candidates;
     for (const auto& node_conf : context.cluster_gpu_indices) {
         const ccl::host_id& hostname = node_conf.first;
-        const ccl::process_device_indices_t& processes = node_conf.second;
+        const ccl::process_device_indices_type& processes = node_conf.second;
 
-        ccl::device_indices_t node_device_intersection; //shared devics
+        ccl::device_indices_type node_device_intersection; //shared devics
 
         // each node should have the same processes count
         if (!processes_on_node.empty()) {
@@ -138,7 +138,7 @@ inline bool cluster_group_device_creator::build_all(
 
         //find shared devices for processes on node.
         for (auto it = processes.begin(); it != processes.end() && symm_test; ++it) {
-            ccl::device_indices_t result_intersection;
+            ccl::device_indices_type result_intersection;
             std::set_intersection(it->second.begin(),
                                   it->second.end(),
                                   node_device_intersection.begin(),
@@ -175,7 +175,7 @@ inline bool cluster_group_device_creator::build_all(
 
     // additional device types to inject in a final topology
     using thread_idx_t = size_t;
-    using colored_device_per_thread = details::colored_indexed_data<thread_idx_t>;
+    using colored_device_per_thread = detail::colored_indexed_data<thread_idx_t>;
 
     std::vector<colored_device_per_thread> ipc_devices;
     size_t ipc_links_per_proc = 0;
@@ -306,18 +306,18 @@ template <ccl::device_topology_type class_id>
 inline bool cluster_group_device_creator::build_impl(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_t& cur_process_per_thread_device_indices,
-    const details::adjacency_matrix& single_node_matrix,
-    const std::vector<std::vector<details::colored_indexed_data<size_t>>>& syntetic_devices,
-    details::colored_plain_graph_list& graph_list,
+    const ccl::process_device_indices_type& cur_process_per_thread_device_indices,
+    const detail::adjacency_matrix& single_node_matrix,
+    const std::vector<std::vector<detail::colored_indexed_data<size_t>>>& syntetic_devices,
+    detail::colored_plain_graph_list& graph_list,
     std::map<size_t, size_t> process_device_rank_offset,
     size_t cluster_device_total_size,
-    details::p2p_rating_function ping /* = default_property_p2p_rating_calculator*/) {
+    detail::p2p_rating_function ping /* = default_property_p2p_rating_calculator*/) {
     size_t ring_index = 0;
     out << "Start building topology: " << ::to_string(class_id)
         << ", for graphs: " << graph_list.size() << "\n"
         << "ring index: " << ring_index << std::endl;
-    out << details::to_string(graph_list);
+    out << detail::to_string(graph_list);
 
     auto& ctx_per_thread_data = context.process_device_topology;
     (void)ctx_per_thread_data;
@@ -348,7 +348,7 @@ inline bool cluster_group_device_creator::build_impl(
             std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
                 devices_factory.thread_gpu_comms.find(thread_id)->second;
             // create device comm wrappers and upgrade last devices in list up to numa type
-            details::color_t process;
+            detail::color_t process;
             (void)process;
             ccl::device_index_type last_in_graph_index;
             const auto& tmp = *id_ring.rbegin();
@@ -367,7 +367,7 @@ inline bool cluster_group_device_creator::build_impl(
                 }
 
                 auto proxy_virt =
-                    details::add_numa_proxy_device<ccl_virtual_gpu_comm, group_id(), class_id>(
+                    detail::add_numa_proxy_device<ccl_virtual_gpu_comm, group_id(), class_id>(
                         *non_indexed_plain_devices, last_in_graph_index, context, devices_factory);
                 if (proxy_virt) {
                     created_cpu_context_indices.insert(last_in_graph_index);
@@ -376,7 +376,7 @@ inline bool cluster_group_device_creator::build_impl(
                 }
                 else {
                     auto proxy_real =
-                        details::add_numa_proxy_device<ccl_gpu_comm, group_id(), class_id>(
+                        detail::add_numa_proxy_device<ccl_gpu_comm, group_id(), class_id>(
                             *non_indexed_plain_devices,
                             last_in_graph_index,
                             context,
@@ -446,7 +446,7 @@ inline bool cluster_group_device_creator::build_impl(
 
             // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
             auto rank_builder =
-                create_device_functor<details::colored_graph_ring_indexer<group_id(), class_id>>(
+                create_device_functor<detail::colored_graph_ring_indexer<group_id(), class_id>>(
                     id_ring,
                     thread_id,
                     process_index,
@@ -458,7 +458,7 @@ inline bool cluster_group_device_creator::build_impl(
             ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
 
             // print partial topology enumeration for 'graph' from 'graph_list'
-            details::printer<group_id(), class_id> p;
+            detail::printer<group_id(), class_id> p;
             ccl_tuple_for_each(out_indexed_devices->get_device_storage(), p);
             out << "Indexer result for devices in thread idx (" << thread_id << "/"
                 << ctx_per_thread_data.size() << "):\n"
@@ -498,19 +498,19 @@ inline bool cluster_group_device_creator::build_impl(
 
             //find max device rank in current thread devices
             const auto& curr_real =
-                details::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
+                detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_virt =
-                details::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
+                detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
                     indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_real = details::
+            const auto& curr_scale_real = detail::
                 get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, group_id(), class_id>(
                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_scale_virt =
-                details::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                  group_id(),
-                                                  class_id>(indexed_devices_for_current_thread,
-                                                            id_ring);
+                detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
+                                                 group_id(),
+                                                 class_id>(indexed_devices_for_current_thread,
+                                                           id_ring);
 
             size_t tg_max_rank = std::max({ std::get<0>(curr_real),
                                             std::get<0>(curr_virt),
@@ -547,19 +547,19 @@ inline bool cluster_group_device_creator::build_impl(
                 auto& next_thread_ring_topology = community->get_device_storage();
 
                 const auto& real =
-                    details::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
+                    detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
                         next_thread_ring_topology, id_ring);
                 const auto& virt =
-                    details::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
+                    detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
                         next_thread_ring_topology, id_ring);
                 const auto& scale_real =
-                    details::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>,
-                                                      group_id(),
-                                                      class_id>(next_thread_ring_topology, id_ring);
+                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>,
+                                                     group_id(),
+                                                     class_id>(next_thread_ring_topology, id_ring);
                 const auto& scale_virt =
-                    details::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                      group_id(),
-                                                      class_id>(next_thread_ring_topology, id_ring);
+                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
+                                                     group_id(),
+                                                     class_id>(next_thread_ring_topology, id_ring);
                 if (next_rank != std::min({ std::get<0>(real),
                                             std::get<0>(virt),
                                             std::get<0>(scale_real),
@@ -577,7 +577,7 @@ inline bool cluster_group_device_creator::build_impl(
                     << ")" << std::endl;
                 if (next_rank == std::get<0>(real)) {
                     auto locker =
-                        details::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
+                        detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
                             next_rank,
                             index_offset_for_graphs[graph_num],
                             real,
@@ -589,7 +589,7 @@ inline bool cluster_group_device_creator::build_impl(
                         << locker->to_string() << std::endl;
                 }
                 else if (next_rank == std::get<0>(virt)) {
-                    auto locker = details::
+                    auto locker = detail::
                         add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), class_id>(
                             next_rank,
                             index_offset_for_graphs[graph_num],
@@ -646,13 +646,13 @@ inline bool cluster_group_device_creator::build_impl(
                     auto& out_indexed_devices = community->get_device_storage();
 
                     size_t inserted_device_type_index =
-                        details::inject_scaleup_device<group_id(),
-                                                       class_id,
-                                                       process_group_context,
-                                                       ccl_gpu_comm,
-                                                       ccl_virtual_gpu_comm,
-                                                       ccl_numa_proxy<ccl_gpu_comm>,
-                                                       ccl_numa_proxy<ccl_virtual_gpu_comm>>(
+                        detail::inject_scaleup_device<group_id(),
+                                                      class_id,
+                                                      process_group_context,
+                                                      ccl_gpu_comm,
+                                                      ccl_virtual_gpu_comm,
+                                                      ccl_numa_proxy<ccl_gpu_comm>,
+                                                      ccl_numa_proxy<ccl_virtual_gpu_comm>>(
                             out_indexed_devices, idx.index, context, devices_factory);
                     if (inserted_device_type_index != std::numeric_limits<size_t>::max()) {
                         out << "Inject scaleUp device by order: " << inserted_device_type_index
@@ -687,7 +687,7 @@ inline bool cluster_group_device_creator::build_impl(
 
                     auto& out_indexed_devices = community->get_device_storage();
 
-                    size_t inserted_device_type_index = details::inject_scaleout_device<
+                    size_t inserted_device_type_index = detail::inject_scaleout_device<
                         group_id(),
                         class_id,
                         process_group_context,
@@ -730,7 +730,7 @@ inline bool cluster_group_device_creator::build_impl(
          ++per_thread_it) {
         size_t thread_id = per_thread_it->first;
 
-        details::printer<group_id(), class_id> p;
+        detail::printer<group_id(), class_id> p;
 
         std::shared_ptr<device_community<class_id>> community;
         if (graph_list.size() == 1) {
diff --git a/src/common/comm/l0/topology/ring/device_group_ring_creator.cpp b/src/common/comm/l0/topology/ring/device_group_ring_creator.cpp
index f41ca95c0..bfadfe3d6 100644
--- a/src/common/comm/l0/topology/ring/device_group_ring_creator.cpp
+++ b/src/common/comm/l0/topology/ring/device_group_ring_creator.cpp
@@ -25,13 +25,13 @@ device_group_ring_topology::device_group_ring_topology(device_group_context& com
 
 size_t device_group_ring_topology::default_property_p2p_rating_calculator(const ccl_device& lhs,
                                                                           const ccl_device& rhs) {
-    return details::property_p2p_rating_calculator(lhs, rhs, DEVICE_GROUP_WEIGHT);
+    return detail::property_p2p_rating_calculator(lhs, rhs, DEVICE_GROUP_WEIGHT);
 }
 
-details::adjacency_matrix device_group_ring_topology::build_p2p_capability_matrix(
+detail::adjacency_matrix device_group_ring_topology::build_p2p_capability_matrix(
     std::ostream& out,
-    const ccl::device_indices_t& group_device_indices,
-    details::p2p_rating_function ping) {
+    const ccl::device_indices_type& group_device_indices,
+    detail::p2p_rating_function ping) {
     // Build adjacency matrix between devices using `ping` function:
     // Default ping function is checking P2P access capabilities in a way:
     // 1) Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
@@ -45,10 +45,10 @@ details::adjacency_matrix device_group_ring_topology::build_p2p_capability_matri
     return get_platform().calculate_device_access_metric(group_device_indices, ping);
 }
 
-details::adjacency_matrix device_group_ring_topology::build_p2p_capability_matrix(
+detail::adjacency_matrix device_group_ring_topology::build_p2p_capability_matrix(
     std::ostream& out,
     const ccl::device_mask_t& group_device_masks,
-    details::p2p_rating_function ping) {
+    detail::p2p_rating_function ping) {
     // Build adjacency matrix between devices using `ping` function:
     // Default ping function is checking P2P access capabilities in a way:
     // 1) Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
@@ -62,13 +62,13 @@ details::adjacency_matrix device_group_ring_topology::build_p2p_capability_matri
 
 bool device_group_ring_topology::build(std::ostream& out,
                                        const ccl::context_comm_addr& comm_addr,
-                                       const ccl::device_indices_t& group_device_indices,
-                                       const details::adjacency_matrix& matrix) {
+                                       const ccl::device_indices_type& group_device_indices,
+                                       const detail::adjacency_matrix& matrix) {
     out << "\n/*************\"" << device_group_ring_topology::name() << "\"*************/\n"
         << std::endl;
 
     out << "Resolve device graph" << std::endl;
-    details::plain_graph_list id_rings = graph_list_resolver(matrix, group_device_indices);
+    detail::plain_graph_list id_rings = graph_list_resolver(matrix, group_device_indices);
 
     size_t size = id_rings.size();
     out << "Resolved graphs count: " << size << "\n";
@@ -91,7 +91,7 @@ bool device_group_ring_topology::build(std::ostream& out,
 bool device_group_ring_topology::build(std::ostream& out,
                                        const ccl::context_comm_addr& comm_addr,
                                        const ccl::device_mask_t& group_device_masks,
-                                       const details::adjacency_matrix& matrix) {
+                                       const detail::adjacency_matrix& matrix) {
     return build(
         out, comm_addr, native::ccl_device_driver::get_device_indices(group_device_masks), matrix);
 }
@@ -100,10 +100,10 @@ template <ccl::device_topology_type class_id>
 bool device_group_ring_topology::build_specific_topology(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_t& group_device_indices,
-    const details::plain_graph& graph) {
+    const ccl::device_indices_type& group_device_indices,
+    const detail::plain_graph& graph) {
     out << "Start building topology: " << ::to_string(class_id) << ", for graph:\n";
-    out << details::to_string(graph);
+    out << detail::to_string(graph);
 
     size_t thread_id = comm_addr.thread_idx;
     auto topology_comm_addr = comm_addr;
@@ -111,16 +111,16 @@ bool device_group_ring_topology::build_specific_topology(
     auto device_topology = std::make_shared<device_community<class_id>>(topology_comm_addr);
 
     out << "\nStart indexer for thread: " << thread_id << std::endl;
-    details::id_thread_table assigned_ids;
-    std::vector<details::marked_idx> marked_id_ring = details::create_marked(graph);
-    auto rank_builder = create_device_functor<details::graph_ring_indexer<group_id(), class_id>>(
+    detail::id_thread_table assigned_ids;
+    std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(graph);
+    auto rank_builder = create_device_functor<detail::graph_ring_indexer<group_id(), class_id>>(
         marked_id_ring, assigned_ids, thread_id, device_topology->get_device_storage());
     std::shared_ptr<specific_plain_device_storage> group_gpu_comms =
         devices_factory.create_devices_by_indices(thread_id, group_device_indices);
 
     ccl_tuple_for_each(*group_gpu_comms, rank_builder);
 
-    details::printer<group_id(), class_id> p;
+    detail::printer<group_id(), class_id> p;
     ccl_tuple_for_each(*group_gpu_comms, p);
     out << "Indexer result: \n" << p.to_string();
 
@@ -131,16 +131,17 @@ bool device_group_ring_topology::build_specific_topology(
     return true;
 }
 
-bool device_group_ring_topology::build_specific(std::ostream& out,
-                                                const ccl::context_comm_addr& comm_addr,
-                                                const ccl::device_indices_t& group_device_indices,
-                                                const details::plain_graph& graph,
-                                                const details::adjacency_matrix& matrix) {
+bool device_group_ring_topology::build_specific(
+    std::ostream& out,
+    const ccl::context_comm_addr& comm_addr,
+    const ccl::device_indices_type& group_device_indices,
+    const detail::plain_graph& graph,
+    const detail::adjacency_matrix& matrix) {
     bool result = build_specific_topology<ccl::device_topology_type::ring>(
         out, comm_addr, group_device_indices, graph);
     /*
     // check a2a possibility
-    bool a2a_capable = details::check_graph_a2a_capable(graph, matrix,out);
+    bool a2a_capable = detail::check_graph_a2a_capable(graph, matrix,out);
     if (a2a_capable)
     {
         // a2a should starts from real device
@@ -162,11 +163,11 @@ template <ccl::device_topology_type class_id>
 bool device_group_ring_topology::build_scale_up_specific_topology(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_t& group_device_indices,
-    const details::plain_graph_list& graph_list) {
+    const ccl::device_indices_type& group_device_indices,
+    const detail::plain_graph_list& graph_list) {
     out << "Start building topology: " << ::to_string(class_id)
         << ", for graphs: " << graph_list.size() << "\n";
-    out << details::to_string(graph_list);
+    out << detail::to_string(graph_list);
 
     size_t thread_id = comm_addr.thread_idx;
     size_t graph_num = 0;
@@ -174,7 +175,7 @@ bool device_group_ring_topology::build_scale_up_specific_topology(
 
     // create all required device wrappers
     // these wrappers would be used for ALL context at the next iteration
-    ccl::device_indices_t total_device_indices;
+    ccl::device_indices_type total_device_indices;
     for (const auto& graph : graph_list) {
         total_device_indices.insert(graph.begin(), graph.end());
     }
@@ -193,10 +194,10 @@ bool device_group_ring_topology::build_scale_up_specific_topology(
         out << "\nStart indexer for graph num: " << graph_num << ", thread: " << thread_id
             << std::endl;
 
-        details::id_thread_table assigned_ids;
-        std::vector<details::marked_idx> marked_id_ring = details::create_marked(graph);
+        detail::id_thread_table assigned_ids;
+        std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(graph);
         auto rank_builder =
-            create_device_functor<details::graph_ring_indexer_unique_index<group_id(), class_id>>(
+            create_device_functor<detail::graph_ring_indexer_unique_index<group_id(), class_id>>(
                 marked_id_ring,
                 assigned_ids,
                 thread_id,
@@ -208,15 +209,14 @@ bool device_group_ring_topology::build_scale_up_specific_topology(
         // all loca group devices in different graph would be linked by scale_up_proxy
         // each local group ( in graph) must have at least one scale_up_proxy device
         const ccl::device_index_type& last_in_graph_index = *graph.rbegin();
-        auto scale_virt =
-            details::add_numa_proxy_device<ccl_virtual_gpu_comm, group_id(), class_id>(
-                *group_gpu_comms, last_in_graph_index, context, devices_factory);
+        auto scale_virt = detail::add_numa_proxy_device<ccl_virtual_gpu_comm, group_id(), class_id>(
+            *group_gpu_comms, last_in_graph_index, context, devices_factory);
         if (scale_virt) {
             out << "Added scaleup virtual device:\n"
                 << scale_virt->to_string() << "\nby idx: " << last_in_graph_index << std::endl;
         }
         else {
-            auto scale_real = details::add_numa_proxy_device<ccl_gpu_comm, group_id(), class_id>(
+            auto scale_real = detail::add_numa_proxy_device<ccl_gpu_comm, group_id(), class_id>(
                 *group_gpu_comms, last_in_graph_index, context, devices_factory);
             if (scale_real) {
                 out << "Added scaleup real device:\n"
@@ -243,7 +243,7 @@ bool device_group_ring_topology::build_scale_up_specific_topology(
         ccl_tuple_for_each(*group_gpu_comms, rank_builder);
 
         // just print partial topology progress for current 'graph'
-        details::printer<group_id(), class_id> p;
+        detail::printer<group_id(), class_id> p;
         ccl_tuple_for_each(device_topology->get_device_storage(), p);
         out << "\nIndexer for graph num: " << graph_num++ << ", result: \n" << p.to_string();
 
@@ -255,7 +255,7 @@ bool device_group_ring_topology::build_scale_up_specific_topology(
     // remember constructed topology
     context.device_topology.get_community<class_id>().set_additiona_topology(device_topology);
 
-    details::printer<group_id(), class_id> p;
+    detail::printer<group_id(), class_id> p;
     ccl_tuple_for_each(device_topology->get_device_storage(), p);
     out << "\nFinal topology: \n" << p.to_string();
     return true;
@@ -264,9 +264,9 @@ bool device_group_ring_topology::build_scale_up_specific_topology(
 bool device_group_ring_topology::build_scale_up_specific(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
-    const ccl::device_indices_t& group_device_indices,
-    const details::plain_graph_list& graph_list,
-    const details::adjacency_matrix& matrix) {
+    const ccl::device_indices_type& group_device_indices,
+    const detail::plain_graph_list& graph_list,
+    const detail::adjacency_matrix& matrix) {
     bool result = build_scale_up_specific_topology<ccl::device_topology_type::ring>(
         out, comm_addr, group_device_indices, graph_list);
     /*
@@ -274,7 +274,7 @@ bool device_group_ring_topology::build_scale_up_specific(
     bool a2a_capable = true;
     for (const auto& graph : graph_list)
     {
-        a2a_capable &= details::check_graph_a2a_capable(graph, matrix, out);
+        a2a_capable &= detail::check_graph_a2a_capable(graph, matrix, out);
     }
 
     if (a2a_capable)
diff --git a/src/common/comm/l0/topology/ring/device_group_ring_creator.hpp b/src/common/comm/l0/topology/ring/device_group_ring_creator.hpp
index 50a741a21..26962f1ba 100644
--- a/src/common/comm/l0/topology/ring/device_group_ring_creator.hpp
+++ b/src/common/comm/l0/topology/ring/device_group_ring_creator.hpp
@@ -39,47 +39,47 @@ class device_group_ring_topology {
 
     static size_t default_property_p2p_rating_calculator(const ccl_device& lhs,
                                                          const ccl_device& rhs);
-    static details::adjacency_matrix build_p2p_capability_matrix(
+    static detail::adjacency_matrix build_p2p_capability_matrix(
         std::ostream& out,
-        const ccl::device_indices_t& group_device_indices,
-        details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+        const ccl::device_indices_type& group_device_indices,
+        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 
-    static details::adjacency_matrix build_p2p_capability_matrix(
+    static detail::adjacency_matrix build_p2p_capability_matrix(
         std::ostream& out,
         const ccl::device_mask_t& group_device_masks,
-        details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
     bool build(std::ostream& out,
                const ccl::context_comm_addr& comm_addr,
                const ccl::device_mask_t& group_device_masks,
-               const details::adjacency_matrix& matrix);
+               const detail::adjacency_matrix& matrix);
     bool build(std::ostream& out,
                const ccl::context_comm_addr& comm_addr,
-               const ccl::device_indices_t& group_device_indices,
-               const details::adjacency_matrix& matrix);
+               const ccl::device_indices_type& group_device_indices,
+               const detail::adjacency_matrix& matrix);
 
 private:
     bool build_specific(std::ostream& out,
                         const ccl::context_comm_addr& comm_addr,
-                        const ccl::device_indices_t& group_device_indices,
-                        const details::plain_graph& graph,
-                        const details::adjacency_matrix& matrix);
+                        const ccl::device_indices_type& group_device_indices,
+                        const detail::plain_graph& graph,
+                        const detail::adjacency_matrix& matrix);
 
     template <ccl::device_topology_type topology_type>
     bool build_specific_topology(std::ostream& out,
                                  const ccl::context_comm_addr& comm_addr,
-                                 const ccl::device_indices_t& group_device_indices,
-                                 const details::plain_graph& graph);
+                                 const ccl::device_indices_type& group_device_indices,
+                                 const detail::plain_graph& graph);
 
     bool build_scale_up_specific(std::ostream& out,
                                  const ccl::context_comm_addr& comm_addr,
-                                 const ccl::device_indices_t& group_device_indices,
-                                 const details::plain_graph_list& graph_list,
-                                 const details::adjacency_matrix& matrix);
+                                 const ccl::device_indices_type& group_device_indices,
+                                 const detail::plain_graph_list& graph_list,
+                                 const detail::adjacency_matrix& matrix);
 
     template <ccl::device_topology_type topology_type>
     bool build_scale_up_specific_topology(std::ostream& out,
                                           const ccl::context_comm_addr& comm_addr,
-                                          const ccl::device_indices_t& group_device_indices,
-                                          const details::plain_graph_list& graph);
+                                          const ccl::device_indices_type& group_device_indices,
+                                          const detail::plain_graph_list& graph);
 };
 } // namespace native
diff --git a/src/common/comm/l0/topology/ring/process_group_ring_creator.cpp b/src/common/comm/l0/topology/ring/process_group_ring_creator.cpp
index f90244b40..3b3436880 100644
--- a/src/common/comm/l0/topology/ring/process_group_ring_creator.cpp
+++ b/src/common/comm/l0/topology/ring/process_group_ring_creator.cpp
@@ -41,7 +41,7 @@ size_t
     allied_process_group_ring_topology::default_property_p2p_rating_calculator(const ccl_device &lhs,
                                                                                const ccl_device &rhs)
 {
-    return details::property_p2p_rating_calculator(lhs, rhs, PROCESS_GROUP_WEIGHT);
+    return detail::property_p2p_rating_calculator(lhs, rhs, PROCESS_GROUP_WEIGHT);
 }
 
 std::pair<size_t, size_t>
@@ -110,12 +110,12 @@ allied_process_group_ring_topology::calculate_rank_offset_with_size(size_t proce
     }
 
 
-details::adjacency_matrix
+detail::adjacency_matrix
         allied_process_group_ring_topology::build_p2p_capability_matrix(std::ostream& out,
                                                                         const ccl::process_aggregated_device_mask_t& node_device_masks,
-                                                                        details::p2p_rating_function ping)
+                                                                        detail::p2p_rating_function ping)
 {
-    ccl::process_device_indices_t per_process_device_indices;
+    ccl::process_device_indices_type per_process_device_indices;
     for(const auto& mask : node_device_masks)
     {
         per_process_device_indices.insert({mask.first, ccl_device_driver::get_device_indices(mask.second)});
@@ -125,17 +125,17 @@ details::adjacency_matrix
                                        ping);
 }
 
-details::adjacency_matrix
+detail::adjacency_matrix
     allied_process_group_ring_topology::build_p2p_capability_matrix(std::ostream& out,
-                                                                    const ccl::process_device_indices_t& node_device_indices,
-                                                                    details::p2p_rating_function ping)
+                                                                    const ccl::process_device_indices_type& node_device_indices,
+                                                                    detail::p2p_rating_function ping)
 {
     // Build adjacency matrix with P2P capability:
     // Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
     // element values - is a weight of P2P activity: 0 means - devices are not connected
     // If values is not 0 - than two devies can be combined together
 
-    details::adjacency_matrix ring_p2p_matrix;
+    detail::adjacency_matrix ring_p2p_matrix;
     if (node_device_indices.empty())
     {
         out << "No indices nothing to build" << std::endl;
@@ -153,17 +153,17 @@ details::adjacency_matrix
 bool allied_process_group_ring_topology::build(std::ostream& out,
                                                 const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
                                                 const std::vector<ccl::device_mask_t>& ipc_device_mask,
-                                                const details::adjacency_matrix& matrix,
-                                                details::p2p_rating_function ping)
+                                                const detail::adjacency_matrix& matrix,
+                                                detail::p2p_rating_function ping)
 {
 
-    ccl::process_device_indices_t per_thread_device_indices;
+    ccl::process_device_indices_type per_thread_device_indices;
     for(const auto& mask : per_thread_device_masks)
     {
         per_thread_device_indices.insert({mask.first, ccl_device_driver::get_device_indices(mask.second)});
     }
 
-    std::vector<ccl::device_indices_t> ipc_device_indices;
+    std::vector<ccl::device_indices_type> ipc_device_indices;
     for(const auto& mask : ipc_device_mask)
     {
         ipc_device_indices.push_back(ccl_device_driver::get_device_indices(mask));
@@ -172,17 +172,17 @@ bool allied_process_group_ring_topology::build(std::ostream& out,
 }
 
 bool allied_process_group_ring_topology::build(std::ostream& out,
-               const ccl::process_device_indices_t& per_thread_device_indices,
-               const std::vector<ccl::device_indices_t>& ipc_device_indices,
-               const details::adjacency_matrix& matrix,
-               details::p2p_rating_function ping)
+               const ccl::process_device_indices_type& per_thread_device_indices,
+               const std::vector<ccl::device_indices_type>& ipc_device_indices,
+               const detail::adjacency_matrix& matrix,
+               detail::p2p_rating_function ping)
 {
     out << "\n/************* \"" << allied_process_group_ring_topology::name()
         << "\" for threads: " << context.process_device_topology.size()
         << "*************/\n" << std::endl;
 
     // let's emulate process as thread, because topology builder is similar with thread topology
-    ccl::process_device_indices_t full_device_indices = per_thread_device_indices;
+    ccl::process_device_indices_type full_device_indices = per_thread_device_indices;
     size_t max_current_thread_id = per_thread_device_indices.rbegin()->first;
     out << "Assign specific-mock thread id for ipc_devices, count: "
         << ipc_device_indices.size() << std::endl;
@@ -205,7 +205,7 @@ bool allied_process_group_ring_topology::build(std::ostream& out,
 
     // build ring, based on p2p for device hw id
     out << "Resolve device graph" << std::endl;
-    details::plain_graph_list id_rings = graph_list_resolver(matrix, full_device_indices, ping);
+    detail::plain_graph_list id_rings = graph_list_resolver(matrix, full_device_indices, ping);
     size_t size = id_rings.size();
     out << "Resolved graphs count: " << size << "\n";
     if (!size)
@@ -223,16 +223,16 @@ bool allied_process_group_ring_topology::build(std::ostream& out,
 }
 
 bool allied_process_group_ring_topology::build_all(std::ostream& out,
-                                                  const ccl::process_device_indices_t& per_thread_device_indices,
-                                                  const std::vector<ccl::device_indices_t>& ipc_device_indices,
-                                                  const details::adjacency_matrix& matrix,
-                                                  details::p2p_rating_function ping)
+                                                  const ccl::process_device_indices_type& per_thread_device_indices,
+                                                  const std::vector<ccl::device_indices_type>& ipc_device_indices,
+                                                  const detail::adjacency_matrix& matrix,
+                                                  detail::p2p_rating_function ping)
 {
     out << "\n/************* \"" << allied_process_group_ring_topology::name()
         << "\" for threads: " << context.process_device_topology.size()
         << "*************/\n" << std::endl;
 
-    details::plain_graph_list my_rings = create_my_process_graphs(out,
+    detail::plain_graph_list my_rings = create_my_process_graphs(out,
                                                                   per_thread_device_indices,
                                                                   matrix,
                                                                   ping);
@@ -245,34 +245,34 @@ bool allied_process_group_ring_topology::build_all(std::ostream& out,
     }
 
     out << "Graph for process: " << process_index << "\n";
-    out << details::to_string(my_rings) << std::endl;
+    out << detail::to_string(my_rings) << std::endl;
 
     out << "Transform graph to colored with process color: " << process_index << "\n";
-    details::colored_plain_graph_list my_colored_ring = details::create_colored(my_rings, process_index);
+    detail::colored_plain_graph_list my_colored_ring = detail::create_colored(my_rings, process_index);
 
-    details::global_sorted_colored_plain_graphs global_graphs =
+    detail::global_sorted_colored_plain_graphs global_graphs =
                                     collect_cluster_colored_plain_graphs(out,
                                                                  context.get_communicator(),
                                                                  process_index, my_colored_ring);
 
     std::map<size_t, size_t> process_device_rank_offset;
     size_t accumulated_offset = 0;
-    for (typename details::global_sorted_colored_plain_graphs::value_type& process_graphs : global_graphs)
+    for (typename detail::global_sorted_colored_plain_graphs::value_type& process_graphs : global_graphs)
     {
         size_t process_num = process_graphs.first;
-        const details::colored_plain_graph_list& proc_graphs = process_graphs.second;
+        const detail::colored_plain_graph_list& proc_graphs = process_graphs.second;
 
         process_device_rank_offset[process_num] = accumulated_offset;  //offset for iter process
         out << "Process idx: " << process_num
             << ", rank_offset: " << accumulated_offset << std::endl;
-        for (const details::colored_plain_graph& graph : proc_graphs)
+        for (const detail::colored_plain_graph& graph : proc_graphs)
         {
             accumulated_offset += graph.size();
         }
     }
 
     out << "Cluster device size: " << accumulated_offset << std::endl;
-    details::global_colored_plain_graphs merged_cluster_graphs =
+    detail::global_colored_plain_graphs merged_cluster_graphs =
                                     merge_allied_nodes_in_colored_plain_graphs(out,
                                                                     context.cluster_gpu_indices,
                                                                     process_index, process_count,
@@ -280,23 +280,23 @@ bool allied_process_group_ring_topology::build_all(std::ostream& out,
                                                                     ping);
 
     out << "Cluster merged graphs result on process idx: " << process_index << std::endl;
-    out << details::to_string(merged_cluster_graphs) << std::endl;
+    out << detail::to_string(merged_cluster_graphs) << std::endl;
 
-    details::colored_plain_graph_list my_merged_rings =
+    detail::colored_plain_graph_list my_merged_rings =
             resize_merged_colored_graphs_for_process(process_index, process_count,
                                                      merged_cluster_graphs,
                                                      my_colored_ring, out);
 
     out << "Resized merged graph list on process idx: " << process_index << std::endl;
-    out << details::to_string(my_merged_rings) << std::endl;
+    out << detail::to_string(my_merged_rings) << std::endl;
 
     out << "Notify merged graphs changes for cluster\n";
-    details::global_sorted_colored_plain_graphs global_merged_graphs =
+    detail::global_sorted_colored_plain_graphs global_merged_graphs =
                                     collect_cluster_colored_plain_graphs(out,
                                                                  context.get_communicator(),
                                                                  process_index, my_merged_rings);
 
-    ccl::process_device_indices_t scaleout_devices =
+    ccl::process_device_indices_type scaleout_devices =
                         create_scaleout_devices_in_colored_graphs_for_process(
                                                                 process_index,
                                                                 process_count,
@@ -316,7 +316,7 @@ bool allied_process_group_ring_topology::build_all(std::ostream& out,
     }
     out << std::endl;
 
-    ccl::process_device_indices_t ipc_devices =
+    ccl::process_device_indices_type ipc_devices =
                         create_ipc_devices_in_colored_graphs_for_process(
                                                                 process_index,
                                                                 process_count,
@@ -339,7 +339,7 @@ bool allied_process_group_ring_topology::build_all(std::ostream& out,
     my_merged_rings = global_merged_graphs.find(process_index)->second;
     out << "Final process idx: " << process_index
         << ", has got colored graphs count: " << my_merged_rings.size() << std::endl;
-    out << details::to_string(my_merged_rings) << std::endl;
+    out << detail::to_string(my_merged_rings) << std::endl;
 
     // enumerate as usual
     if (scaleout_devices.empty())
@@ -368,24 +368,24 @@ bool allied_process_group_ring_topology::build_all(std::ostream& out,
                           my_merged_rings, process_device_rank_offset);
 }
 
-details::plain_graph_list
+detail::plain_graph_list
         allied_process_group_ring_topology::create_my_process_graphs(
                                 std::ostream& out,
-                                const ccl::process_device_indices_t& per_thread_device_indices,
-                                const details::adjacency_matrix& matrix,
-                                details::p2p_rating_function ping)
+                                const ccl::process_device_indices_type& per_thread_device_indices,
+                                const detail::adjacency_matrix& matrix,
+                                detail::p2p_rating_function ping)
 {
     out << "Build device graphs, from threads: " << per_thread_device_indices.size() << std::endl;
-    return details::graph_list_resolver(matrix, per_thread_device_indices, ping);
+    return detail::graph_list_resolver(matrix, per_thread_device_indices, ping);
 
 }
-details::global_sorted_plain_graphs
+detail::global_sorted_plain_graphs
         allied_process_group_ring_topology::collect_cluster_plain_graphs(std::ostream& out,
                                                                          std::shared_ptr<ccl::communicator> comm,
                                                                          size_t process_index,
-                                                                         const details::plain_graph_list& my_process_graph)
+                                                                         const detail::plain_graph_list& my_process_graph)
 {
-    using namespace details::serialize;
+    using namespace detail::serialize;
 
     out << "Collect cluster plain graphs, my process index: " << process_index
         << ", graphs count: " << my_process_graph.size() << std::endl;
@@ -397,7 +397,7 @@ details::global_sorted_plain_graphs
     size_t send_count = my_serialized_graph.size();
     std::vector<size_t> receive_process_graph_sizes(comm->size());
 
-    //std::vector<ccl::communicator::coll_request_t> requests;
+    //std::vector<ccl::event> requests;
     out << "Ask graph lists sizes by process index: " << process_index
         << ", serialized size: " << send_count << std::endl;
     auto req = ccl::allgatherv(&send_count, 1,
@@ -432,12 +432,12 @@ details::global_sorted_plain_graphs
 
     size_t deserialized_bytes = 0;
     size_t offset_bytes = 0;
-    details::global_sorted_plain_graphs global_ret;
+    detail::global_sorted_plain_graphs global_ret;
 
     out << "Deserialize graph_lists" << std::endl;
     for(size_t i = 0; i < comm->size(); i++)
     {
-        details::plain_graph_list graph =
+        detail::plain_graph_list graph =
                 device_path_deserializer::deserialize_graph_list_indices(global_serialized_graph,
                                                                          deserialized_bytes,
                                                                          offset_bytes);
@@ -451,14 +451,14 @@ details::global_sorted_plain_graphs
     return global_ret;
 }
 
-details::global_sorted_colored_plain_graphs
+detail::global_sorted_colored_plain_graphs
         allied_process_group_ring_topology::collect_cluster_colored_plain_graphs(
                                                     std::ostream& out,
                                                     std::shared_ptr<ccl::communicator> comm,
                                                     size_t process_index,
-                                                    const details::colored_plain_graph_list& my_process_graph)
+                                                    const detail::colored_plain_graph_list& my_process_graph)
 {
-    using namespace details::serialize;
+    using namespace detail::serialize;
 
     out << "Collect cluster colored plain graphs, my process index: " << process_index
         << ", graphs count: " << my_process_graph.size() << std::endl;
@@ -470,7 +470,7 @@ details::global_sorted_colored_plain_graphs
     size_t send_count = my_serialized_graph.size();
     std::vector<size_t> receive_process_graph_sizes(comm->size());
 
-    //std::vector<ccl::communicator::coll_request_t> requests;
+    //std::vector<ccl::event> requests;
     out << "Ask graph lists sizes by process index: " << process_index
         << ", serialized size: " << send_count << std::endl;
     auto req = ccl::allgatherv(&send_count, 1,
@@ -504,12 +504,12 @@ details::global_sorted_colored_plain_graphs
 
     size_t deserialized_bytes = 0;
     size_t offset_bytes = 0;
-    details::global_sorted_colored_plain_graphs global_ret;
+    detail::global_sorted_colored_plain_graphs global_ret;
 
     out << "Deserialize colored_graph_lists" << std::endl;
     for(size_t i = 0; i < comm->size(); i++)
     {
-        details::colored_plain_graph_list graph =
+        detail::colored_plain_graph_list graph =
                 device_path_deserializer::deserialize_colored_graph_list_indices(global_serialized_graph,
                                                                                  deserialized_bytes,
                                                                                  offset_bytes);
@@ -524,26 +524,26 @@ details::global_sorted_colored_plain_graphs
 }
 
 
-details::global_plain_graphs
+detail::global_plain_graphs
         allied_process_group_ring_topology::merge_allied_nodes_plain_graphs(std::ostream& out,
-                                                                            const ccl::cluster_device_indices_t &cluster_indices,
+                                                                            const ccl::cluster_device_indices_type &cluster_indices,
                                                                             size_t process_index,
-                                                                            const details::global_sorted_plain_graphs& cluster_graphs,
-                                                                            details::p2p_rating_function ping)
+                                                                            const detail::global_sorted_plain_graphs& cluster_graphs,
+                                                                            detail::p2p_rating_function ping)
 {
     out << "Merge global graphs from processes: " << cluster_graphs.size() << std::endl;
-    details::global_plain_graphs ret;
+    detail::global_plain_graphs ret;
     for (const auto &host_process_id_pair : cluster_indices)
     {
         const ccl::host_id& hostname = host_process_id_pair.first;
 
         //iterate over all allied processes on the same host
-        const ccl::process_device_indices_t& processes = host_process_id_pair.second;
+        const ccl::process_device_indices_type& processes = host_process_id_pair.second;
         out << "Try to merge graphs for host: " << hostname << ", allied processes count: "
             << processes.size() << std::endl;
 
         //collect graphs for all allied processes in lists for merge trying
-        std::list<details::plain_graph_list> tmp_allied_processes_graphs;
+        std::list<detail::plain_graph_list> tmp_allied_processes_graphs;
         for (const auto& process_val : processes)
         {
             auto process_id = process_val.first;
@@ -564,7 +564,7 @@ details::global_plain_graphs
         for (const auto& process_val : processes)
         {
             //merge_lists is stable, let's my process graph list at first in merge result
-            std::list<details::plain_graph_list> rotated = tmp_allied_processes_graphs;
+            std::list<detail::plain_graph_list> rotated = tmp_allied_processes_graphs;
             /* TODO rotate ? */
             auto process_index = process_val.first;
 
@@ -573,7 +573,7 @@ details::global_plain_graphs
             std::rotate(rotated.begin(), new_begin_it, rotated.end());
 
             ret.push_back(std::make_pair(process_val.first,
-                                         details::merge_graph_lists_stable(rotated,
+                                         detail::merge_graph_lists_stable(rotated,
                                                                            ping)));
         }
 
@@ -582,28 +582,28 @@ details::global_plain_graphs
     return ret;
 }
 
-details::global_colored_plain_graphs
+detail::global_colored_plain_graphs
         allied_process_group_ring_topology::merge_allied_nodes_in_colored_plain_graphs(
                                                 std::ostream& out,
-                                                const ccl::cluster_device_indices_t &cluster_indices,
+                                                const ccl::cluster_device_indices_type &cluster_indices,
                                                 size_t process_index,
                                                 size_t process_count,
-                                                const details::global_sorted_colored_plain_graphs& cluster_graphs,
-                                                details::p2p_rating_function ping)
+                                                const detail::global_sorted_colored_plain_graphs& cluster_graphs,
+                                                detail::p2p_rating_function ping)
 {
     out << "Merge global colored graphs from processes: " << cluster_graphs.size() << std::endl;
-    details::global_colored_plain_graphs ret;
+    detail::global_colored_plain_graphs ret;
     for (const auto &host_process_id_pair : cluster_indices)
     {
         const ccl::host_id& hostname = host_process_id_pair.first;
 
         //iterate over all allied processes on the same host
-        const ccl::process_device_indices_t& processes = host_process_id_pair.second;
+        const ccl::process_device_indices_type& processes = host_process_id_pair.second;
         out << "Try to merge colored graphs for host: " << hostname << ", allied processes count: "
             << processes.size() << std::endl;
 
         //collect graphs for all allied processes in lists for merge trying
-        std::list<details::colored_plain_graph_list> tmp_allied_processes_graphs;
+        std::list<detail::colored_plain_graph_list> tmp_allied_processes_graphs;
 
         size_t terminator_process_index = 0;// TODO LIMITATION on MAX PROCESSES COUNT
         for (const auto& process_val : processes)
@@ -638,7 +638,7 @@ details::global_colored_plain_graphs
             //turn right
             auto new_begin_it = tmp_allied_processes_graphs.begin();
             std::advance(new_begin_it, process_index);
-            std::list<details::colored_plain_graph_list> to_right_part(new_begin_it,
+            std::list<detail::colored_plain_graph_list> to_right_part(new_begin_it,
                                                                   tmp_allied_processes_graphs.end());
 
             //use terminator!
@@ -647,15 +647,15 @@ details::global_colored_plain_graphs
                 if (process_index == processes.size() - 1)
                 {
                     //set terminator for right side
-                    details::colored_plain_graph_list terminated_list = *tmp_allied_processes_graphs.begin();
+                    detail::colored_plain_graph_list terminated_list = *tmp_allied_processes_graphs.begin();
                     reset_color(terminated_list, terminator_process_index);
                     to_right_part.push_back(std::move(terminated_list));
                 }
             }
 
             size_t merged_from_right = 0;
-            details::colored_plain_graph_list to_right =
-                    details::merge_graph_lists_stable_for_process(to_right_part, ping,
+            detail::colored_plain_graph_list to_right =
+                    detail::merge_graph_lists_stable_for_process(to_right_part, ping,
                                                                   true, merged_from_right);
             if (to_right.empty())   //i am the rightest process
             {
@@ -667,7 +667,7 @@ details::global_colored_plain_graphs
             size_t merged_from_left = 0;
             auto new_end_it = tmp_allied_processes_graphs.begin();
             std::advance(new_end_it, process_index + 1);
-            std::list<details::colored_plain_graph_list> to_left_part(tmp_allied_processes_graphs.begin(),
+            std::list<detail::colored_plain_graph_list> to_left_part(tmp_allied_processes_graphs.begin(),
                                                                  new_end_it);
             std::reverse(to_left_part.begin(), to_left_part.end());
             if(to_left_part.empty())
@@ -685,7 +685,7 @@ details::global_colored_plain_graphs
                 if (process_index == 0)
                 {
                     //set terminator for right side
-                    details::colored_plain_graph_list terminated_list = *tmp_allied_processes_graphs.rbegin();
+                    detail::colored_plain_graph_list terminated_list = *tmp_allied_processes_graphs.rbegin();
                     reset_color(terminated_list, terminator_process_index);
                     to_left_part.push_back(std::move(terminated_list));
                 }
@@ -696,8 +696,8 @@ details::global_colored_plain_graphs
             }
             *to_left_part.begin() = to_right;
 
-            details::colored_plain_graph_list to_left_right =
-                    details::merge_graph_lists_stable_for_process(to_left_part, ping,
+            detail::colored_plain_graph_list to_left_right =
+                    detail::merge_graph_lists_stable_for_process(to_left_part, ping,
                                                                   false, merged_from_left);
             ret.push_back(std::make_pair(process_val.first,
                                          to_left_right));
@@ -708,16 +708,16 @@ details::global_colored_plain_graphs
     return ret;
 }
 
-details::plain_graph_list
+detail::plain_graph_list
         allied_process_group_ring_topology::resize_merged_graphs_for_process(
                                                     size_t process_index,
-                                                    const details::global_plain_graphs& merged_cluster_graphs,
-                                                    const details::plain_graph_list& original_graph_list,
+                                                    const detail::global_plain_graphs& merged_cluster_graphs,
+                                                    const detail::plain_graph_list& original_graph_list,
                                                     std::ostream& out)
 {
     out << "remove foreign chains from my merged graphs for process idx: " << process_index <<"\n";
     auto it = std::find_if(merged_cluster_graphs.begin(), merged_cluster_graphs.end(),
-                           [process_index] (const typename details::global_plain_graphs::value_type& val)
+                           [process_index] (const typename detail::global_plain_graphs::value_type& val)
                            {
                                return val.first == process_index;
                            });
@@ -732,7 +732,7 @@ details::plain_graph_list
                                                  ss.str());
     }
 
-    details::plain_graph_list my_merged_rings_copy = it->second;
+    detail::plain_graph_list my_merged_rings_copy = it->second;
     {
         size_t new_size = my_merged_rings_copy.size();
         size_t old_size = original_graph_list.size();
@@ -750,17 +750,17 @@ details::plain_graph_list
     return my_merged_rings_copy;
 }
 
-details::colored_plain_graph_list
+detail::colored_plain_graph_list
         allied_process_group_ring_topology::resize_merged_colored_graphs_for_process(
                                             size_t process_index,
                                             size_t process_size,
-                                            const details::global_colored_plain_graphs& merged_cluster_graphs,
-                                            const details::colored_plain_graph_list& original_graph_list,
+                                            const detail::global_colored_plain_graphs& merged_cluster_graphs,
+                                            const detail::colored_plain_graph_list& original_graph_list,
                                             std::ostream& out)
 {
     out << "remove foreign chains from my colored merged graphs for process idx: " << process_index <<"\n";
     auto it = std::find_if(merged_cluster_graphs.begin(), merged_cluster_graphs.end(),
-              [process_index] (const typename details::global_colored_plain_graphs::value_type& val)
+              [process_index] (const typename detail::global_colored_plain_graphs::value_type& val)
               {
                    return val.first == process_index;
               });
@@ -774,7 +774,7 @@ details::colored_plain_graph_list
                                                  ss.str());
     }
 
-    details::colored_plain_graph_list my_merged_rings_copy = it->second;
+    detail::colored_plain_graph_list my_merged_rings_copy = it->second;
     {
         size_t new_size = my_merged_rings_copy.size();
         size_t old_size = original_graph_list.size();
@@ -795,8 +795,8 @@ details::colored_plain_graph_list
     for(auto& graph : my_merged_rings_copy)
     {
         std::stable_sort(graph.begin(), graph.end(), [process_index, process_size]
-                                                        (const details::colored_idx& lhs,
-                                                         const details::colored_idx& rhs)
+                                                        (const detail::colored_idx& lhs,
+                                                         const detail::colored_idx& rhs)
         {
             //size_t right_index = (process_index + 1 ) % process_size;
             //size_t left_index = ( process_index == 0 ?  process_size : process_index - 1);
@@ -807,11 +807,11 @@ details::colored_plain_graph_list
     return my_merged_rings_copy;
 }
 
-ccl::process_device_indices_t
+ccl::process_device_indices_type
         allied_process_group_ring_topology::create_scaleout_devices_in_graphs_for_process(
                                                         size_t process_idx,
                                                         size_t cluster_size,
-                                                        details::global_sorted_plain_graphs& cluster_graphs,
+                                                        detail::global_sorted_plain_graphs& cluster_graphs,
                                                         std::ostream& out)
 {
     size_t left_process_idx = (process_idx == 0
@@ -822,7 +822,7 @@ ccl::process_device_indices_t
         << ", left_process_idx: " << left_process_idx
         << ", right_process_idx: " << right_process_idx << std::endl;
 
-    ccl::process_device_indices_t scaleout_devices;
+    ccl::process_device_indices_type scaleout_devices;
     auto me = cluster_graphs.find(process_idx)->second;
 
     if (process_idx > left_process_idx)
@@ -854,12 +854,12 @@ ccl::process_device_indices_t
     return scaleout_devices;
 }
 
-ccl::process_device_indices_t
+ccl::process_device_indices_type
                 allied_process_group_ring_topology::create_scaleout_devices_in_colored_graphs_for_process(
                                         size_t process_idx,
                                         size_t cluster_size,
-                                        details::global_sorted_colored_plain_graphs& cluster_graphs,
-                                        details::global_sorted_colored_plain_graphs& initial_cluster_graphs,
+                                        detail::global_sorted_colored_plain_graphs& cluster_graphs,
+                                        detail::global_sorted_colored_plain_graphs& initial_cluster_graphs,
                                         std::ostream& out)
 
 {
@@ -875,7 +875,7 @@ ccl::process_device_indices_t
         << ", left_process_idx: " << left_process_idx.second
         << ", right_process_idx: " << right_process_idx.second << std::endl;
 
-    ccl::process_device_indices_t scaleout_devices;
+    ccl::process_device_indices_type scaleout_devices;
     // process corner cases
     if(left_process_idx == right_process_idx)
     {
@@ -899,9 +899,9 @@ ccl::process_device_indices_t
     auto& me = cluster_graphs.find(process_idx)->second;
 
     std::unique_ptr<size_t> color_to_find(new size_t);
-    auto find_in_list_by_color = [&color_to_find](const details::colored_plain_graph& graph) -> bool
+    auto find_in_list_by_color = [&color_to_find](const detail::colored_plain_graph& graph) -> bool
     {
-        auto it = std::find_if(graph.begin(), graph.end(), [&color_to_find](const details::colored_idx& idx)
+        auto it = std::find_if(graph.begin(), graph.end(), [&color_to_find](const detail::colored_idx& idx)
         {
             return (idx.color == *color_to_find);
         });
@@ -985,12 +985,12 @@ ccl::process_device_indices_t
     return scaleout_devices;
 }
 
-ccl::process_device_indices_t
+ccl::process_device_indices_type
                 allied_process_group_ring_topology::create_ipc_devices_in_colored_graphs_for_process(
                                         size_t process_idx,
                                         size_t cluster_size,
-                                        details::global_sorted_colored_plain_graphs& cluster_graphs,
-                                        details::global_sorted_colored_plain_graphs& initial_cluster_graphs,
+                                        detail::global_sorted_colored_plain_graphs& cluster_graphs,
+                                        detail::global_sorted_colored_plain_graphs& initial_cluster_graphs,
                                         std::ostream& out)
 {
     (void)initial_cluster_graphs;
@@ -1007,7 +1007,7 @@ ccl::process_device_indices_t
         << ", left_process_idx: " << left_process_idx.second
         << ", right_process_idx: " << right_process_idx.second << std::endl;
 
-    ccl::process_device_indices_t ipc_devices;
+    ccl::process_device_indices_type ipc_devices;
     // process corner cases
     if(left_process_idx == right_process_idx)
     {
@@ -1031,14 +1031,14 @@ ccl::process_device_indices_t
     auto& me = cluster_graphs.find(process_idx)->second;
 
     std::unique_ptr<size_t> color_to_find(new size_t);
-    std::vector<details::colored_idx> devices_to_remember;
+    std::vector<detail::colored_idx> devices_to_remember;
 
     //TODO limitation: all graphs ipc devices would be merged into one vector
     auto filter_list_by_color =
-    [&color_to_find, &devices_to_remember] (const details::colored_plain_graph& graph) -> void
+    [&color_to_find, &devices_to_remember] (const detail::colored_plain_graph& graph) -> void
     {
         std::copy_if(graph.begin(), graph.end(), std::back_inserter(devices_to_remember),
-                     [&color_to_find](const details::colored_idx& idx)
+                     [&color_to_find](const detail::colored_idx& idx)
         {
             return (idx.color == *color_to_find);
         });
@@ -1090,8 +1090,8 @@ ccl::process_device_indices_t
 }
 
 bool allied_process_group_ring_topology::build_specific(std::ostream& out,
-                                                        const ccl::process_device_indices_t& per_thread_device_indices,
-                                                        const details::plain_graph& id_ring)
+                                                        const ccl::process_device_indices_type& per_thread_device_indices,
+                                                        const detail::plain_graph& id_ring)
 {
     constexpr ccl::group_split_type topology_type = ccl::group_split_type::cluster;
 
@@ -1104,8 +1104,8 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
     // id_ring - inter-thread ring
     out << "\nStart indexer:" << std::endl;
     auto& ctx_per_thread_data = context.process_device_topology;
-    details::id_thread_table assigned_ids;
-    std::vector<details::marked_idx> marked_id_ring = details::create_marked(id_ring);
+    detail::id_thread_table assigned_ids;
+    std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(id_ring);
     for (auto per_thread_it = ctx_per_thread_data.begin(); per_thread_it != ctx_per_thread_data.end();
          ++per_thread_it)
     {
@@ -1118,7 +1118,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                                                     devices.thread_gpu_comms.find(thread_id)->second;
 
         auto rank_builder =
-                    create_device_functor<details::graph_ring_indexer_ext<topology_type>>(marked_id_ring,
+                    create_device_functor<detail::graph_ring_indexer_ext<topology_type>>(marked_id_ring,
                                                                                           assigned_ids,
                                                                                           thread_id,
                                                                                           out_indexed_devices,
@@ -1127,7 +1127,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                                                                                           device_cluster_size);
         ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
 
-        details::printer<topology_type> p;
+        detail::printer<topology_type> p;
         ccl_tuple_for_each(*non_indexed_plain_devices, p);
         out << "Indexer result for devices in thread idx ("
             << thread_id << "/" << ctx_per_thread_data.size() << "):\n"
@@ -1135,8 +1135,8 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
     }
 
     //allocate IPC devices pool with rank from unassigned IDs
-    details::ipc_devices_pool ipc_comms =
-                    details::create_ipc_gpu_comms<topology_type>(assigned_ids, id_ring, devices,
+    detail::ipc_devices_pool ipc_comms =
+                    detail::create_ipc_gpu_comms<topology_type>(assigned_ids, id_ring, devices,
                                                                  device_cluster_size,
                                                                  device_cluster_rank_offset);
     out << "Created IPC devices: " << ipc_comms.size() << ", for cluster_size: " << device_cluster_size
@@ -1153,8 +1153,8 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
         auto& indexed_devices_for_current_thread =
                     context.get_process_topology<topology_type>(process_index,
                                                                 current_thread_idx)->get_device_storage();
-        const auto& curr_real = details::get_device_with_min_rank<ccl_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
-        const auto& curr_virt = details::get_device_with_min_rank<ccl_virtual_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
+        const auto& curr_real = detail::get_device_with_min_rank<ccl_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
+        const auto& curr_virt = detail::get_device_with_min_rank<ccl_virtual_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
 
         size_t tg_max_rank = std::max({std::get<0>(curr_real), std::get<0>(curr_virt)});
 
@@ -1181,8 +1181,8 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
             auto& next_thread_ring_topology =
                         context.get_process_topology<topology_type>(process_index,
                                                                     next_thread_id)->get_device_storage();
-            const auto& real = details::get_device_with_max_rank<ccl_gpu_comm, topology_type>(next_thread_ring_topology, id_ring);
-            const auto& virt = details::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(next_thread_ring_topology, id_ring);
+            const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, topology_type>(next_thread_ring_topology, id_ring);
+            const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(next_thread_ring_topology, id_ring);
 
             if (next_rank != std::min({std::get<0>(real), std::get<0>(virt)}))
             {
@@ -1198,7 +1198,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
             if (next_rank == std::get<0>(real))
             {
                 auto locker =
-                    details::add_concurrent_locker_device<ccl_gpu_comm, topology_type>(next_rank,
+                    detail::add_concurrent_locker_device<ccl_gpu_comm, topology_type>(next_rank,
                                                                                        0,
                                                                                        real,
                                                                                        devices,indexed_devices_for_current_thread);
@@ -1209,7 +1209,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
             else if (next_rank == std::get<0>(virt))
             {
                 auto locker =
-                    details::add_concurrent_locker_device<ccl_virtual_gpu_comm, topology_type>(next_rank,
+                    detail::add_concurrent_locker_device<ccl_virtual_gpu_comm, topology_type>(next_rank,
                                                                                                0,
                                                                                                virt,
                                                                                                devices,indexed_devices_for_current_thread);
@@ -1250,8 +1250,8 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
         //upgrade left gpu device to IPC SOURCE type
         if (!ipc_comms.empty()/*has another IPC Device*/ and current_thread_idx == 0 /* left comm is IPC comm for last process*/ )
         {
-            const auto& real = details::get_device_with_max_rank<ccl_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
-            const auto& virt = details::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
+            const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
+            const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
 
             size_t left_ipc_source_rank = std::min({std::get<0>(real), std::get<0>(virt)});
             out << "Upgrade thread id: " << current_thread_idx
@@ -1261,7 +1261,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
             if(left_ipc_source_rank == std::get<0>(real))
             {
                 auto locker =
-                            details::add_ipc_source_locker_device<ccl_gpu_comm,
+                            detail::add_ipc_source_locker_device<ccl_gpu_comm,
                                                                   topology_type>(next_rank,
                                                                                  0,
                                                                                  real,
@@ -1273,7 +1273,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
             else if (left_ipc_source_rank == std::get<0>(virt))
             {
                 auto locker =
-                            details::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
+                            detail::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
                                                                   topology_type>(next_rank,
                                                                                  0,
                                                                                  virt,
@@ -1288,9 +1288,9 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
 }
 
 bool allied_process_group_ring_topology::build_specific_colored(std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const ccl::process_device_indices_t& ipc_device_indices,
-                        details::colored_plain_graph& id_ring,
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const ccl::process_device_indices_type& ipc_device_indices,
+                        detail::colored_plain_graph& id_ring,
                         const std::map<size_t, size_t>& process_device_rank_offset)
 {
     //continuous ring, without scale-up devices
@@ -1299,7 +1299,7 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
     constexpr ccl::group_split_type topology_type = ccl::group_split_type::cluster;
 
     out << "Start building topology: " << ::to_string(topology_type) << ", for colored graph:\n"
-        << details::to_string(id_ring) << std::endl;
+        << detail::to_string(id_ring) << std::endl;
 
     // id_ring - inter-thread ring
     out << "\nStart indexer:" << std::endl;
@@ -1326,8 +1326,8 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
                                                     devices.thread_gpu_comms.find(thread_id)->second;
 
         //allocate IPC devices pool(if needed)
-        details::cluster_ipc_devices_pool ipc_comms =
-                    details::create_filtered_ipc_destination_gpu_comms<topology_type>(
+        detail::cluster_ipc_devices_pool ipc_comms =
+                    detail::create_filtered_ipc_destination_gpu_comms<topology_type>(
                                             id_ring,
                                             ipc_device_indices,
                                             process_index,
@@ -1336,7 +1336,7 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
                                             *non_indexed_plain_devices);
 
         auto rank_builder =
-                    create_device_functor<details::smart_ring_indexer<topology_type>>(
+                    create_device_functor<detail::smart_ring_indexer<topology_type>>(
                                             id_ring,
                                             process_index,
                                             process_count,
@@ -1344,11 +1344,11 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
                                             devices,
                                             out_indexed_devices,
                                             ipc_device_indices,
-                                            ccl::process_device_indices_t{});
+                                            ccl::process_device_indices_type{});
         //start indexer
         ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
 
-        details::printer<topology_type> p;
+        detail::printer<topology_type> p;
         ccl_tuple_for_each(*non_indexed_plain_devices, p);
         out << "Indexer result for devices in thread idx ("
             << thread_id << "/" << ctx_per_thread_data.size() << "):\n"
@@ -1363,10 +1363,10 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
                     context.get_process_topology<topology_type>(process_index,
                                                                 current_thread_idx)->get_device_storage();
         const auto& curr_real =
-                    details::get_device_with_min_rank<ccl_gpu_comm, topology_type>(
+                    detail::get_device_with_min_rank<ccl_gpu_comm, topology_type>(
                                         indexed_devices_for_current_thread, id_ring);
         const auto& curr_virt =
-                    details::get_device_with_min_rank<ccl_virtual_gpu_comm, topology_type>(
+                    detail::get_device_with_min_rank<ccl_virtual_gpu_comm, topology_type>(
                                         indexed_devices_for_current_thread, id_ring);
 
         size_t tg_max_rank = std::max({std::get<0>(curr_real), std::get<0>(curr_virt)});
@@ -1395,10 +1395,10 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
                         context.get_process_topology<topology_type>(process_index,
                                                                     next_thread_id)->get_device_storage();
             const auto& real =
-                    details::get_device_with_max_rank<ccl_gpu_comm, topology_type>(
+                    detail::get_device_with_max_rank<ccl_gpu_comm, topology_type>(
                                         next_thread_ring_topology, id_ring);
             const auto& virt =
-                    details::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(
+                    detail::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(
                                         next_thread_ring_topology, id_ring);
 
             if (next_rank != std::min({std::get<0>(real), std::get<0>(virt)}))
@@ -1415,7 +1415,7 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
             if (next_rank == std::get<0>(real))
             {
                 auto locker =
-                    details::add_concurrent_locker_device<ccl_gpu_comm, topology_type>(next_rank,
+                    detail::add_concurrent_locker_device<ccl_gpu_comm, topology_type>(next_rank,
                                                                                        0,
                                                                                        real,
                                                                                        devices,indexed_devices_for_current_thread);
@@ -1426,7 +1426,7 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
             else if (next_rank == std::get<0>(virt))
             {
                 auto locker =
-                    details::add_concurrent_locker_device<ccl_virtual_gpu_comm, topology_type>(next_rank,
+                    detail::add_concurrent_locker_device<ccl_virtual_gpu_comm, topology_type>(next_rank,
                                                                                                0,
                                                                                                virt,
                                                                                                devices,indexed_devices_for_current_thread);
@@ -1480,8 +1480,8 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
             //upgrade left gpu device to IPC SOURCE type
             if (!ipc_comms.empty()/ *has another IPC Device* / and current_thread_idx == 0 / * left comm is IPC comm for last process* / )
             {
-                const auto& real = details::get_device_with_max_rank<ccl_gpu_comm, topology_type>(*indexed_devices_for_current_thread, id_ring);
-                const auto& virt = details::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(*indexed_devices_for_current_thread, id_ring);
+                const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, topology_type>(*indexed_devices_for_current_thread, id_ring);
+                const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(*indexed_devices_for_current_thread, id_ring);
                 size_t left_ipc_source_rank = std::min({std::get<0>(real), std::get<0>(virt)});
                 out << "Upgrade thread id: " << current_thread_idx
                     << " GPU by rank: " << left_ipc_source_rank
@@ -1489,7 +1489,7 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
                 if(left_ipc_source_rank == std::get<0>(real))
                 {
                     auto locker =
-                            details::add_ipc_source_locker_device<ccl_gpu_comm,
+                            detail::add_ipc_source_locker_device<ccl_gpu_comm,
                                                                   topology_type>(next_rank,
                                                                                  0,
                                                                                  real,
@@ -1502,7 +1502,7 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
                 else if (left_ipc_source_rank == std::get<0>(virt))
                 {
                     auto locker =
-                            details::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
+                            detail::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
                                                                   topology_type>(next_rank,
                                                                                  0,
                                                                                  virt,
@@ -1520,8 +1520,8 @@ bool allied_process_group_ring_topology::build_specific_colored(std::ostream& ou
 }
 
 bool allied_process_group_ring_topology::build_specific(std::ostream& out,
-                                                        const ccl::process_device_indices_t& per_thread_device_indices,
-                                                        const details::plain_graph_list& graph_list)
+                                                        const ccl::process_device_indices_type& per_thread_device_indices,
+                                                        const detail::plain_graph_list& graph_list)
 {
      constexpr ccl::group_split_type topology_type =
                                         ccl::group_split_type::process_group_torn_apart_ring;
@@ -1563,7 +1563,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                     continue;
                 }
 
-                auto scale_virt = details::add_numa_proxy_device<ccl_virtual_gpu_comm, topology_type>(
+                auto scale_virt = detail::add_numa_proxy_device<ccl_virtual_gpu_comm, topology_type>(
                                                                         *non_indexed_plain_devices,
                                                                         last_in_graph_index,
                                                                         context,
@@ -1576,7 +1576,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                 }
                 else
                 {
-                    auto scale_real = details::add_numa_proxy_device<ccl_gpu_comm, topology_type>(
+                    auto scale_real = detail::add_numa_proxy_device<ccl_gpu_comm, topology_type>(
                                                                         *non_indexed_plain_devices,
                                                                         last_in_graph_index,
                                                                         context,
@@ -1602,15 +1602,15 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
 
     // id_ring - inter-thread ring
     out << "\nStart indexer:" << std::endl;
-    details::ipc_devices_pool ipc_comms;
+    detail::ipc_devices_pool ipc_comms;
     size_t accumulated_index_offset_for_graph = 0;
     size_t graph_num = 0;
     std::map<size_t/*graph_num*/, size_t /*offset*/> index_offset_for_graphs;
     for (const auto& id_ring : graph_list)
     {
-        details::id_thread_table assigned_ids;  //device_id -> thread_id
+        detail::id_thread_table assigned_ids;  //device_id -> thread_id
 
-        std::vector<details::marked_idx> marked_id_ring = details::create_marked(id_ring);  // marked graph
+        std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(id_ring);  // marked graph
 
         size_t index_offset = accumulated_index_offset_for_graph;
         for (auto per_thread_it = ctx_per_thread_data.begin(); per_thread_it != ctx_per_thread_data.end();
@@ -1627,7 +1627,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
 
             // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
             auto rank_builder =
-                    create_device_functor<details::graph_ring_indexer_unique_index_ext<topology_type>>(marked_id_ring,
+                    create_device_functor<detail::graph_ring_indexer_unique_index_ext<topology_type>>(marked_id_ring,
                                                                                       assigned_ids,
                                                                                       thread_id,
                                                                                       out_indexed_devices,
@@ -1638,7 +1638,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
 
             ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
 
-            details::printer<topology_type> p;
+            detail::printer<topology_type> p;
             ccl_tuple_for_each(out_indexed_devices, p);
             out << "Indexer result for devices in thread idx ("
                 << thread_id << "/" << ctx_per_thread_data.size() << "):\n"
@@ -1655,8 +1655,8 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
             << graph_num << std::endl;
 
         //allocate IPC devices pool with rank from unassigned IDs
-        details::ipc_devices_pool tmp_ipc_comms =
-                        details::create_ipc_gpu_comms<topology_type>(assigned_ids, id_ring, devices,
+        detail::ipc_devices_pool tmp_ipc_comms =
+                        detail::create_ipc_gpu_comms<topology_type>(assigned_ids, id_ring, devices,
                                                                      device_cluster_size,
                                                                      device_cluster_rank_offset);
         out << "Created Tmp IPC devices: " << tmp_ipc_comms.size()
@@ -1684,16 +1684,16 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                     context.get_process_topology<topology_type>(process_index,
                                                                 current_thread_idx)->get_device_storage();
             const auto& curr_real =
-                    details::get_device_with_min_rank<ccl_gpu_comm, topology_type>(
+                    detail::get_device_with_min_rank<ccl_gpu_comm, topology_type>(
                                                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_virt =
-                    details::get_device_with_min_rank<ccl_virtual_gpu_comm, topology_type>(
+                    detail::get_device_with_min_rank<ccl_virtual_gpu_comm, topology_type>(
                                                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_scale_real =
-                    details::get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, topology_type>(
+                    detail::get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, topology_type>(
                                                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_scale_virt =
-                    details::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, topology_type>(
+                    detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, topology_type>(
                                                     indexed_devices_for_current_thread, id_ring);
 
             size_t tg_max_rank = std::max({std::get<0>(curr_real), std::get<0>(curr_virt),
@@ -1722,16 +1722,16 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                         context.get_process_topology<topology_type>(process_index,
                                                                     next_thread_id)->get_device_storage();
                 const auto& real =
-                        details::get_device_with_max_rank<ccl_gpu_comm, topology_type>(
+                        detail::get_device_with_max_rank<ccl_gpu_comm, topology_type>(
                                                             next_thread_ring_topology, id_ring);
                 const auto& virt =
-                        details::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(
+                        detail::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(
                                                             next_thread_ring_topology, id_ring);
                 const auto& scale_real =
-                        details::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>, topology_type>(
+                        detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>, topology_type>(
                                                             next_thread_ring_topology, id_ring);
                 const auto& scale_virt =
-                        details::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, topology_type>(
+                        detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, topology_type>(
                                                             next_thread_ring_topology, id_ring);
                 if (next_rank != std::min({std::get<0>(real), std::get<0>(virt),
                                            std::get<0>(scale_real), std::get<0>(scale_virt)}))
@@ -1751,7 +1751,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                 if (next_rank == std::get<0>(real))
                 {
                     auto locker =
-                        details::add_concurrent_locker_device<ccl_gpu_comm, topology_type>(next_rank,
+                        detail::add_concurrent_locker_device<ccl_gpu_comm, topology_type>(next_rank,
                                                                                        0,
                                                                                        real,
                                                                                        devices,indexed_devices_for_current_thread);
@@ -1762,7 +1762,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                 else if (next_rank == std::get<0>(virt))
                 {
                     auto locker =
-                        details::add_concurrent_locker_device<ccl_virtual_gpu_comm, topology_type>(next_rank,
+                        detail::add_concurrent_locker_device<ccl_virtual_gpu_comm, topology_type>(next_rank,
                                                                                                0,
                                                                                                virt,
                                                                                                devices,indexed_devices_for_current_thread);
@@ -1813,8 +1813,8 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
             //upgrade left gpu device to IPC SOURCE type
             if (!ipc_comms.empty() /*has another IPC Device*/ and current_thread_idx == 0 /* left comm is IPC comm for last process*/ )
             {
-                const auto& real = details::get_device_with_max_rank<ccl_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
-                const auto& virt = details::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
+                const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
+                const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
 
                 size_t left_ipc_source_rank = std::min({std::get<0>(real), std::get<0>(virt)});
                 out << "Upgrade thread id: " << current_thread_idx
@@ -1824,7 +1824,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                 if(left_ipc_source_rank == std::get<0>(real))
                 {
                     auto locker =
-                                details::add_ipc_source_locker_device<ccl_gpu_comm,
+                                detail::add_ipc_source_locker_device<ccl_gpu_comm,
                                                                     topology_type>(next_rank,
                                                                                    0,
                                                                                    real,
@@ -1836,7 +1836,7 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
                 else if (left_ipc_source_rank == std::get<0>(virt))
                 {
                     auto locker =
-                                details::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
+                                detail::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
                                                                   topology_type>(next_rank,
                                                                                  0,
                                                                                  virt,
@@ -1853,9 +1853,9 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
 }
 
 bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const ccl::process_device_indices_t& ipc_device_indices,
-                        details::colored_plain_graph_list& graph_list,
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const ccl::process_device_indices_type& ipc_device_indices,
+                        detail::colored_plain_graph_list& graph_list,
                         const std::map<size_t, size_t>& process_device_rank_offset)
 {
     constexpr ccl::group_split_type topology_type =
@@ -1863,7 +1863,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
 
     out << "Start building topology: " << ::to_string(topology_type)
         << ", for colored graphs: " << graph_list.size() << "\n";
-    out << details::to_string(graph_list) << std::endl;
+    out << detail::to_string(graph_list) << std::endl;
 
     auto& ctx_per_thread_data = context.process_device_topology;
     out << "\nStart gpu comm transformation scale-up for graph list count: "
@@ -1879,7 +1879,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
             std::shared_ptr<specific_plain_device_storage> non_indexed_plain_devices =
                                                 devices.thread_gpu_comms.find(thread_id)->second;
             // create device comm wrappers and upgrade last devices in list up to scale_up_proxy type
-            details::color_t process;
+            detail::color_t process;
             ccl::device_index_type last_in_graph_index;
             auto tmp = *id_ring.rbegin();
             process = tmp.color;
@@ -1895,7 +1895,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                     continue;
                 }
 
-                auto scale_virt = details::add_numa_proxy_device<ccl_virtual_gpu_comm, topology_type>(
+                auto scale_virt = detail::add_numa_proxy_device<ccl_virtual_gpu_comm, topology_type>(
                                                                         *non_indexed_plain_devices,
                                                                         last_in_graph_index,
                                                                         context,
@@ -1908,7 +1908,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                 }
                 else
                 {
-                    auto scale_real = details::add_numa_proxy_device<ccl_gpu_comm, topology_type>(
+                    auto scale_real = detail::add_numa_proxy_device<ccl_gpu_comm, topology_type>(
                                                                         *non_indexed_plain_devices,
                                                                         last_in_graph_index,
                                                                         context,
@@ -1965,8 +1965,8 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                                                     devices.thread_gpu_comms.find(thread_id)->second;
 
             //allocate IPC devices pool(if needed)
-            details::cluster_ipc_devices_pool ipc_comms =
-                    details::create_filtered_ipc_destination_gpu_comms<topology_type>(
+            detail::cluster_ipc_devices_pool ipc_comms =
+                    detail::create_filtered_ipc_destination_gpu_comms<topology_type>(
                                             id_ring,
                                             ipc_device_indices,
                                             process_index,
@@ -1975,7 +1975,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                                             *non_indexed_plain_devices);
 
             auto rank_builder =
-                    create_device_functor<details::smart_ring_indexer<topology_type>>(
+                    create_device_functor<detail::smart_ring_indexer<topology_type>>(
                                             id_ring,
                                             process_index,
                                             process_count,
@@ -1983,11 +1983,11 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                                             devices,
                                             out_indexed_devices,
                                             ipc_device_indices,
-                                            ccl::process_device_indices_t{});
+                                            ccl::process_device_indices_type{});
 
             // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
            /* auto rank_builder =
-                    create_device_functor<details::colored_graph_ring_indexer<topology_type>>(id_ring,
+                    create_device_functor<detail::colored_graph_ring_indexer<topology_type>>(id_ring,
                                                                                       thread_id,
                                                                                       process_index,
                                                                                       out_indexed_devices,
@@ -1998,7 +1998,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
 */
             ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
 
-            details::printer<topology_type> p;
+            detail::printer<topology_type> p;
             ccl_tuple_for_each(out_indexed_devices, p);
             out << "Indexer result for devices in thread idx ("
                 << thread_id << "/" << ctx_per_thread_data.size() << "):\n"
@@ -2013,8 +2013,8 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
     }
 
     //allocate IPC devices pool with rank from unassigned IDs
-    details::cluster_ipc_devices_pool ipc_comms =
-                    details::create_ipc_gpu_comms<topology_type>(graph_list, process_index, devices,
+    detail::cluster_ipc_devices_pool ipc_comms =
+                    detail::create_ipc_gpu_comms<topology_type>(graph_list, process_index, devices,
                                                                  device_cluster_size,
                                                                  device_cluster_rank_offset);
     out << "Created IPC devices for processes: " << ipc_comms.size() << ", for cluster_size: " << device_cluster_size
@@ -2041,16 +2041,16 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                     context.get_process_topology<topology_type>(process_index,
                                                                 current_thread_idx)->get_device_storage();
             const auto& curr_real =
-                    details::get_device_with_min_rank<ccl_gpu_comm, topology_type>(
+                    detail::get_device_with_min_rank<ccl_gpu_comm, topology_type>(
                                                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_virt =
-                    details::get_device_with_min_rank<ccl_virtual_gpu_comm, topology_type>(
+                    detail::get_device_with_min_rank<ccl_virtual_gpu_comm, topology_type>(
                                                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_scale_real =
-                    details::get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, topology_type>(
+                    detail::get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, topology_type>(
                                                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_scale_virt =
-                    details::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, topology_type>(
+                    detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, topology_type>(
                                                     indexed_devices_for_current_thread, id_ring);
 
             size_t tg_max_rank = std::max({std::get<0>(curr_real), std::get<0>(curr_virt),
@@ -2079,16 +2079,16 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                         context.get_process_topology<topology_type>(process_index,
                                                                     next_thread_id)->get_device_storage();
                 const auto& real =
-                        details::get_device_with_max_rank<ccl_gpu_comm, topology_type>(
+                        detail::get_device_with_max_rank<ccl_gpu_comm, topology_type>(
                                                             next_thread_ring_topology, id_ring);
                 const auto& virt =
-                        details::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(
+                        detail::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(
                                                             next_thread_ring_topology, id_ring);
                 const auto& scale_real =
-                        details::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>, topology_type>(
+                        detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>, topology_type>(
                                                             next_thread_ring_topology, id_ring);
                 const auto& scale_virt =
-                        details::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, topology_type>(
+                        detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>, topology_type>(
                                                             next_thread_ring_topology, id_ring);
                 if (next_rank != std::min({std::get<0>(real), std::get<0>(virt),
                                            std::get<0>(scale_real), std::get<0>(scale_virt)}))
@@ -2108,7 +2108,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                 if (next_rank == std::get<0>(real))
                 {
                     auto locker =
-                        details::add_concurrent_locker_device<ccl_gpu_comm, topology_type>(next_rank,
+                        detail::add_concurrent_locker_device<ccl_gpu_comm, topology_type>(next_rank,
                                                                                        0,
                                                                                        real,
                                                                                        devices,indexed_devices_for_current_thread);
@@ -2119,7 +2119,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                 else if (next_rank == std::get<0>(virt))
                 {
                     auto locker =
-                        details::add_concurrent_locker_device<ccl_virtual_gpu_comm, topology_type>(next_rank,
+                        detail::add_concurrent_locker_device<ccl_virtual_gpu_comm, topology_type>(next_rank,
                                                                                                0,
                                                                                                virt,
                                                                                                devices,indexed_devices_for_current_thread);
@@ -2181,8 +2181,8 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                 //upgrade left gpu device to IPC SOURCE type
                 if ( current_thread_idx == 0 /* left comm is IPC comm for last process*/ )
                 {
-                    const auto& real = details::get_device_with_max_rank<ccl_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
-                    const auto& virt = details::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
+                    const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
+                    const auto& virt = detail::get_device_with_max_rank<ccl_virtual_gpu_comm, topology_type>(indexed_devices_for_current_thread, id_ring);
 
                     size_t left_ipc_source_rank = std::min({std::get<0>(real), std::get<0>(virt)});
                     out << "Upgrade thread id: " << current_thread_idx
@@ -2192,7 +2192,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                     if(left_ipc_source_rank == std::get<0>(real))
                     {
                         auto locker =
-                                    details::add_ipc_source_locker_device<ccl_gpu_comm,
+                                    detail::add_ipc_source_locker_device<ccl_gpu_comm,
                                                                         topology_type>(next_rank,
                                                                                    0,
                                                                                    real,
@@ -2204,7 +2204,7 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
                     else if (left_ipc_source_rank == std::get<0>(virt))
                     {
                         auto locker =
-                                details::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
+                                detail::add_ipc_source_locker_device<ccl_virtual_gpu_comm,
                                                                   topology_type>(next_rank,
                                                                                  0,
                                                                                  virt,
@@ -2222,9 +2222,9 @@ bool allied_process_group_ring_topology::build_specific_scale_up(std::ostream& o
 }
 
 bool allied_process_group_ring_topology::build_specific(std::ostream& out,
-                                                        const ccl::process_device_indices_t& per_thread_device_indices,
-                                                        const ccl::device_indices_t& scaleout_device_indices,
-                                                        const details::plain_graph_list& graph_list)
+                                                        const ccl::process_device_indices_type& per_thread_device_indices,
+                                                        const ccl::device_indices_type& scaleout_device_indices,
+                                                        const detail::plain_graph_list& graph_list)
 {
     out << "TODO: Not implemented";
     return false;
@@ -2232,26 +2232,26 @@ bool allied_process_group_ring_topology::build_specific(std::ostream& out,
 
 bool allied_process_group_ring_topology::build_specific_scale_up_out(
                         std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const ccl::process_device_indices_t& scaleout_device_indices,
-                        const ccl::process_device_indices_t& ipc_device_indices,
-                        details::colored_plain_graph_list& graph_list,
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const ccl::process_device_indices_type& scaleout_device_indices,
+                        const ccl::process_device_indices_type& ipc_device_indices,
+                        detail::colored_plain_graph_list& graph_list,
                         const std::map<size_t, size_t>& process_device_rank_offset)
 {
     out << "TODO: Not implemented";
     return false;
 }
-details::global_sorted_plain_graphs
+detail::global_sorted_plain_graphs
         allied_process_group_ring_topology::global_graph_list_resolver(
-                                const details::adjacency_matrix& matrix,
-                                const ccl::process_device_indices_t& per_process_device_indexes,
-                                const ccl::process_device_indices_t& foreign_processes_device_indexes,
-                                details::p2p_rating_function ping)
+                                const detail::adjacency_matrix& matrix,
+                                const ccl::process_device_indices_type& per_process_device_indexes,
+                                const ccl::process_device_indices_type& foreign_processes_device_indexes,
+                                detail::p2p_rating_function ping)
 {
-    details::global_sorted_plain_graphs global_graph_list;
+    detail::global_sorted_plain_graphs global_graph_list;
 
     {
-        details::plain_graph_list my_process_list = details::graph_list_resolver(matrix,
+        detail::plain_graph_list my_process_list = detail::graph_list_resolver(matrix,
                                                                                  per_process_device_indexes,
                                                                                  ping);
         global_graph_list.emplace(process_index, std::move(my_process_list));
diff --git a/src/common/comm/l0/topology/ring/process_group_ring_creator.hpp b/src/common/comm/l0/topology/ring/process_group_ring_creator.hpp
index dbb788380..ecb648471 100644
--- a/src/common/comm/l0/topology/ring/process_group_ring_creator.hpp
+++ b/src/common/comm/l0/topology/ring/process_group_ring_creator.hpp
@@ -54,126 +54,126 @@ class allied_process_group_ring_topology
                                         const ccl::cluster_aggregated_device_mask_t& cluster_affinity_mask);
 
     static size_t default_property_p2p_rating_calculator(const ccl_device &lhs, const ccl_device &rhs);
-    static details::adjacency_matrix build_p2p_capability_matrix(std::ostream& out,
+    static detail::adjacency_matrix build_p2p_capability_matrix(std::ostream& out,
                                                           const ccl::process_aggregated_device_mask_t &node_device_masks,
-                                                          details::p2p_rating_function ping =
+                                                          detail::p2p_rating_function ping =
                                                                         default_property_p2p_rating_calculator);
-    static details::adjacency_matrix build_p2p_capability_matrix(std::ostream& out,
-                                                          const ccl::process_device_indices_t& node_device_indices,
-                                                           details::p2p_rating_function ping =
+    static detail::adjacency_matrix build_p2p_capability_matrix(std::ostream& out,
+                                                          const ccl::process_device_indices_type& node_device_indices,
+                                                           detail::p2p_rating_function ping =
                                                                         default_property_p2p_rating_calculator);
     bool build(std::ostream& out,
                const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
                const std::vector<ccl::device_mask_t>& ipc_device_indices,
-               const details::adjacency_matrix& matrix,
-               details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+               const detail::adjacency_matrix& matrix,
+               detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 
     bool build(std::ostream& out,
-               const ccl::process_device_indices_t& per_thread_device_indices,
-               const std::vector<ccl::device_indices_t>& ipc_device_indices,
-               const details::adjacency_matrix& matrix,
-               details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+               const ccl::process_device_indices_type& per_thread_device_indices,
+               const std::vector<ccl::device_indices_type>& ipc_device_indices,
+               const detail::adjacency_matrix& matrix,
+               detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 
     bool build_all(std::ostream& out,
-                   const ccl::process_device_indices_t& per_thread_device_indices,
-                   const std::vector<ccl::device_indices_t>& ipc_device_indices,
-                   const details::adjacency_matrix& matrix,
-                   details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+                   const ccl::process_device_indices_type& per_thread_device_indices,
+                   const std::vector<ccl::device_indices_type>& ipc_device_indices,
+                   const detail::adjacency_matrix& matrix,
+                   detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 private:
     bool build_specific(std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const details::plain_graph& graph);
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const detail::plain_graph& graph);
     bool build_specific(std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const details::plain_graph_list& graph_list);
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const detail::plain_graph_list& graph_list);
     bool build_specific(std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const ccl::device_indices_t& scaleout_device_indices,
-                        const details::plain_graph_list& graph_list);
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const ccl::device_indices_type& scaleout_device_indices,
+                        const detail::plain_graph_list& graph_list);
 
     bool build_specific_colored(std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const ccl::process_device_indices_t& ipc_device_indices,
-                        details::colored_plain_graph& graph,
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const ccl::process_device_indices_type& ipc_device_indices,
+                        detail::colored_plain_graph& graph,
                         const std::map<size_t, size_t>& process_device_rank_offset);
     bool build_specific_scale_up(std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const ccl::process_device_indices_t& ipc_device_indices,
-                        details::colored_plain_graph_list& graph_list,
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const ccl::process_device_indices_type& ipc_device_indices,
+                        detail::colored_plain_graph_list& graph_list,
                         const std::map<size_t, size_t>& process_device_rank_offset);
     bool build_specific_scale_up_out(std::ostream& out,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const ccl::process_device_indices_t& scaleout_device_indices,
-                        const ccl::process_device_indices_t& ipc_device_indices,
-                        details::colored_plain_graph_list& graph_list,
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const ccl::process_device_indices_type& scaleout_device_indices,
+                        const ccl::process_device_indices_type& ipc_device_indices,
+                        detail::colored_plain_graph_list& graph_list,
                         const std::map<size_t, size_t>& process_device_rank_offset);
 
-    details::plain_graph_list
+    detail::plain_graph_list
             create_my_process_graphs(std::ostream& out,
-                                     const ccl::process_device_indices_t& per_thread_device_indices,
-                                     const details::adjacency_matrix& matrix,
-                                     details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+                                     const ccl::process_device_indices_type& per_thread_device_indices,
+                                     const detail::adjacency_matrix& matrix,
+                                     detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 
-    details::global_sorted_plain_graphs collect_cluster_plain_graphs(std::ostream& out,
+    detail::global_sorted_plain_graphs collect_cluster_plain_graphs(std::ostream& out,
                                                                   std::shared_ptr<ccl::communicator> comm,
                                                                   size_t process_index,
-                                                                  const details::plain_graph_list& my_process_graph);
-    details::global_sorted_colored_plain_graphs
+                                                                  const detail::plain_graph_list& my_process_graph);
+    detail::global_sorted_colored_plain_graphs
                     collect_cluster_colored_plain_graphs(std::ostream& out,
                                                          std::shared_ptr<ccl::communicator> comm,
                                                          size_t process_index,
-                                                         const details::colored_plain_graph_list& my_process_graph);
+                                                         const detail::colored_plain_graph_list& my_process_graph);
 
-    virtual details::global_plain_graphs merge_allied_nodes_plain_graphs(std::ostream& out,
-                                                                 const ccl::cluster_device_indices_t &cluster_indices,
+    virtual detail::global_plain_graphs merge_allied_nodes_plain_graphs(std::ostream& out,
+                                                                 const ccl::cluster_device_indices_type &cluster_indices,
                                                                  size_t process_index,
-                                                                 const details::global_sorted_plain_graphs& cluster_graphs,
-                                                                 details::p2p_rating_function ping = default_property_p2p_rating_calculator);
-    virtual details::global_colored_plain_graphs
+                                                                 const detail::global_sorted_plain_graphs& cluster_graphs,
+                                                                 detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
+    virtual detail::global_colored_plain_graphs
                     merge_allied_nodes_in_colored_plain_graphs(std::ostream& out,
-                                                               const ccl::cluster_device_indices_t &cluster_indices,
+                                                               const ccl::cluster_device_indices_type &cluster_indices,
                                                                size_t process_index,
                                                                size_t process_count,
-                                                               const details::global_sorted_colored_plain_graphs& cluster_graphs,
-                                                               details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+                                                               const detail::global_sorted_colored_plain_graphs& cluster_graphs,
+                                                               detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 
-    details::plain_graph_list resize_merged_graphs_for_process(size_t process_index,
-                                                               const details::global_plain_graphs& merged_cluster_graphs,
-                                                               const details::plain_graph_list& original_graph_list,
+    detail::plain_graph_list resize_merged_graphs_for_process(size_t process_index,
+                                                               const detail::global_plain_graphs& merged_cluster_graphs,
+                                                               const detail::plain_graph_list& original_graph_list,
                                                                std::ostream& out);
-    details::colored_plain_graph_list
+    detail::colored_plain_graph_list
                     resize_merged_colored_graphs_for_process(
                                         size_t process_index,
                                         size_t process_count,
-                                        const details::global_colored_plain_graphs& merged_cluster_graphs,
-                                        const details::colored_plain_graph_list& original_graph_list,
+                                        const detail::global_colored_plain_graphs& merged_cluster_graphs,
+                                        const detail::colored_plain_graph_list& original_graph_list,
                                         std::ostream& out);
 
-    virtual ccl::process_device_indices_t
+    virtual ccl::process_device_indices_type
                     create_scaleout_devices_in_graphs_for_process(
                                         size_t process_index,
                                         size_t cluster_size,
-                                        details::global_sorted_plain_graphs& cluster_graphs,
+                                        detail::global_sorted_plain_graphs& cluster_graphs,
                                         std::ostream& out);
-    virtual ccl::process_device_indices_t
+    virtual ccl::process_device_indices_type
                     create_scaleout_devices_in_colored_graphs_for_process(
                                         size_t process_index,
                                         size_t cluster_size,
-                                        details::global_sorted_colored_plain_graphs& cluster_graphs,
-                                        details::global_sorted_colored_plain_graphs& initial_cluster_graphs,
+                                        detail::global_sorted_colored_plain_graphs& cluster_graphs,
+                                        detail::global_sorted_colored_plain_graphs& initial_cluster_graphs,
                                         std::ostream& out);
-    virtual ccl::process_device_indices_t
+    virtual ccl::process_device_indices_type
                     create_ipc_devices_in_colored_graphs_for_process(
                                         size_t process_idx,
                                         size_t cluster_size,
-                                        details::global_sorted_colored_plain_graphs& cluster_graphs,
-                                        details::global_sorted_colored_plain_graphs& initial_cluster_graphs,
+                                        detail::global_sorted_colored_plain_graphs& cluster_graphs,
+                                        detail::global_sorted_colored_plain_graphs& initial_cluster_graphs,
                                         std::ostream& out);
 
-    details::global_sorted_plain_graphs global_graph_list_resolver(const details::adjacency_matrix& matrix,
-                                                       const ccl::process_device_indices_t& per_process_device_indexes,
-                                                       const ccl::process_device_indices_t& foreign_processes_device_indexes,
-                                                       details::p2p_rating_function ping);
+    detail::global_sorted_plain_graphs global_graph_list_resolver(const detail::adjacency_matrix& matrix,
+                                                       const ccl::process_device_indices_type& per_process_device_indexes,
+                                                       const ccl::process_device_indices_type& foreign_processes_device_indexes,
+                                                       detail::p2p_rating_function ping);
 };
 }
 #endif
diff --git a/src/common/comm/l0/topology/ring/ring_construction_utils.hpp b/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
index 834e9ae30..8f624645a 100644
--- a/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
+++ b/src/common/comm/l0/topology/ring/ring_construction_utils.hpp
@@ -18,7 +18,7 @@
 #include <memory>
 
 #include "common/comm/l0/topology/topology_construction_utils.hpp"
-#include "oneapi/ccl/ccl_config.h"
+#include "oneapi/ccl/config.h"
 
 #include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
 #include "common/comm/l0/devices/devices_declaration.hpp"
@@ -37,7 +37,7 @@
 
 namespace native {
 
-namespace details {
+namespace detail {
 /*REFACTORING*/
 template <class device_t,
           ccl::group_split_type group_id,
@@ -138,7 +138,7 @@ struct graph_ring_indexer {
                 throw std::logic_error(std::string("Unknown device in id ring vector: ") +
                                        ccl::to_string(id));
             }
-            size_t rank = std::distance(id_array.begin(), it);
+            int rank = std::distance(id_array.begin(), it);
 
             size_t already_assigned_ids_count = assigned_ids.count(id);
             //rank += already_assigned_ids_count; TODO
@@ -211,7 +211,7 @@ struct colored_graph_ring_indexer {
             }
 
             //rank in local graph_ring
-            size_t rank = std::distance(id_array.begin(), it);
+            int rank = std::distance(id_array.begin(), it);
             size_t size = id_array.size();
 
             //apply offsets
@@ -242,12 +242,12 @@ struct colored_graph_ring_indexer {
 };
 
 static constexpr color_t marked_color = std::numeric_limits<color_t>::max();
-inline void separate_ipc_devices(const ccl::process_device_indices_t& ipc_indices,
+inline void separate_ipc_devices(const ccl::process_device_indices_type& ipc_indices,
                                  size_t process_idx,
                                  size_t process_num,
                                  const colored_plain_graph& id_array,
-                                 ccl::process_device_indices_t& ipc_src_indices,
-                                 ccl::process_device_indices_t& ipc_dst_indices,
+                                 ccl::process_device_indices_type& ipc_src_indices,
+                                 ccl::process_device_indices_type& ipc_dst_indices,
                                  color_t exclude_color = marked_color) {
     // find right ipcs
     do {
@@ -348,8 +348,8 @@ struct smart_ring_indexer {
                        size_t process_device_rank_offset,
                        device_storage& device_factory,
                        specific_indexed_device_storage& device_topology,
-                       const ccl::process_device_indices_t& ipc_device,
-                       const ccl::process_device_indices_t& scaleout_device_indices)
+                       const ccl::process_device_indices_type& ipc_device,
+                       const ccl::process_device_indices_type& scaleout_device_indices)
             : id_array(id_ring_vector),
               process_idx(process_id),
               process_num(process_count),
@@ -388,7 +388,7 @@ struct smart_ring_indexer {
             }
 
             //rank in local graph_ring
-            size_t rank = std::distance(id_array.begin(), it);
+            int rank = std::distance(id_array.begin(), it);
             size_t size = id_array.size();
 
             //Check on IPC source candidate at first
@@ -455,7 +455,7 @@ struct smart_ring_indexer {
             }
 
             //rank in local graph_ring
-            size_t rank = std::distance(id_array.begin(), it);
+            int rank = std::distance(id_array.begin(), it);
             size_t size = id_array.size();
 
             //apply offsets
@@ -475,30 +475,30 @@ struct smart_ring_indexer {
     size_t device_index_offset;
     device_storage& factory;
     specific_indexed_device_storage& topology;
-    ccl::process_device_indices_t ipc_src_indices;
-    ccl::process_device_indices_t ipc_dst_indices;
-    const ccl::process_device_indices_t& scaleout_indices;
+    ccl::process_device_indices_type ipc_src_indices;
+    ccl::process_device_indices_type ipc_dst_indices;
+    const ccl::process_device_indices_type& scaleout_indices;
     size_t marked_indices_count;
 
 private:
     template <class device_t>
-    bool try_as_ipc_source(std::shared_ptr<device_t> gpu_device, size_t rank, size_t size) {
+    bool try_as_ipc_source(std::shared_ptr<device_t> gpu_device, int rank, size_t size) {
         //concurrent device is not IPC source
         return false;
     }
 
-    bool try_as_ipc_source(std::shared_ptr<ccl_gpu_comm> gpu_device, size_t rank, size_t size) {
+    bool try_as_ipc_source(std::shared_ptr<ccl_gpu_comm> gpu_device, int rank, size_t size) {
         return try_as_ipc_source_impl(gpu_device, rank, size);
     }
 
     bool try_as_ipc_source(std::shared_ptr<ccl_virtual_gpu_comm> gpu_device,
-                           size_t rank,
+                           int rank,
                            size_t size) {
         return try_as_ipc_source_impl(gpu_device, rank, size);
     }
 
     template <class device_t>
-    bool try_as_ipc_source_impl(std::shared_ptr<device_t> gpu_device, size_t rank, size_t size) {
+    bool try_as_ipc_source_impl(std::shared_ptr<device_t> gpu_device, int rank, size_t size) {
         //Check on IPC source candidate at first
         const ccl::device_index_type& id = gpu_device->get_device().get_device_path();
         auto process_set = ipc_src_indices.find(process_idx);
@@ -563,7 +563,7 @@ struct graph_ring_indexer_ext : public graph_ring_indexer<group_id, class_id> {
                 throw std::logic_error(std::string("Unknown device in id ring vector: ") +
                                        ccl::to_string(id));
             }
-            size_t rank = std::distance(id_array.begin(), it);
+            int rank = std::distance(id_array.begin(), it);
             rank += rank_offset;
 
             size_t already_assigned_ids_count = assigned_ids.count(id);
@@ -640,7 +640,7 @@ struct graph_ring_indexer_unique_index : public graph_ring_indexer<group_id, cla
                 continue;
             }
 
-            size_t rank = std::distance(id_array.begin(), it);
+            int rank = std::distance(id_array.begin(), it);
             size_t already_assigned_ids_count = assigned_ids.count(id);
             //rank += already_assigned_ids_count; TODO
             (void)already_assigned_ids_count;
@@ -715,7 +715,7 @@ struct graph_ring_indexer_unique_index_ext : public graph_ring_indexer<group_id,
                 continue;
             }
 
-            size_t rank = std::distance(id_array.begin(), it);
+            int rank = std::distance(id_array.begin(), it);
             rank += rank_offset;
 
             size_t already_assigned_ids_count = assigned_ids.count(id);
@@ -1135,7 +1135,7 @@ inline ipc_devices_pool create_ipc_gpu_comms(id_thread_table assigned_ids_copy,
         }
 
         //find unassigned_device
-        size_t rank = std::distance(id_ring.begin(), graph_it);
+        int rank = std::distance(id_ring.begin(), graph_it);
         size_t size = size_override_value;
 
         //recalculate rank to apply offset for other processes count
@@ -1157,7 +1157,7 @@ using cluster_ipc_devices_pool = std::map<size_t/*process_id*/, ipc_devices_pool
 
 template<ccl::group_split_type topology>
 inline cluster_ipc_devices_pool create_filtered_ipc_gpu_comms(const colored_plain_graph& id_ring,
-                                                     const ccl::process_device_indices_t& ipc_indices,
+                                                     const ccl::process_device_indices_type& ipc_indices,
                                                      size_t process_idx,
                                                      size_t process_size,
                                                      device_storage& device_factory)
@@ -1189,7 +1189,7 @@ inline cluster_ipc_devices_pool create_filtered_ipc_gpu_comms(const colored_plai
             }
 
             //device is IPC
-            size_t rank = std::distance(id_ring.begin(), graph_it);
+            int rank = std::distance(id_ring.begin(), graph_it);
             size_t size = id_ring.size();
 
             ccl_device_driver::device_ptr ipc_device = get_runtime_device(graph_it->index);
@@ -1210,7 +1210,7 @@ inline cluster_ipc_devices_pool create_filtered_ipc_gpu_comms(const colored_plai
 template<ccl::group_split_type topology>
 inline cluster_ipc_devices_pool create_filtered_ipc_destination_gpu_comms(
                                             const colored_plain_graph& id_ring,
-                                            const ccl::process_device_indices_t& ipc_indices,
+                                            const ccl::process_device_indices_type& ipc_indices,
                                             size_t process_idx,
                                             size_t process_size,
                                             device_storage& device_factory,
@@ -1239,7 +1239,7 @@ inline cluster_ipc_devices_pool create_filtered_ipc_destination_gpu_comms(
             }
 
             //device is IPC
-            size_t rank = std::distance(id_ring.begin(), graph_it);
+            int rank = std::distance(id_ring.begin(), graph_it);
             size_t size = id_ring.size();
 
             ccl_device_driver::device_ptr ipc_device = get_runtime_device(graph_it->index);
@@ -1269,7 +1269,7 @@ inline cluster_ipc_devices_pool create_ipc_gpu_comms(const colored_plain_graph&
         if (graph_it->color != colored_graph_ring_indexer<topology>::marked_color and
             graph_it->color != process_idx)
         {
-            size_t rank = std::distance(id_ring.begin(), graph_it);
+            int rank = std::distance(id_ring.begin(), graph_it);
             size_t size = size_override_value;
 
             //recalculate rank to apply offset for other processes count
@@ -1322,5 +1322,5 @@ inline std::vector<size_t> get_ipc_proceses(const cluster_ipc_devices_pool& ipc_
     return ipc_processes_id;
 }
 #endif
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/topology/ring/thread_group_ring_creator.cpp b/src/common/comm/l0/topology/ring/thread_group_ring_creator.cpp
index 96a2bb43b..87af7841c 100644
--- a/src/common/comm/l0/topology/ring/thread_group_ring_creator.cpp
+++ b/src/common/comm/l0/topology/ring/thread_group_ring_creator.cpp
@@ -25,14 +25,14 @@ thread_group_ring_topology::thread_group_ring_topology(thread_group_context& ctx
 
 size_t thread_group_ring_topology::default_property_p2p_rating_calculator(const ccl_device& lhs,
                                                                           const ccl_device& rhs) {
-    return details::property_p2p_rating_calculator(lhs, rhs, THREAD_GROUP_WEIGHT);
+    return detail::property_p2p_rating_calculator(lhs, rhs, THREAD_GROUP_WEIGHT);
 }
 
-details::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matrix(
+detail::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matrix(
     std::ostream& out,
     const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
-    details::p2p_rating_function ping) {
-    ccl::process_device_indices_t per_thread_device_indices;
+    detail::p2p_rating_function ping) {
+    ccl::process_device_indices_type per_thread_device_indices;
     for (const auto& mask : per_thread_device_masks) {
         per_thread_device_indices.insert(
             { mask.first, ccl_device_driver::get_device_indices(mask.second) });
@@ -41,16 +41,16 @@ details::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matri
     return build_p2p_capability_matrix(out, per_thread_device_indices, ping);
 }
 
-details::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matrix(
+detail::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matrix(
     std::ostream& out,
-    const ccl::process_device_indices_t& per_thread_device_indices,
-    details::p2p_rating_function ping) {
+    const ccl::process_device_indices_type& per_thread_device_indices,
+    detail::p2p_rating_function ping) {
     // Build adjacency matrix with P2P capability:
     // Rows & columnn is a device IDs ( froms 0 to CCL_GPU_DEVICES_AFFINITY_MASK_SIZE)
     // element values - is a weight of P2P activity: 0 means - devices are not connected
     // If values is not 0 - than two devies can be combined together
 
-    details::adjacency_matrix ring_p2p_matrix;
+    detail::adjacency_matrix ring_p2p_matrix;
     if (per_thread_device_indices.empty()) {
         out << "No indices - nothing to build" << std::endl;
         return ring_p2p_matrix;
@@ -59,12 +59,12 @@ details::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matri
     out << "Build adjacency matrix by: " << thread_group_ring_topology::name()
         << " - threads count: " << per_thread_device_indices.size() << std::endl;
 
-    ccl::device_indices_t aggregated_thread_indices = std::accumulate(
+    ccl::device_indices_type aggregated_thread_indices = std::accumulate(
         per_thread_device_indices.begin(),
         per_thread_device_indices.end(),
-        ccl::device_indices_t(),
-        [](ccl::device_indices_t& partial_mask,
-           const std::pair<size_t, ccl::device_indices_t>& thread_mask) {
+        ccl::device_indices_type(),
+        [](ccl::device_indices_type& partial_mask,
+           const std::pair<size_t, ccl::device_indices_type>& thread_mask) {
             partial_mask.insert(thread_mask.second.begin(), thread_mask.second.end());
             return partial_mask;
         });
@@ -81,15 +81,15 @@ details::adjacency_matrix thread_group_ring_topology::build_p2p_capability_matri
 bool thread_group_ring_topology::build(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_t& per_thread_device_indices,
-    const details::adjacency_matrix& matrix,
-    details::p2p_rating_function ping) {
+    const ccl::process_device_indices_type& per_thread_device_indices,
+    const detail::adjacency_matrix& matrix,
+    detail::p2p_rating_function ping) {
     out << "\n/*************\"" << thread_group_ring_topology::name()
         << "\" for threads: " << context.thread_device_topology.size() << "*************/\n"
         << std::endl;
 
     out << "Resolve device graph: " << std::endl;
-    details::plain_graph_list id_rings =
+    detail::plain_graph_list id_rings =
         graph_list_resolver(matrix, per_thread_device_indices, ping);
 
     size_t size = id_rings.size();
@@ -111,9 +111,9 @@ bool thread_group_ring_topology::build(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
     const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
-    const details::adjacency_matrix& matrix,
-    details::p2p_rating_function ping) {
-    ccl::process_device_indices_t per_thread_device_indices;
+    const detail::adjacency_matrix& matrix,
+    detail::p2p_rating_function ping) {
+    ccl::process_device_indices_type per_thread_device_indices;
     for (const auto& mask : per_thread_device_masks) {
         per_thread_device_indices.insert(
             { mask.first, ccl_device_driver::get_device_indices(mask.second) });
@@ -125,20 +125,19 @@ bool thread_group_ring_topology::build(
 bool thread_group_ring_topology::build_specific(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_t& per_thread_device_indices,
-    const details::plain_graph& id_ring) {
+    const ccl::process_device_indices_type& per_thread_device_indices,
+    const detail::plain_graph& id_ring) {
     size_t ring_index = 0;
     constexpr ccl::device_topology_type class_id = ccl::device_topology_type::ring;
 
     out << "Start building topology: " << ::to_string(class_id) << ", for graph:\n";
-    out << details::to_string(id_ring);
+    out << detail::to_string(id_ring);
 
     // id_ring - inter-thread ring
     out << "\nStart indexer:" << std::endl;
-    details::id_thread_table assigned_ids; //device_id -> thread_id
+    detail::id_thread_table assigned_ids; //device_id -> thread_id
     auto& ctx_per_thread_data = context.thread_device_topology;
-    std::vector<details::marked_idx> marked_id_ring =
-        details::create_marked(id_ring); // marked graph
+    std::vector<detail::marked_idx> marked_id_ring = detail::create_marked(id_ring); // marked graph
 
     auto topology_comm_addr = comm_addr;
     topology_comm_addr.comm_size = marked_id_ring.size();
@@ -154,12 +153,11 @@ bool thread_group_ring_topology::build_specific(
             devices_factory.thread_gpu_comms.find(thread_id)->second;
 
         // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
-        auto rank_builder =
-            create_device_functor<details::graph_ring_indexer<group_id(), class_id>>(
-                marked_id_ring, assigned_ids, thread_id, out_indexed_devices->get_device_storage());
+        auto rank_builder = create_device_functor<detail::graph_ring_indexer<group_id(), class_id>>(
+            marked_id_ring, assigned_ids, thread_id, out_indexed_devices->get_device_storage());
         ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
 
-        details::printer<group_id(), class_id> p;
+        detail::printer<group_id(), class_id> p;
         ccl_tuple_for_each(out_indexed_devices->get_device_storage(), p);
         out << "Indexer result for devices in thread idx (" << thread_id << "/"
             << ctx_per_thread_data.size() << "):\n"
@@ -178,10 +176,10 @@ bool thread_group_ring_topology::build_specific(
                 .get_topology(ring_index)
                 ->get_device_storage();
         const auto& curr_real =
-            details::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
+            detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
                 indexed_devices_for_current_thread, id_ring);
         const auto& curr_virt =
-            details::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
+            detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
                 indexed_devices_for_current_thread, id_ring);
 
         size_t tg_max_rank = std::max({ std::get<0>(curr_real), std::get<0>(curr_virt) });
@@ -205,11 +203,10 @@ bool thread_group_ring_topology::build_specific(
             auto& next_thread_ring_topology = context.get_thread_topology<class_id>(next_thread_id)
                                                   .get_topology(ring_index)
                                                   ->get_device_storage();
-            const auto& real =
-                details::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
-                    next_thread_ring_topology, id_ring);
+            const auto& real = detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
+                next_thread_ring_topology, id_ring);
             const auto& virt =
-                details::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
+                detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
                     next_thread_ring_topology, id_ring);
 
             if (next_rank != std::min({ std::get<0>(real), std::get<0>(virt) })) {
@@ -225,14 +222,14 @@ bool thread_group_ring_topology::build_specific(
                 << ")" << std::endl;
             if (next_rank == std::get<0>(real)) {
                 auto locker =
-                    details::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
+                    detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
                         next_rank, 0, real, devices_factory, indexed_devices_for_current_thread);
                 out << "Added real locker by index: " << next_rank
                     << ", for thread idx: " << current_thread_idx << ":\n"
                     << locker->to_string() << std::endl;
             }
             else if (next_rank == std::get<0>(virt)) {
-                auto locker = details::
+                auto locker = detail::
                     add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), class_id>(
                         next_rank, 0, virt, devices_factory, indexed_devices_for_current_thread);
                 out << "Added virtual locker by index: " << next_rank
@@ -254,14 +251,14 @@ bool thread_group_ring_topology::build_specific(
 bool thread_group_ring_topology::build_scale_up_specific(
     std::ostream& out,
     const ccl::context_comm_addr& comm_addr,
-    const ccl::process_device_indices_t& per_thread_device_indicess,
-    const details::plain_graph_list& graph_list) {
+    const ccl::process_device_indices_type& per_thread_device_indicess,
+    const detail::plain_graph_list& graph_list) {
     size_t ring_index = 0;
     constexpr ccl::device_topology_type class_id = ccl::device_topology_type::ring;
 
     out << "Start building topology: " << ::to_string(class_id)
         << ", for graphs: " << graph_list.size() << "\n";
-    out << details::to_string(graph_list);
+    out << detail::to_string(graph_list);
 
     auto& ctx_per_thread_data = context.thread_device_topology;
     (void)ctx_per_thread_data;
@@ -294,7 +291,7 @@ bool thread_group_ring_topology::build_scale_up_specific(
                 }
 
                 auto scale_virt =
-                    details::add_numa_proxy_device<ccl_virtual_gpu_comm, group_id(), class_id>(
+                    detail::add_numa_proxy_device<ccl_virtual_gpu_comm, group_id(), class_id>(
                         *non_indexed_plain_devices, last_in_graph_index, context, devices_factory);
                 if (scale_virt) {
                     created_scaleup_indices.insert(last_in_graph_index);
@@ -303,7 +300,7 @@ bool thread_group_ring_topology::build_scale_up_specific(
                 }
                 else {
                     auto scale_real =
-                        details::add_numa_proxy_device<ccl_gpu_comm, group_id(), class_id>(
+                        detail::add_numa_proxy_device<ccl_gpu_comm, group_id(), class_id>(
                             *non_indexed_plain_devices,
                             last_in_graph_index,
                             context,
@@ -333,7 +330,7 @@ bool thread_group_ring_topology::build_scale_up_specific(
     std::map<size_t /*graph_num*/, size_t /*offset*/>
         index_offset_for_graphs; // calculate indexed devices count in each graph
 
-    ccl::device_indices_t total_device_indices;
+    ccl::device_indices_type total_device_indices;
     for (const auto& graph : graph_list) {
         total_device_indices.insert(graph.begin(), graph.end());
     }
@@ -342,10 +339,10 @@ bool thread_group_ring_topology::build_scale_up_specific(
     topology_comm_addr.comm_size = total_device_indices.size();
 
     for (const auto& id_ring : graph_list) {
-        details::id_thread_table assigned_ids; //device_id -> thread_id
+        detail::id_thread_table assigned_ids; //device_id -> thread_id
         auto& ctx_per_thread_data = context.thread_device_topology;
-        std::vector<details::marked_idx> marked_id_ring =
-            details::create_marked(id_ring); // marked graph
+        std::vector<detail::marked_idx> marked_id_ring =
+            detail::create_marked(id_ring); // marked graph
         size_t index_offset = accumulated_index_offset_for_graph;
 
         for (auto per_thread_it = ctx_per_thread_data.begin();
@@ -369,13 +366,13 @@ bool thread_group_ring_topology::build_scale_up_specific(
 
             // use graph ids to enumerate thread plain list `thread_gpu_comms` into `out_indexed_devices`
             auto rank_builder = create_device_functor<
-                details::graph_ring_indexer_unique_index<group_id(), class_id>>(
+                detail::graph_ring_indexer_unique_index<group_id(), class_id>>(
                 marked_id_ring, assigned_ids, thread_id, out_indexed_devices, index_offset, 0, 0);
 
             ccl_tuple_for_each(*non_indexed_plain_devices, rank_builder);
 
             // print partial topology enumeration for 'graph' from 'graph_list'
-            details::printer<group_id(), class_id> p;
+            detail::printer<group_id(), class_id> p;
             ccl_tuple_for_each(out_indexed_devices, p);
             out << "Indexer result for devices in thread idx (" << thread_id << "/"
                 << ctx_per_thread_data.size() << "):\n"
@@ -405,19 +402,19 @@ bool thread_group_ring_topology::build_scale_up_specific(
 
             //find max device rank in current thread devices
             const auto& curr_real =
-                details::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
+                detail::get_device_with_min_rank<ccl_gpu_comm, group_id(), class_id>(
                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_virt =
-                details::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
+                detail::get_device_with_min_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
                     indexed_devices_for_current_thread, id_ring);
-            const auto& curr_scale_real = details::
+            const auto& curr_scale_real = detail::
                 get_device_with_min_rank<ccl_numa_proxy<ccl_gpu_comm>, group_id(), class_id>(
                     indexed_devices_for_current_thread, id_ring);
             const auto& curr_scale_virt =
-                details::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                  group_id(),
-                                                  class_id>(indexed_devices_for_current_thread,
-                                                            id_ring);
+                detail::get_device_with_min_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
+                                                 group_id(),
+                                                 class_id>(indexed_devices_for_current_thread,
+                                                           id_ring);
 
             size_t tg_max_rank = std::max({ std::get<0>(curr_real),
                                             std::get<0>(curr_virt),
@@ -445,19 +442,19 @@ bool thread_group_ring_topology::build_scale_up_specific(
                         .get_additiona_topology(ring_index)
                         ->get_device_storage();
                 const auto& real =
-                    details::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
+                    detail::get_device_with_max_rank<ccl_gpu_comm, group_id(), class_id>(
                         next_thread_ring_topology, id_ring);
                 const auto& virt =
-                    details::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
+                    detail::get_device_with_max_rank<ccl_virtual_gpu_comm, group_id(), class_id>(
                         next_thread_ring_topology, id_ring);
                 const auto& scale_real =
-                    details::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>,
-                                                      group_id(),
-                                                      class_id>(next_thread_ring_topology, id_ring);
+                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_gpu_comm>,
+                                                     group_id(),
+                                                     class_id>(next_thread_ring_topology, id_ring);
                 const auto& scale_virt =
-                    details::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
-                                                      group_id(),
-                                                      class_id>(next_thread_ring_topology, id_ring);
+                    detail::get_device_with_max_rank<ccl_numa_proxy<ccl_virtual_gpu_comm>,
+                                                     group_id(),
+                                                     class_id>(next_thread_ring_topology, id_ring);
                 if (next_rank != std::min({ std::get<0>(real),
                                             std::get<0>(virt),
                                             std::get<0>(scale_real),
@@ -475,7 +472,7 @@ bool thread_group_ring_topology::build_scale_up_specific(
                     << ")" << std::endl;
                 if (next_rank == std::get<0>(real)) {
                     auto locker =
-                        details::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
+                        detail::add_concurrent_locker_device<ccl_gpu_comm, group_id(), class_id>(
                             next_rank,
                             index_offset_for_graphs[graph_num],
                             real,
@@ -487,7 +484,7 @@ bool thread_group_ring_topology::build_scale_up_specific(
                         << locker->to_string() << std::endl;
                 }
                 else if (next_rank == std::get<0>(virt)) {
-                    auto locker = details::
+                    auto locker = detail::
                         add_concurrent_locker_device<ccl_virtual_gpu_comm, group_id(), class_id>(
                             next_rank,
                             index_offset_for_graphs[graph_num],
@@ -520,7 +517,7 @@ bool thread_group_ring_topology::build_scale_up_specific(
          ++per_thread_it) {
         size_t thread_id = per_thread_it->first;
 
-        details::printer<group_id(), class_id> p;
+        detail::printer<group_id(), class_id> p;
         ccl_tuple_for_each(context.get_thread_topology<class_id>(thread_id)
                                .get_additiona_topology(ring_index)
                                ->get_device_storage(),
diff --git a/src/common/comm/l0/topology/ring/thread_group_ring_creator.hpp b/src/common/comm/l0/topology/ring/thread_group_ring_creator.hpp
index 1701c1f9a..33de0c94d 100644
--- a/src/common/comm/l0/topology/ring/thread_group_ring_creator.hpp
+++ b/src/common/comm/l0/topology/ring/thread_group_ring_creator.hpp
@@ -39,34 +39,34 @@ class thread_group_ring_topology {
 
     static size_t default_property_p2p_rating_calculator(const ccl_device& lhs,
                                                          const ccl_device& rhs);
-    static details::adjacency_matrix build_p2p_capability_matrix(
+    static detail::adjacency_matrix build_p2p_capability_matrix(
         std::ostream& out,
         const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
-        details::p2p_rating_function ping = default_property_p2p_rating_calculator);
-    static details::adjacency_matrix build_p2p_capability_matrix(
+        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
+    static detail::adjacency_matrix build_p2p_capability_matrix(
         std::ostream& out,
-        const ccl::process_device_indices_t& per_thread_device_indices,
-        details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+        const ccl::process_device_indices_type& per_thread_device_indices,
+        detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 
     bool build(std::ostream& out,
                const ccl::context_comm_addr& context_addr,
                const ccl::process_aggregated_device_mask_t& per_thread_device_masks,
-               const details::adjacency_matrix& matrix,
-               details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+               const detail::adjacency_matrix& matrix,
+               detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
     bool build(std::ostream& out,
                const ccl::context_comm_addr& context_addr,
-               const ccl::process_device_indices_t& per_thread_device_indices,
-               const details::adjacency_matrix& matrix,
-               details::p2p_rating_function ping = default_property_p2p_rating_calculator);
+               const ccl::process_device_indices_type& per_thread_device_indices,
+               const detail::adjacency_matrix& matrix,
+               detail::p2p_rating_function ping = default_property_p2p_rating_calculator);
 
 private:
     bool build_specific(std::ostream& out,
                         const ccl::context_comm_addr& context_addr,
-                        const ccl::process_device_indices_t& per_thread_device_indices,
-                        const details::plain_graph& graph);
+                        const ccl::process_device_indices_type& per_thread_device_indices,
+                        const detail::plain_graph& graph);
     bool build_scale_up_specific(std::ostream& out,
                                  const ccl::context_comm_addr& context_addr,
-                                 const ccl::process_device_indices_t& per_thread_device_indicess,
-                                 const details::plain_graph_list& graph_list);
+                                 const ccl::process_device_indices_type& per_thread_device_indicess,
+                                 const detail::plain_graph_list& graph_list);
 };
 } // namespace native
diff --git a/src/common/comm/l0/topology/topology_construction_utils.cpp b/src/common/comm/l0/topology/topology_construction_utils.cpp
index c3011ecdb..5bb4d1873 100644
--- a/src/common/comm/l0/topology/topology_construction_utils.cpp
+++ b/src/common/comm/l0/topology/topology_construction_utils.cpp
@@ -16,7 +16,7 @@
 #include "common/comm/l0/topology/topology_construction_utils.hpp"
 
 namespace native {
-std::ostream& operator<<(std::ostream& out, const details::adjacency_matrix& matrix) {
+std::ostream& operator<<(std::ostream& out, const detail::adjacency_matrix& matrix) {
     if (matrix.empty()) {
         return out;
     }
@@ -28,7 +28,7 @@ std::ostream& operator<<(std::ostream& out, const details::adjacency_matrix& mat
         out << left_index << "\t:\t{";
         for (const auto& device_cross_rating_value : device_adjacencies) {
             const ccl::device_index_type& right_index = device_cross_rating_value.first;
-            details::cross_device_rating rating = device_cross_rating_value.second;
+            detail::cross_device_rating rating = device_cross_rating_value.second;
             out << right_index << "/ " << rating << ", ";
         }
         out << "},\n";
@@ -37,7 +37,7 @@ std::ostream& operator<<(std::ostream& out, const details::adjacency_matrix& mat
     return out;
 }
 
-namespace details {
+namespace detail {
 std::ostream& operator<<(std::ostream& out, const adjacency_matrix& matrix) {
     if (matrix.empty()) {
         return out;
@@ -50,7 +50,7 @@ std::ostream& operator<<(std::ostream& out, const adjacency_matrix& matrix) {
         out << ccl::to_string(left_index) << "\t:\t{";
         for (const auto& device_cross_rating_value : device_adjacencies) {
             const ccl::device_index_type& right_index = device_cross_rating_value.first;
-            details::cross_device_rating rating = device_cross_rating_value.second;
+            detail::cross_device_rating rating = device_cross_rating_value.second;
             out << ccl::to_string(right_index) << "/ " << rating << ", ";
         }
         out << "},\n";
@@ -70,8 +70,15 @@ size_t property_p2p_rating_calculator(const native::ccl_device& lhs,
     ze_device_p2p_properties_t p2p = lhs.get_p2p_properties(rhs);
     if (p2p.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS)
         return weight;
-    else
-        return 0;
+    else {
+        ze_bool_t access;
+        ze_result_t ret = zeDeviceCanAccessPeer(lhs.handle, rhs.handle, &access);
+        if (ret != ZE_RESULT_SUCCESS) {
+            throw std::runtime_error(std::string("Cannot execute zeDeviceCanAccessPeer, error: ") +
+                                     native::to_string(ret));
+        }
+        return access ? weight : 0;
+    }
 }
 
 std::string to_string(const plain_graph& cont) {
@@ -295,7 +302,7 @@ adjacency_matrix create_adjacency_matrix_for_devices(
 }
 
 plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::device_indices_t& device_indexes) {
+                           const ccl::device_indices_type& device_indexes) {
     plain_graph ids_ring;
 
     std::multimap<ccl::device_index_type, bool> marked_indices;
@@ -337,7 +344,7 @@ plain_graph graph_resolver(const adjacency_matrix& matrix,
                                              ". Check adjacency_matrix construction");
                 }
 
-                details::cross_device_rating rating = rating_it->second;
+                detail::cross_device_rating rating = rating_it->second;
                 if (rating != 0) {
                     //find next
                     ids_ring.push_back(it->first);
@@ -383,7 +390,7 @@ plain_graph graph_resolver(const adjacency_matrix& matrix,
 }
 
 plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::process_device_indices_t& per_process_device_indexes) {
+                           const ccl::process_device_indices_type& per_process_device_indexes) {
     plain_graph ids_ring;
 
     for (const auto& thread_group_val : per_process_device_indexes) {
@@ -408,7 +415,7 @@ plain_graph graph_resolver(const adjacency_matrix& matrix,
 
 /* graph list creation utils */
 plain_graph_list graph_list_resolver(const adjacency_matrix& matrix,
-                                     const ccl::device_indices_t& device_indexes) {
+                                     const ccl::device_indices_type& device_indexes) {
     plain_graph_list isles;
 
     using marked_storage = std::multimap<ccl::device_index_type, bool>;
@@ -463,7 +470,7 @@ plain_graph_list graph_list_resolver(const adjacency_matrix& matrix,
                         ". Check adjacency_matrix construction");
                 }
 
-                details::cross_device_rating rating = rating_it->second;
+                detail::cross_device_rating rating = rating_it->second;
                 if (rating != 0) {
                     //find next
                     cur_graph.push_back(index);
@@ -555,7 +562,7 @@ struct index_extractor<typename colored_plain_graph::value_type> {
 
 template <template <class...> class container, class graph_list, class index_getter>
 graph_list merge_graphs_stable(const container<graph_list>& lists,
-                               details::p2p_rating_function ping,
+                               detail::p2p_rating_function ping,
                                index_getter get,
                                bool brake_on_incompatible,
                                bool to_right,
@@ -654,7 +661,7 @@ bool check_graph_a2a_capable(const plain_graph& graph,
             throw std::runtime_error(ss.str());
         }
 
-        const details::adjacency_list& control_list = m_it->second;
+        const detail::adjacency_list& control_list = m_it->second;
         for (const ccl::device_index_type& rhs_index : graph) {
             auto c_it = control_list.find(rhs_index);
             if (c_it != control_list.end() and c_it->second != 0) {
@@ -673,7 +680,7 @@ bool check_graph_a2a_capable(const plain_graph& graph,
 }
 
 plain_graph_list merge_graph_lists_stable(const std::list<plain_graph_list>& lists,
-                                          details::p2p_rating_function ping,
+                                          detail::p2p_rating_function ping,
                                           bool brake_on_incompatible) {
     size_t merged_process_count = 0;
     return merge_graphs_stable(lists,
@@ -685,7 +692,7 @@ plain_graph_list merge_graph_lists_stable(const std::list<plain_graph_list>& lis
 }
 
 colored_plain_graph_list merge_graph_lists_stable(const std::list<colored_plain_graph_list>& lists,
-                                                  details::p2p_rating_function ping,
+                                                  detail::p2p_rating_function ping,
                                                   bool brake_on_incompatible) {
     size_t merged_process_count = 0;
     return merge_graphs_stable(lists,
@@ -698,7 +705,7 @@ colored_plain_graph_list merge_graph_lists_stable(const std::list<colored_plain_
 
 colored_plain_graph_list merge_graph_lists_stable_for_process(
     const std::list<colored_plain_graph_list>& lists,
-    details::p2p_rating_function ping,
+    detail::p2p_rating_function ping,
     bool to_right,
     size_t& merged_process_count) {
     return merge_graphs_stable(lists,
@@ -711,8 +718,8 @@ colored_plain_graph_list merge_graph_lists_stable_for_process(
 
 plain_graph_list graph_list_resolver(
     const adjacency_matrix& matrix,
-    const ccl::process_device_indices_t& per_process_device_indexes,
-    details::p2p_rating_function ping) {
+    const ccl::process_device_indices_type& per_process_device_indexes,
+    detail::p2p_rating_function ping) {
     std::list<plain_graph_list> lists;
     for (const auto& thread_group_val : per_process_device_indexes) {
         lists.emplace_back(graph_list_resolver(matrix, thread_group_val.second));
@@ -787,5 +794,5 @@ void reset_color(colored_plain_graph_list& list, color_t new_color) {
         }
     }
 }
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/topology/topology_construction_utils.hpp b/src/common/comm/l0/topology/topology_construction_utils.hpp
index 330a2da9f..e8fb4d3ec 100644
--- a/src/common/comm/l0/topology/topology_construction_utils.hpp
+++ b/src/common/comm/l0/topology/topology_construction_utils.hpp
@@ -34,7 +34,7 @@ struct device_group_context;
 struct device_storage;
 struct ccl_device;
 
-namespace details {
+namespace detail {
 
 adjacency_matrix create_adjacency_matrix_for_devices(
     const ccl_device_driver::devices_storage_type& devices,
@@ -57,18 +57,18 @@ void fill_adjacency_matrix_for_single_device_in_devices_by_cond(
         std::function<bool(const ccl::device_index_type&)>());
 
 plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::device_indices_t& device_indexes);
+                           const ccl::device_indices_type& device_indexes);
 plain_graph graph_resolver(const adjacency_matrix& matrix,
-                           const ccl::process_device_indices_t& per_process_device_indexes);
+                           const ccl::process_device_indices_type& per_process_device_indexes);
 plain_graph graph_resolver(const adjacency_matrix& matrix,
                            const ccl::process_aggregated_device_mask_t& per_process_device_masks);
 
 plain_graph_list graph_list_resolver(const adjacency_matrix& matrix,
-                                     const ccl::device_indices_t& device_indexes);
+                                     const ccl::device_indices_type& device_indexes);
 plain_graph_list graph_list_resolver(
     const adjacency_matrix& matrix,
-    const ccl::process_device_indices_t& per_process_device_indexes,
-    details::p2p_rating_function ping);
+    const ccl::process_device_indices_type& per_process_device_indexes,
+    detail::p2p_rating_function ping);
 
 plain_graph_list graph_list_resolver(
     const adjacency_matrix& matrix,
@@ -79,15 +79,15 @@ bool check_graph_a2a_capable(const plain_graph& graph,
                              std::ostream& out);
 
 plain_graph_list merge_graph_lists_stable(const std::list<plain_graph_list>& lists,
-                                          details::p2p_rating_function ping,
+                                          detail::p2p_rating_function ping,
                                           bool brake_on_incompatible = false);
 
 colored_plain_graph_list merge_graph_lists_stable(const std::list<colored_plain_graph_list>& lists,
-                                                  details::p2p_rating_function ping,
+                                                  detail::p2p_rating_function ping,
                                                   bool brake_on_incompatible = false);
 colored_plain_graph_list merge_graph_lists_stable_for_process(
     const std::list<colored_plain_graph_list>& lists,
-    details::p2p_rating_function ping,
+    detail::p2p_rating_function ping,
     bool to_right,
     size_t& merged_process_count);
 
@@ -96,7 +96,7 @@ size_t property_p2p_rating_calculator(const native::ccl_device& lhs,
                                       size_t weight);
 
 void reset_color(colored_plain_graph_list& list, color_t new_color);
-} // namespace details
-std::ostream& operator<<(std::ostream& out, const details::adjacency_matrix& matrix);
+} // namespace detail
+std::ostream& operator<<(std::ostream& out, const detail::adjacency_matrix& matrix);
 
 } // namespace native
diff --git a/src/common/comm/l0/topology/topology_creator.hpp b/src/common/comm/l0/topology/topology_creator.hpp
index 5dd01cea8..5a3dd1ca6 100644
--- a/src/common/comm/l0/topology/topology_creator.hpp
+++ b/src/common/comm/l0/topology/topology_creator.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 namespace native {
-namespace details {
+namespace detail {
 
 template <class F>
 struct device_group_container_functor {
@@ -33,10 +33,10 @@ struct device_group_container_functor {
 private:
     F operation;
 };
-} // namespace details
+} // namespace detail
 
 template <class F, class... Args>
-details::device_group_container_functor<F> create_device_functor(Args&&... args) {
-    return details::device_group_container_functor<F>(std::forward<Args>(args)...);
+detail::device_group_container_functor<F> create_device_functor(Args&&... args) {
+    return detail::device_group_container_functor<F>(std::forward<Args>(args)...);
 }
 } // namespace native
diff --git a/src/common/comm/l0/topology/topology_declarations.hpp b/src/common/comm/l0/topology/topology_declarations.hpp
index 1c5868b15..f434fa293 100644
--- a/src/common/comm/l0/topology/topology_declarations.hpp
+++ b/src/common/comm/l0/topology/topology_declarations.hpp
@@ -21,7 +21,7 @@
 #include "oneapi/ccl/native_device_api/l0/utils.hpp"
 
 namespace native {
-namespace details {
+namespace detail {
 struct marked_idx : std::pair<bool, ccl::device_index_type> {
     marked_idx(bool m, ccl::device_index_type i) : std::pair<bool, ccl::device_index_type>(m, i) {}
 };
@@ -101,11 +101,11 @@ std::string to_string(const global_sorted_colored_plain_graphs& cluster);
 std::string to_string(const global_plain_colored_graphs& cluster);
 
 std::ostream& operator<<(std::ostream& out, const colored_idx& idx);
-} // namespace details
+} // namespace detail
 
 template <class payload_type>
 std::ostream& operator<<(std::ostream& out,
-                         const details::colored_indexed_data<payload_type>& data) {
+                         const detail::colored_indexed_data<payload_type>& data) {
     out << data.to_string();
     return out;
 }
diff --git a/src/common/comm/l0/topology/topology_serializer.cpp b/src/common/comm/l0/topology/topology_serializer.cpp
index 63c0c22f6..309e37979 100644
--- a/src/common/comm/l0/topology/topology_serializer.cpp
+++ b/src/common/comm/l0/topology/topology_serializer.cpp
@@ -16,7 +16,7 @@
 #include "common/comm/l0/topology/topology_serializer.hpp"
 
 namespace native {
-namespace details {
+namespace detail {
 namespace serialize {
 device_path_serializable::raw_data_t device_path_serializable::result() {
     return data;
@@ -112,13 +112,13 @@ device_path_serializer::raw_data_t device_path_serializer::serialize_indices_imp
 }
 
 device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
-    const details::plain_graph_list& list,
+    const detail::plain_graph_list& list,
     size_t data_offset) {
     /*
     std::list<raw_data_t> serialized_list;
     size_t list_size = list.size();
     size_t total_size = sizeof(list_size) + data_offset;
-    for (const details::plain_graph& graph : list)
+    for (const detail::plain_graph& graph : list)
     {
         size_t graph_count = graph.size();
         raw_data_t serialized_graph = device_path_serializer::serialize_indices(graph,
@@ -147,7 +147,7 @@ device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
 }
 
 device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
-    const details::global_sorted_plain_graphs& list) {
+    const detail::global_sorted_plain_graphs& list) {
     /*std::list<raw_data_t> serialized_list;
     size_t cluster_size = list.size();
     size_t total_size = sizeof(cluster_size); //preambule size
@@ -181,13 +181,13 @@ device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
 }
 
 device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
-    const details::colored_plain_graph_list& list,
+    const detail::colored_plain_graph_list& list,
     size_t offset) {
     return device_path_serializer::serialize_indices_impl(list, offset);
 }
 
 device_path_serializable::raw_data_t device_path_serializer::serialize_indices(
-    const details::global_sorted_colored_plain_graphs& list) {
+    const detail::global_sorted_colored_plain_graphs& list) {
     return device_path_serializer::serialize_indices_impl(list);
 }
 
@@ -333,13 +333,13 @@ std::map<size_t, T> device_path_deserializer::deserialize_generic_indices_map_im
     return global;
 }
 
-details::plain_graph_list device_path_deserializer::deserialize_graph_list_indices(
+detail::plain_graph_list device_path_deserializer::deserialize_graph_list_indices(
     const raw_data_t& data,
     size_t& deserialized_bytes_count,
     size_t offset) {
     return device_path_deserializer::deserialize_generic_indices_list_impl<
-        typename details::plain_graph_list::value_type>(data, deserialized_bytes_count, offset, 0);
-    /*details::plain_graph_list list;
+        typename detail::plain_graph_list::value_type>(data, deserialized_bytes_count, offset, 0);
+    /*detail::plain_graph_list list;
     size_t list_size = 0;
     // preconditions
     if (data.size() < sizeof(list_size) + offset)
@@ -385,7 +385,7 @@ details::plain_graph_list device_path_deserializer::deserialize_graph_list_indic
                                      ", with offset: " + std::to_string(offset));
         }
         //deserialize graph_data
-        details::plain_graph graph;
+        detail::plain_graph graph;
         graph.reserve(graph_size);
         //deserialize graph portion
         auto data_end_it = data_it;
@@ -409,12 +409,12 @@ details::plain_graph_list device_path_deserializer::deserialize_graph_list_indic
     return list;*/
 }
 
-details::global_sorted_plain_graphs device_path_deserializer::deserialize_global_graph_list_indices(
+detail::global_sorted_plain_graphs device_path_deserializer::deserialize_global_graph_list_indices(
     const raw_data_t& data) {
     return device_path_deserializer::template deserialize_generic_indices_map_impl<
-        typename details::global_sorted_plain_graphs::mapped_type>(data, 0);
+        typename detail::global_sorted_plain_graphs::mapped_type>(data, 0);
     /*
-    details::global_sorted_plain_graphs global;
+    detail::global_sorted_plain_graphs global;
     size_t global_size = 0;
     size_t deserialized_bytes_count = 0;
     // preconditions
@@ -448,7 +448,7 @@ details::global_sorted_plain_graphs device_path_deserializer::deserialize_global
         deserialized_bytes_count += expected_count;
         //get graph_data for process
         size_t process_deserialized_count = 0;
-        details::plain_graph_list process_list =
+        detail::plain_graph_list process_list =
                 device_path_deserializer::deserialize_graph_list_indices(raw_data_t(data_it,
                                                                                     data.end()),
                                                                          process_deserialized_count);
@@ -474,30 +474,30 @@ details::global_sorted_plain_graphs device_path_deserializer::deserialize_global
     */
 }
 
-details::colored_plain_graph_list device_path_deserializer::deserialize_colored_graph_list_indices(
+detail::colored_plain_graph_list device_path_deserializer::deserialize_colored_graph_list_indices(
     const raw_data_t& list,
     size_t& deserialized_bytes_count,
     size_t offset) {
     return device_path_deserializer::deserialize_generic_indices_list_impl<
-        typename details::colored_plain_graph_list::value_type>(
+        typename detail::colored_plain_graph_list::value_type>(
         list,
         deserialized_bytes_count,
         offset,
-        sizeof(details::colored_idx) - sizeof(ccl::device_index_type));
+        sizeof(detail::colored_idx) - sizeof(ccl::device_index_type));
 }
 
-details::global_sorted_colored_plain_graphs
+detail::global_sorted_colored_plain_graphs
 device_path_deserializer::deserialize_global_colored_graph_list_indices(const raw_data_t& list) {
     return device_path_deserializer::deserialize_generic_indices_map_impl<
-        typename details::global_sorted_colored_plain_graphs::mapped_type>(
-        list, sizeof(details::colored_idx) - sizeof(ccl::device_index_type));
+        typename detail::global_sorted_colored_plain_graphs::mapped_type>(
+        list, sizeof(detail::colored_idx) - sizeof(ccl::device_index_type));
 }
 
-details::colored_idx device_path_deserializer::extract_index(raw_data_t::const_iterator it_begin,
-                                                             raw_data_t::const_iterator it_end,
-                                                             std::false_type raw_index) {
-    constexpr size_t color_size = sizeof(details::color_t);
-    constexpr size_t stride = sizeof(details::colored_idx) - sizeof(ccl::device_index_type);
+detail::colored_idx device_path_deserializer::extract_index(raw_data_t::const_iterator it_begin,
+                                                            raw_data_t::const_iterator it_end,
+                                                            std::false_type raw_index) {
+    constexpr size_t color_size = sizeof(detail::color_t);
+    constexpr size_t stride = sizeof(detail::colored_idx) - sizeof(ccl::device_index_type);
     if (std::distance(it_begin, it_end) %
             (device_path_serializable::device_index_size() + stride) !=
         0) {
@@ -508,11 +508,11 @@ details::colored_idx device_path_deserializer::extract_index(raw_data_t::const_i
             std::to_string(device_path_serializable::device_index_size() + stride));
     }
 
-    details::color_t color = 0;
+    detail::color_t color = 0;
     memcpy(&color, &(*it_begin), color_size);
     std::advance(it_begin, stride);
 
-    return details::colored_idx(
+    return detail::colored_idx(
         color, device_path_deserializer::extract_index(it_begin, it_end, std::true_type{}));
 }
 
@@ -551,10 +551,10 @@ ccl::device_index_type device_path_deserializer::extract_index(raw_data_t::const
     return path;
 }
 /*
-ccl::device_indices_t device_path_deserializer::operator()(const std::vector<unsigned char>& raw_data)
+ccl::device_indices_type device_path_deserializer::operator()(const std::vector<unsigned char>& raw_data)
 {
     size_t elem_count = base::get_indices_count(raw_data.size());
-    ccl::device_indices_t data;
+    ccl::device_indices_type data;
     constexpr auto offset = sizeof(ccl::index_type) / sizeof(unsigned char);
     for(auto raw_data_it = raw_data.begin(); raw_data_it != raw_data.end(); )
     {
@@ -575,5 +575,5 @@ ccl::device_indices_t device_path_deserializer::operator()(const std::vector<uns
 }
 */
 } // namespace serialize
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/l0/topology/topology_serializer.hpp b/src/common/comm/l0/topology/topology_serializer.hpp
index bb3d1d371..78f3cdf57 100644
--- a/src/common/comm/l0/topology/topology_serializer.hpp
+++ b/src/common/comm/l0/topology/topology_serializer.hpp
@@ -16,7 +16,7 @@
 #pragma once
 #include "common/comm/l0/topology/topology_construction_utils.hpp"
 namespace native {
-namespace details {
+namespace detail {
 namespace serialize {
 struct device_path_serializable {
     using raw_data_t = std::vector<unsigned char>;
@@ -62,11 +62,11 @@ struct device_path_serializer : device_path_serializable {
     }
 
     template <template <class...> class container>
-    static raw_data_t serialize_indices(const container<details::colored_idx>& indices,
+    static raw_data_t serialize_indices(const container<detail::colored_idx>& indices,
                                         size_t additional_reserved_bytes = 0) {
-        static_assert(sizeof(details::colored_idx) >= sizeof(ccl::device_index_type),
+        static_assert(sizeof(detail::colored_idx) >= sizeof(ccl::device_index_type),
                       "'stride' must be positive or zero");
-        constexpr size_t stride = sizeof(details::colored_idx) - sizeof(ccl::device_index_type);
+        constexpr size_t stride = sizeof(detail::colored_idx) - sizeof(ccl::device_index_type);
         device_path_serializer consumer(indices.size(), additional_reserved_bytes, stride);
         for (const auto& path : indices) {
             //serialize color
@@ -82,11 +82,11 @@ struct device_path_serializer : device_path_serializable {
         return consumer.result();
     }
 
-    static raw_data_t serialize_indices(const details::plain_graph_list& list, size_t offset = 0);
-    static raw_data_t serialize_indices(const details::global_sorted_plain_graphs& list);
-    static raw_data_t serialize_indices(const details::colored_plain_graph_list& list,
+    static raw_data_t serialize_indices(const detail::plain_graph_list& list, size_t offset = 0);
+    static raw_data_t serialize_indices(const detail::global_sorted_plain_graphs& list);
+    static raw_data_t serialize_indices(const detail::colored_plain_graph_list& list,
                                         size_t offset = 0);
-    static raw_data_t serialize_indices(const details::global_sorted_colored_plain_graphs& list);
+    static raw_data_t serialize_indices(const detail::global_sorted_colored_plain_graphs& list);
 
     template <class index_type>
     void operator()(const index_type& value) {
@@ -153,26 +153,25 @@ struct device_path_deserializer : device_path_serializable {
         return ret;
     }
 
-    static details::plain_graph_list deserialize_graph_list_indices(
-        const raw_data_t& list,
-        size_t& deserialized_bytes_count,
-        size_t offset = 0);
-    static details::global_sorted_plain_graphs deserialize_global_graph_list_indices(
+    static detail::plain_graph_list deserialize_graph_list_indices(const raw_data_t& list,
+                                                                   size_t& deserialized_bytes_count,
+                                                                   size_t offset = 0);
+    static detail::global_sorted_plain_graphs deserialize_global_graph_list_indices(
         const raw_data_t& list);
 
-    static details::colored_plain_graph_list deserialize_colored_graph_list_indices(
+    static detail::colored_plain_graph_list deserialize_colored_graph_list_indices(
         const raw_data_t& list,
         size_t& deserialized_bytes_count,
         size_t offset = 0);
-    static details::global_sorted_colored_plain_graphs
-    deserialize_global_colored_graph_list_indices(const raw_data_t& list);
+    static detail::global_sorted_colored_plain_graphs deserialize_global_colored_graph_list_indices(
+        const raw_data_t& list);
 
     static ccl::device_index_type extract_index(raw_data_t::const_iterator it_begin,
                                                 raw_data_t::const_iterator it_end,
                                                 std::true_type raw_index);
-    static details::colored_idx extract_index(raw_data_t::const_iterator it_begin,
-                                              raw_data_t::const_iterator it_end,
-                                              std::false_type colored_index);
+    static detail::colored_idx extract_index(raw_data_t::const_iterator it_begin,
+                                             raw_data_t::const_iterator it_end,
+                                             std::false_type colored_index);
 
 private:
     template <class T>
@@ -185,5 +184,5 @@ struct device_path_deserializer : device_path_serializable {
                                                                     size_t stride = 0);
 };
 } // namespace serialize
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/common/comm/single_device_communicator/single_device_base.hpp b/src/common/comm/single_device_communicator/single_device_base.hpp
index c72675277..611e7b566 100644
--- a/src/common/comm/single_device_communicator/single_device_base.hpp
+++ b/src/common/comm/single_device_communicator/single_device_base.hpp
@@ -52,7 +52,7 @@ class typed_single_device_base_communicator : public base_communicator {
     }
 
     typed_single_device_base_communicator(ccl::unified_device_type&& device,
-                                          ccl::unified_device_context_type&& context,
+                                          ccl::unified_context_type&& context,
                                           size_t thread_idx,
                                           size_t process_idx,
                                           const ccl::comm_split_attr& attr);
@@ -62,65 +62,10 @@ class typed_single_device_base_communicator : public base_communicator {
 
     bool is_ready() const override;
 
-    // communicator interfaces implementation
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION__VOID;
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(char);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(int);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(int64_t);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(uint64_t);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(float);
-    DEVICE_COMM_INTERFACE_COLL_DEFINITION(double);
-
+    COMM_INTERFACE_COLL_METHODS(DEFINITION);
 #ifdef CCL_ENABLE_SYCL
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<char COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<uint64_t COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION__VOID;
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(char, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(int64_t, uint64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, char);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, int);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, ccl::bf16);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, float);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, double);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, int64_t);
-    DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(uint64_t, uint64_t);
-
-#ifdef CCL_ENABLE_SYCL
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>,
-                                                  cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int COMMA 1>,
-                                                  cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                  cl::sycl::buffer<float COMMA 1>);
-    DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(cl::sycl::buffer<int64_t COMMA 1>,
-                                                  cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+    SYCL_COMM_INTERFACE_COLL_METHODS(DEFINITION);
+#endif /* CCL_ENABLE_SYCL */
 
     // troubleshooting
     std::string to_string() const;
diff --git a/src/common/comm/single_device_communicator/single_device_base_impl.hpp b/src/common/comm/single_device_communicator/single_device_base_impl.hpp
index 1484ddbbe..5f752b3a4 100644
--- a/src/common/comm/single_device_communicator/single_device_base_impl.hpp
+++ b/src/common/comm/single_device_communicator/single_device_base_impl.hpp
@@ -14,8 +14,8 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/comm/single_device_communicator/single_device_base.hpp"
 
 #define TEMPLATE_DECL_ARG class comm_impl, class communicator_traits
@@ -24,11 +24,12 @@
 template <TEMPLATE_DECL_ARG>
 typed_single_device_base_communicator<TEMPLATE_DEF_ARG>::typed_single_device_base_communicator(
     ccl::unified_device_type&& owned_device,
-    ccl::unified_device_context_type&& context,
+    ccl::unified_context_type&& context,
     size_t thread_idx,
     size_t process_idx,
     const ccl::comm_split_attr& attr)
-        : base_communicator(std::move(owned_device), std::move(context),
+        : base_communicator(std::move(owned_device),
+                            std::move(context),
                             thread_idx,
                             process_idx /*, comm_attr*/,
                             attr) {
@@ -54,8 +55,8 @@ bool typed_single_device_base_communicator<TEMPLATE_DEF_ARG>::is_ready() const {
 }
 
 template <TEMPLATE_DECL_ARG>
-ccl::group_split_type
-typed_single_device_base_communicator<TEMPLATE_DEF_ARG>::get_topology_type() const {
+ccl::group_split_type typed_single_device_base_communicator<TEMPLATE_DEF_ARG>::get_topology_type()
+    const {
     return self_t::topology_type();
 }
 
diff --git a/src/common/comm/single_device_communicator/single_device_communicator.cpp b/src/common/comm/single_device_communicator/single_device_communicator.cpp
index 128a623fc..69c32ceb0 100644
--- a/src/common/comm/single_device_communicator/single_device_communicator.cpp
+++ b/src/common/comm/single_device_communicator/single_device_communicator.cpp
@@ -14,7 +14,7 @@
  limitations under the License.
 */
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
 #include "common/comm/single_device_communicator/single_device_communicator_impl.hpp"
 #ifdef MULTI_GPU_SUPPORT
@@ -25,16 +25,20 @@
 using namespace ccl;
 
 single_device_communicator::single_device_communicator(ccl::unified_device_type&& device,
-                                                       ccl::unified_device_context_type&& context,
+                                                       ccl::unified_context_type&& context,
                                                        size_t thread_idx,
                                                        size_t process_idx,
                                                        const ccl::comm_split_attr& attr)
-        : base_t(std::move(device), std::move(context), thread_idx, process_idx /*, comm_attr*/, attr) {}
+        : base_t(std::move(device),
+                 std::move(context),
+                 thread_idx,
+                 process_idx /*, comm_attr*/,
+                 attr) {}
 
-single_device_communicator::~single_device_communicator() {
-}
+single_device_communicator::~single_device_communicator() {}
 
-std::shared_ptr<ccl::communicator_interface> single_device_communicator::split(const ccl::comm_split_attr& attr) {
+std::shared_ptr<ccl::communicator_interface> single_device_communicator::split(
+    const ccl::comm_split_attr& attr) {
     // TODO
     throw ccl::exception(std::string(__FUNCTION__) + " - 'is not implemented");
     return {};
@@ -50,11 +54,10 @@ void single_device_communicator::set_ccl_comm(std::shared_ptr<ccl_comm> impl) {
 
 //TODO use visit() to set `context`
 void single_device_communicator::set_context(
-    const ccl::unified_device_context_type::ccl_native_t& in_context) {
+    const ccl::unified_context_type::ccl_native_t& in_context) {
     context = in_context;
 }
-void single_device_communicator::set_context(const ccl::context& in_context)
-{
+void single_device_communicator::set_context(const ccl::context& in_context) {
     context = in_context.get_native();
 }
 
@@ -72,21 +75,19 @@ void single_device_communicator::visit(ccl::gpu_comm_attr& comm_attr) {
     this->set_comm_group_id(comm_attr.get_unique_id());
 }
 #endif
-single_device_communicator::coll_request_t single_device_communicator::barrier(
-    const ccl::stream::impl_value_t& op_stream,
-    const ccl::barrier_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event single_device_communicator::barrier(const ccl::stream::impl_value_t& op_stream,
+                                               const ccl::barrier_attr& attr,
+                                               const ccl::vector_class<ccl::event>& deps) {
     // TODO what exactly we need to do with 'attr' here?
 
     ccl_barrier_impl(comm_impl.get(), op_stream.get());
 
     // TODO what exactly we need to return here? ccl_barrier_impl() is void func
-    ccl_request* req = nullptr;
-    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
+    return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(nullptr));
 }
 
 /* allgatherv */
-single_device_communicator::coll_request_t single_device_communicator::allgatherv_base_impl(
+ccl::event single_device_communicator::allgatherv_base_impl(
     const void* send_buf,
     size_t send_count,
     void* recv_buf,
@@ -95,190 +96,247 @@ single_device_communicator::coll_request_t single_device_communicator::allgather
     const ccl::stream::impl_value_t& stream,
     const ccl_coll_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    coll_request_t req;
-
-    if (!allgather_usm_visitor_base_t::visit(
-            req, dtype, send_buf, send_count, recv_buf, recv_counts, stream, attr, deps)) {
-        req = coll_request_t(std::unique_ptr<ccl::event_impl>(
-            new ccl::host_event_impl(ccl_allgatherv_impl(reinterpret_cast<const void*>(send_buf),
-                                                           send_count,
-                                                           reinterpret_cast<void*>(recv_buf),
-                                                           recv_counts.data(),
-                                                           dtype,
-                                                           attr,
-                                                           comm_impl.get(),
-                                                           stream.get()))));
+    using namespace ::native::detail;
+
+    std::vector<void*> bufs = { (void*)send_buf, recv_buf };
+    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
+
+    const ccl_stream* stream_handle = nullptr;
+
+    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
     }
-    return req;
-}
+    else if (mode == usm_support_mode::need_conversion)
+#ifdef CCL_ENABLE_SYCL
+        stream_handle = stream.get();
+#else
+        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                             " - USM convertation is not supported for such configuration");
+#endif
 
-single_device_communicator::coll_request_t single_device_communicator::allgatherv_impl(
-    const void* send_buf,
-    size_t send_count,
-    void* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+    return ccl::event(std::unique_ptr<ccl::event_impl>(
+        new ccl::host_event_impl(ccl_allgatherv_impl(send_buf,
+                                                     send_count,
+                                                     recv_buf,
+                                                     recv_counts.data(),
+                                                     dtype,
+                                                     attr,
+                                                     comm_impl.get(),
+                                                     stream_handle))));
+}
 
+ccl::event single_device_communicator::allgatherv_impl(const void* send_buf,
+                                                       size_t send_count,
+                                                       void* recv_buf,
+                                                       const ccl::vector_class<size_t>& recv_counts,
+                                                       ccl::datatype dtype,
+                                                       const ccl::stream::impl_value_t& stream,
+                                                       const ccl::allgatherv_attr& attr,
+                                                       const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
-    return allgatherv_base_impl(send_buf, send_count, recv_buf, recv_counts, dtype, stream, internal_attr, deps);
+    return allgatherv_base_impl(
+        send_buf, send_count, recv_buf, recv_counts, dtype, stream, internal_attr, deps);
 }
 
-single_device_communicator::coll_request_t single_device_communicator::allgatherv_impl(
-    const void* send_buf,
-    size_t send_count,
-    const ccl::vector_class<void*>& recv_bufs,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-
+ccl::event single_device_communicator::allgatherv_impl(const void* send_buf,
+                                                       size_t send_count,
+                                                       const ccl::vector_class<void*>& recv_bufs,
+                                                       const ccl::vector_class<size_t>& recv_counts,
+                                                       ccl::datatype dtype,
+                                                       const ccl::stream::impl_value_t& stream,
+                                                       const ccl::allgatherv_attr& attr,
+                                                       const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
     internal_attr.vector_buf = 1;
-    return allgatherv_base_impl(send_buf, send_count, (void*)(recv_bufs.data()), recv_counts, dtype, stream, internal_attr, deps);
+    return allgatherv_base_impl(send_buf,
+                                send_count,
+                                (void*)(recv_bufs.data()),
+                                recv_counts,
+                                dtype,
+                                stream,
+                                internal_attr,
+                                deps);
 }
 
 /* allreduce */
-single_device_communicator::coll_request_t single_device_communicator::allreduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    coll_request_t req;
-    // try to apply USM pointers convertation
-    if (!allreduce_usm_visitor_base_t::visit(
-            req, dtype, send_buf, recv_buf, count, reduction, stream, attr, deps)) {
-        // no predefined convertation from USM, assumed CUSTOM_TYPE
-        req = coll_request_t(std::unique_ptr<
-                             ccl::event_impl>(new ccl::host_event_impl(ccl_allreduce_impl(
-            send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), stream.get()))));
+ccl::event single_device_communicator::allreduce_impl(const void* send_buf,
+                                                      void* recv_buf,
+                                                      size_t count,
+                                                      ccl::datatype dtype,
+                                                      ccl::reduction reduction,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::allreduce_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
+    using namespace ::native::detail;
+
+    std::vector<void*> bufs = { (void*)send_buf, recv_buf };
+    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
+
+    const ccl_stream* stream_handle = nullptr;
+
+    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) {
     }
-    return req;
+    else if (mode == usm_support_mode::need_conversion)
+#ifdef CCL_ENABLE_SYCL
+        stream_handle = stream.get();
+#else
+        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                             " - USM convertation is not supported for such configuration");
+#endif
+
+    return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_allreduce_impl(
+        send_buf, recv_buf, count, dtype, reduction, attr, comm_impl.get(), stream_handle))));
 }
 
 /* alltoall */
-single_device_communicator::coll_request_t single_device_communicator::alltoall_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    coll_request_t req;
-    if (!alltoall_usm_visitor_base_t::visit(
-            req, dtype, send_buf, recv_buf, count, stream, attr, deps)) {
-        req = coll_request_t(
-            std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_alltoall_impl(
-                send_buf, recv_buf, count, dtype, attr, comm_impl.get(), stream.get()))));
+ccl::event single_device_communicator::alltoall_impl(const void* send_buf, // raw-buffer alltoall: forwards to ccl_alltoall_impl
+                                                     void* recv_buf,
+                                                     size_t count,
+                                                     ccl::datatype dtype,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::alltoall_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
+    using namespace ::native::detail;
+
+    std::vector<void*> bufs = { (void*)send_buf, recv_buf }; // C-style cast drops const on send_buf
+    auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); // one USM mode for the whole buffer set
+
+    const ccl_stream* stream_handle = nullptr; // stays null unless mode is need_conversion
+
+    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { // empty branch: stream_handle stays nullptr
     }
-    return req;
+    else if (mode == usm_support_mode::need_conversion) // unbraced: body is the one statement the #ifdef selects
+#ifdef CCL_ENABLE_SYCL
+        stream_handle = stream.get(); // SYCL build: forward the caller's stream
+#else
+        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                             " - USM convertation is not supported for such configuration");
+#endif // any other mode falls through with a null stream
+
+    return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_alltoall_impl( // NOTE: deps is never read in this body
+        send_buf, recv_buf, count, dtype, attr, comm_impl.get(), stream_handle))));
 }
 
-single_device_communicator::coll_request_t single_device_communicator::alltoall_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<void*>& recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event single_device_communicator::alltoall_impl(const ccl::vector_class<void*>& send_buf, // vector-of-buffers overload: not implemented
+                                                     const ccl::vector_class<void*>& recv_buf,
+                                                     size_t count,
+                                                     ccl::datatype dtype,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::alltoall_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) { // no argument is used: the body only throws
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* alltoallv */
-single_device_communicator::coll_request_t single_device_communicator::alltoallv_impl(
-    const void* send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    void* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    coll_request_t req;
-    if (!alltoallv_usm_visitor_base_t::visit(
-            req, dtype, send_buf, send_counts, recv_buf, recv_counts, stream, attr, deps)) {
-        req = coll_request_t(std::unique_ptr<ccl::event_impl>(
-            new ccl::host_event_impl(ccl_alltoallv_impl(send_buf,
-                                                          send_counts.data(),
-                                                          recv_buf,
-                                                          recv_counts.data(),
-                                                          dtype,
-                                                          attr,
-                                                          comm_impl.get(),
-                                                          stream.get()))));
+ccl::event single_device_communicator::alltoallv_impl(const void* send_buf, // raw-buffer alltoallv: forwards to ccl_alltoallv_impl
+                                                      const ccl::vector_class<size_t>& send_counts,
+                                                      void* recv_buf,
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      ccl::datatype dtype,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::alltoallv_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
+    using namespace ::native::detail;
+
+    std::vector<void*> bufs = { (void*)send_buf, recv_buf }; // C-style cast drops const on send_buf
+    auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); // one USM mode for the whole buffer set
+
+    const ccl_stream* stream_handle = nullptr; // stays null unless mode is need_conversion
+
+    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { // empty branch: stream_handle stays nullptr
     }
-    return req;
+    else if (mode == usm_support_mode::need_conversion) // unbraced: body is the one statement the #ifdef selects
+#ifdef CCL_ENABLE_SYCL
+        stream_handle = stream.get(); // SYCL build: forward the caller's stream
+#else
+        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                             " - USM convertation is not supported for such configuration");
+#endif // any other mode falls through with a null stream
+
+    return ccl::event(std::unique_ptr<ccl::event_impl>( // NOTE: deps is never read in this body
+        new ccl::host_event_impl(ccl_alltoallv_impl(send_buf,
+                                                    send_counts.data(), // count vectors are passed as raw pointers
+                                                    recv_buf,
+                                                    recv_counts.data(),
+                                                    dtype,
+                                                    attr,
+                                                    comm_impl.get(),
+                                                    stream_handle))));
 }
-single_device_communicator::coll_request_t single_device_communicator::alltoallv_impl(
-    const ccl::vector_class<void*>& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    ccl::vector_class<void*> recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    ccl::datatype dtype,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& dep) {
+ccl::event single_device_communicator::alltoallv_impl(const ccl::vector_class<void*>& send_buf, // vector-of-buffers overload: not implemented
+                                                      const ccl::vector_class<size_t>& send_counts,
+                                                      ccl::vector_class<void*> recv_buf, // taken by value, unlike send_buf
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      ccl::datatype dtype,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::alltoallv_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& dep) { // no argument is used: the body only throws
     throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
     return {};
 }
 
 /* bcast */
-single_device_communicator::coll_request_t single_device_communicator::broadcast_impl(
-    void* buf,
-    size_t count,
-    ccl::datatype dtype,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    coll_request_t req;
-    if (!broadcast_usm_visitor_base_t::visit(req, dtype, buf, count, root, stream, attr, deps)) {
-        req = coll_request_t(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(
-            ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), stream.get()))));
+ccl::event single_device_communicator::broadcast_impl(void* buf, // broadcast: forwards to ccl_broadcast_impl
+                                                      size_t count,
+                                                      ccl::datatype dtype,
+                                                      int root,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::broadcast_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
+    using namespace ::native::detail;
+
+    std::vector<void*> bufs = { buf }; // broadcast has a single in/out buffer to classify
+    auto mode = check_assoc_device_memory(bufs, get_device(), get_context());
+
+    const ccl_stream* stream_handle = nullptr; // stays null unless mode is need_conversion
+
+    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { // empty branch: stream_handle stays nullptr
     }
-    return req;
+    else if (mode == usm_support_mode::need_conversion) // unbraced: body is the one statement the #ifdef selects
+#ifdef CCL_ENABLE_SYCL
+        stream_handle = stream.get(); // SYCL build: forward the caller's stream
+#else
+        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                             " - USM convertation is not supported for such configuration");
+#endif // any other mode falls through with a null stream
+
+    return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl( // NOTE: deps is never read in this body
+        ccl_broadcast_impl(buf, count, dtype, root, attr, comm_impl.get(), stream_handle))));
 }
 
 /* reduce */
-single_device_communicator::coll_request_t single_device_communicator::reduce_impl(
-    const void* send_buf,
-    void* recv_buf,
-    size_t count,
-    ccl::datatype dtype,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    coll_request_t req;
-    if (!reduce_usm_visitor_base_t::visit(
-            req, dtype, send_buf, recv_buf, count, reduction, root, stream, attr, deps)) {
-        req = coll_request_t(std::unique_ptr<ccl::event_impl>(
-            new ccl::host_event_impl(ccl_reduce_impl(send_buf,
-                                                       recv_buf,
-                                                       count,
-                                                       dtype,
-                                                       reduction,
-                                                       root,
-                                                       attr,
-                                                       comm_impl.get(),
-                                                       stream.get()))));
+ccl::event single_device_communicator::reduce_impl(const void* send_buf, // reduce: forwards to ccl_reduce_impl
+                                                   void* recv_buf,
+                                                   size_t count,
+                                                   ccl::datatype dtype,
+                                                   ccl::reduction reduction,
+                                                   int root,
+                                                   const ccl::stream::impl_value_t& stream,
+                                                   const ccl::reduce_attr& attr,
+                                                   const ccl::vector_class<ccl::event>& deps) {
+    using namespace ::native::detail;
+
+    std::vector<void*> bufs = { (void*)send_buf, recv_buf }; // C-style cast drops const on send_buf
+    auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); // one USM mode for the whole buffer set
+
+    const ccl_stream* stream_handle = nullptr; // stays null unless mode is need_conversion
+
+    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { // empty branch: stream_handle stays nullptr
     }
-    return req;
+    else if (mode == usm_support_mode::need_conversion) // unbraced: body is the one statement the #ifdef selects
+#ifdef CCL_ENABLE_SYCL
+        stream_handle = stream.get(); // SYCL build: forward the caller's stream
+#else
+        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                             " - USM convertation is not supported for such configuration");
+#endif // any other mode falls through with a null stream
+
+    return ccl::event(std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(ccl_reduce_impl( // NOTE: deps is never read in this body
+        send_buf, recv_buf, count, dtype, reduction, root, attr, comm_impl.get(), stream_handle))));
 }
 
 /* reduce_scatter */
-single_device_communicator::coll_request_t single_device_communicator::reduce_scatter_impl(
+ccl::event single_device_communicator::reduce_scatter_impl( // reduce_scatter: forwards to ccl_reduce_scatter_impl
     const void* send_buf,
     void* recv_buf,
     size_t recv_count,
@@ -287,25 +345,30 @@ single_device_communicator::coll_request_t single_device_communicator::reduce_sc
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_scatter_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
+    using namespace ::native::detail;
 
-    coll_request_t req;
-    if (!reduce_scatter_usm_visitor_base_t::visit(
-            req, dtype, send_buf, recv_buf, recv_count, reduction, stream, attr, deps)) {
-        req = coll_request_t(std::unique_ptr<ccl::event_impl>(
-            new ccl::host_event_impl(ccl_reduce_scatter_impl(send_buf,
-                                                       recv_buf,
-                                                       recv_count,
-                                                       dtype,
-                                                       reduction,
-                                                       attr,
-                                                       comm_impl.get(),
-                                                       stream.get()))));
+    std::vector<void*> bufs = { (void*)send_buf, recv_buf }; // C-style cast drops const on send_buf
+    auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); // one USM mode for the whole buffer set
+
+    const ccl_stream* stream_handle = nullptr; // stays null unless mode is need_conversion
+
+    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { // empty branch: stream_handle stays nullptr
     }
-    return req;
+    else if (mode == usm_support_mode::need_conversion) // unbraced: body is the one statement the #ifdef selects
+#ifdef CCL_ENABLE_SYCL
+        stream_handle = stream.get(); // SYCL build: forward the caller's stream
+#else
+        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                             " - USM convertation is not supported for such configuration");
+#endif // any other mode falls through with a null stream
+
+    return ccl::event(std::unique_ptr< // NOTE: deps is never read in this body
+                      ccl::event_impl>(new ccl::host_event_impl(ccl_reduce_scatter_impl(
+        send_buf, recv_buf, recv_count, dtype, reduction, attr, comm_impl.get(), stream_handle))));
 }
 
 /* sparse_allreduce */
-single_device_communicator::coll_request_t single_device_communicator::sparse_allreduce_impl(
+ccl::event single_device_communicator::sparse_allreduce_impl( // sparse allreduce: forwards to ccl_sparse_allreduce_impl
     const void* send_ind_buf,
     size_t send_ind_count,
     const void* send_val_buf,
@@ -320,163 +383,45 @@ single_device_communicator::coll_request_t single_device_communicator::sparse_al
     const ccl::stream::impl_value_t& stream,
     const ccl::sparse_allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    coll_request_t req;
-    if (!sparse_allreduce_usm_visitor_base_t::visit(req,
-                                                    index_dtype,
-                                                    value_dtype,
-                                                    send_ind_buf,
-                                                    send_ind_count,
-                                                    send_val_buf,
-                                                    send_val_count,
-                                                    recv_ind_buf,
-                                                    recv_ind_count,
-                                                    recv_val_buf,
-                                                    recv_val_count,
-                                                    reduction,
-                                                    stream,
-                                                    attr,
-                                                    deps)) {
-        req = coll_request_t(std::unique_ptr<ccl::event_impl>(
-            new ccl::host_event_impl(ccl_sparse_allreduce_impl(send_ind_buf,
-                                                                 send_ind_count,
-                                                                 send_val_buf,
-                                                                 send_val_count,
-                                                                 recv_ind_buf,
-                                                                 recv_ind_count,
-                                                                 recv_val_buf,
-                                                                 recv_val_count,
-                                                                 index_dtype,
-                                                                 value_dtype,
-                                                                 reduction,
-                                                                 attr,
-                                                                 comm_impl.get(),
-                                                                 stream.get()))));
-    }
-    return req;
-}
+    using namespace ::native::detail;
+
+    std::vector<void*> bufs = { // all four index/value buffers are classified together
+        (void*)send_ind_buf, (void*)send_val_buf, recv_ind_buf, recv_val_buf // C-style casts drop const on the send buffers
+    };
+    auto mode = check_assoc_device_memory(bufs, get_device(), get_context()); // one USM mode for the whole buffer set
 
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(single_device_communicator, char);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(single_device_communicator, int);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(single_device_communicator, int64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(single_device_communicator, uint64_t);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(single_device_communicator, float);
-DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(single_device_communicator, double);
+    const ccl_stream* stream_handle = nullptr; // stays null unless mode is need_conversion
 
+    if ((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared)) { // empty branch: stream_handle stays nullptr
+    }
+    else if (mode == usm_support_mode::need_conversion) // unbraced: body is the one statement the #ifdef selects
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(single_device_communicator,
-                                                cl::sycl::buffer<char COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(single_device_communicator,
-                                                cl::sycl::buffer<int COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(single_device_communicator,
-                                                cl::sycl::buffer<int64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(single_device_communicator,
-                                                cl::sycl::buffer<uint64_t COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(single_device_communicator,
-                                                cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(single_device_communicator,
-                                                cl::sycl::buffer<double COMMA 1>);
-#endif //CCL_ENABLE_SYCL
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              char,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              char,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              char,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              char,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              char,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              char,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              char,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator, int, int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              int64_t,
-                                                              uint64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              uint64_t,
-                                                              char);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              uint64_t,
-                                                              int);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              uint64_t,
-                                                              ccl::bf16);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              uint64_t,
-                                                              float);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              uint64_t,
-                                                              double);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              uint64_t,
-                                                              int64_t);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(single_device_communicator,
-                                                              uint64_t,
-                                                              uint64_t);
+        stream_handle = stream.get(); // SYCL build: forward the caller's stream
+#else
+        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                             " - USM convertation is not supported for such configuration");
+#endif // any other mode falls through with a null stream
+
+    return ccl::event(std::unique_ptr<ccl::event_impl>( // NOTE: deps is never read in this body
+        new ccl::host_event_impl(ccl_sparse_allreduce_impl(send_ind_buf,
+                                                           send_ind_count,
+                                                           send_val_buf,
+                                                           send_val_count,
+                                                           recv_ind_buf,
+                                                           recv_ind_count,
+                                                           recv_val_buf,
+                                                           recv_val_count,
+                                                           index_dtype,
+                                                           value_dtype,
+                                                           reduction,
+                                                           attr,
+                                                           comm_impl.get(),
+                                                           stream_handle))));
+}
 
+COMM_INTERFACE_COLL_INSTANTIATION(single_device_communicator); // NOTE(review): macro body not in view; presumably explicit instantiations - confirm
 #ifdef CCL_ENABLE_SYCL
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    single_device_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    single_device_communicator,
-    cl::sycl::buffer<int COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    single_device_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<float COMMA 1>);
-DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION(
-    single_device_communicator,
-    cl::sycl::buffer<int64_t COMMA 1>,
-    cl::sycl::buffer<ccl::bf16 COMMA 1>);
-#endif //CCL_ENABLE_SYCL
+SYCL_COMM_INTERFACE_COLL_INSTANTIATION(single_device_communicator); // expanded only when CCL_ENABLE_SYCL is defined
+#endif /* CCL_ENABLE_SYCL */
+
 #endif //#if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
diff --git a/src/common/comm/single_device_communicator/single_device_communicator.hpp b/src/common/comm/single_device_communicator/single_device_communicator.hpp
index 5c75007ba..535ad692e 100644
--- a/src/common/comm/single_device_communicator/single_device_communicator.hpp
+++ b/src/common/comm/single_device_communicator/single_device_communicator.hpp
@@ -26,34 +26,13 @@ struct device_group_context;
 
 class single_device_communicator
         : public typed_single_device_base_communicator<single_device_communicator,
-                                                       ccl::gpu_communicator_traits>,
-          /*USM-converter visitors*/
-          public allgather_usm_visitor<single_device_communicator>,
-          public allreduce_usm_visitor<single_device_communicator>,
-          public alltoall_usm_visitor<single_device_communicator>,
-          public alltoallv_usm_visitor<single_device_communicator>,
-          public broadcast_usm_visitor<single_device_communicator>,
-          public reduce_usm_visitor<single_device_communicator>,
-          public reduce_scatter_usm_visitor<single_device_communicator>,
-          public sparse_allreduce_usm_visitor<single_device_communicator> {
+                                                       ccl::gpu_communicator_traits> { // the only base; it is templated on this class itself
 public:
     using base_t = typed_single_device_base_communicator<single_device_communicator,
                                                          ccl::gpu_communicator_traits>;
-    //TODO -S- Use event_impl for all communicator impl!
-    using coll_request_t = ccl::event;
-    using allgather_usm_visitor_base_t = allgather_usm_visitor<single_device_communicator>;
-    using allreduce_usm_visitor_base_t = allreduce_usm_visitor<single_device_communicator>;
-    using alltoall_usm_visitor_base_t = alltoall_usm_visitor<single_device_communicator>;
-    using alltoallv_usm_visitor_base_t = alltoallv_usm_visitor<single_device_communicator>;
-    using broadcast_usm_visitor_base_t = broadcast_usm_visitor<single_device_communicator>;
-    using reduce_usm_visitor_base_t = reduce_usm_visitor<single_device_communicator>;
-    using reduce_scatter_usm_visitor_base_t =
-        reduce_scatter_usm_visitor<single_device_communicator>;
-    using sparse_allreduce_usm_visitor_base_t =
-        sparse_allreduce_usm_visitor<single_device_communicator>;
 
     single_device_communicator(ccl::unified_device_type&& device,
-                               ccl::unified_device_context_type&& context,
+                               ccl::unified_context_type&& context, // rvalue reference, like device
                                size_t thread_idx,
                                size_t proces_idx,
                                const ccl::comm_split_attr& attr);
@@ -65,20 +44,21 @@ class single_device_communicator
 #ifdef MULTI_GPU_SUPPORT
     void visit(ccl::gpu_comm_attr& comm_attr) override;
 #endif
-    coll_request_t barrier(const ccl::stream::impl_value_t& op_stream,
-                           const ccl::barrier_attr& attr,
-                           const ccl::vector_class<ccl::event>& deps) override;
+    ccl::event barrier(const ccl::stream::impl_value_t& op_stream, // returns ccl::event directly
+                       const ccl::barrier_attr& attr,
+                       const ccl::vector_class<ccl::event>& deps) override;
 
-    DEVICE_COMM_IMPL_DECLARATION
-    DEVICE_COMM_IMPL_CLASS_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_DECLARATION
-    DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION
+    COMM_IMPL_DECLARATION // NOTE(review): these four macros presumably declare the *_impl collectives - confirm
+    COMM_IMPL_CLASS_DECLARATION
+    COMM_IMPL_SPARSE_DECLARATION
+    COMM_IMPL_SPARSE_CLASS_DECLARATION
 
     void set_ccl_comm(std::shared_ptr<ccl_comm> imp);
 
     //TODO use visit() to set `context`
-    void set_context(const ccl::unified_device_context_type::ccl_native_t& context);
+    void set_context(const ccl::unified_context_type::ccl_native_t& context); // overload taking the native context type
     void set_context(const ccl::context& context);
+
 private:
     std::shared_ptr<ccl_comm> comm_impl;
 };
diff --git a/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp b/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp
index 500a1d4a4..8431bd8dd 100644
--- a/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp
+++ b/src/common/comm/single_device_communicator/single_device_communicator_impl.hpp
@@ -28,7 +28,7 @@
 /* allgatherv */
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::allgatherv_base_impl(
+ccl::event single_device_communicator::allgatherv_base_impl(
     const buffer_type* send_buf,
     size_t send_count,
     buffer_type* recv_buf,
@@ -36,122 +36,68 @@ single_device_communicator::coll_request_t single_device_communicator::allgather
     const ccl::stream::impl_value_t& stream,
     const ccl_coll_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-
-    std::unique_ptr<ccl::chargeable_event> scoped_req;
-    using namespace ::native::details;
-
-    usm_support_mode send_buf_result, recv_buf_result;
-    std::string send_buf_error, recv_buf_error;
-    std::tie(send_buf_result, std::ignore, send_buf_error) =
-        ::native::details::check_assoc_device_memory(send_buf, get_device(), get_context());
-    std::tie(recv_buf_result, std::ignore, recv_buf_error) =
-        ::native::details::check_assoc_device_memory(recv_buf, get_device(), get_context());
-    if ((send_buf_result == usm_support_mode::direct or
-         send_buf_result == usm_support_mode::shared) and
-        (recv_buf_result == usm_support_mode::direct or
-         recv_buf_result == usm_support_mode::shared)) {
-        LOG_TRACE("comm: ", to_string(), " - use USM direct pointers for both buffers");
-        scoped_req = ccl::details::make_unique_scoped_event<
-            ccl::host_event_impl>(ccl_allgatherv_impl(
-            reinterpret_cast<const void*>(send_buf),
-            send_count,
-            reinterpret_cast<void*>(recv_buf),
-            recv_counts.data(),
-            ccl::native_type_info<buffer_type>::ccl_datatype_value,
-            attr,
-            comm_impl.get(),
-            nullptr /*TODO fix core part, because stream existance use force cast to sycl::buffer*/
-            /*, stream.get()*/));
-    }
-    else if (send_buf_result == usm_support_mode::need_conversion and
-             recv_buf_result == usm_support_mode::need_conversion) {
-        size_t recv_total_size = std::accumulate(recv_counts.begin(), recv_counts.end(), size_t{});
-        ccl_request* req = nullptr;
-#ifdef CCL_ENABLE_SYCL
-        auto scoped_req_sycl = ccl::details::make_unique_scoped_event<ccl::host_event_impl>(
-            nullptr,
-            /*send_buf*/
-            cl::sycl::buffer<buffer_type>{
-                send_buf, send_count, cl::sycl::property::buffer::use_host_ptr{} },
-            /*recv_buf*/
-            cl::sycl::buffer<buffer_type>{
-                recv_buf, recv_total_size, cl::sycl::property::buffer::use_host_ptr{} });
-
-        req = ccl_allgatherv_impl(
-            reinterpret_cast<const void*>(
-                &scoped_req_sycl->template get_arg_by_index<0>() /*send_buf*/),
-            send_count,
-            reinterpret_cast<void*>(&scoped_req_sycl->template get_arg_by_index<1>() /*recv_buf*/),
-            recv_counts.data(),
-            ccl::native_type_info<buffer_type>::ccl_datatype_value,
-            attr,
-            comm_impl.get(),
-            stream.get());
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                        " - USM convertation is not supported for such configuration");
-#endif
-        scoped_req_sycl->charge(req);
-        scoped_req = std::move(scoped_req_sycl);
-    }
-    else {
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) + "\nsend_buf check result:\n" +
-                             send_buf_error + "\nrecv_buf check result:\n" + recv_buf_error);
-    }
-
-    return std::unique_ptr<ccl::event_impl>(scoped_req.release());
+    return allgatherv_base_impl(send_buf,
+                                send_count,
+                                recv_buf,
+                                recv_counts,
+                                ccl::native_type_info<buffer_type>::dtype,
+                                stream,
+                                attr,
+                                deps);
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::allgatherv_impl(
-    const buffer_type* send_buf,
-    size_t send_count,
-    buffer_type* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-
+ccl::event single_device_communicator::allgatherv_impl(const buffer_type* send_buf,
+                                                       size_t send_count,
+                                                       buffer_type* recv_buf,
+                                                       const ccl::vector_class<size_t>& recv_counts,
+                                                       const ccl::stream::impl_value_t& stream,
+                                                       const ccl::allgatherv_attr& attr,
+                                                       const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
-    return allgatherv_base_impl(send_buf, send_count, recv_buf, recv_counts, stream, internal_attr, deps);
+    return allgatherv_base_impl(
+        send_buf, send_count, recv_buf, recv_counts, stream, internal_attr, deps);
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::allgatherv_impl(
-    const buffer_type* send_buf,
-    size_t send_count,
-    ccl::vector_class<buffer_type*>& recv_bufs,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-
+ccl::event single_device_communicator::allgatherv_impl(const buffer_type* send_buf,
+                                                       size_t send_count,
+                                                       ccl::vector_class<buffer_type*>& recv_bufs,
+                                                       const ccl::vector_class<size_t>& recv_counts,
+                                                       const ccl::stream::impl_value_t& stream,
+                                                       const ccl::allgatherv_attr& attr,
+                                                       const ccl::vector_class<ccl::event>& deps) {
     ccl_coll_attr internal_attr(attr);
     internal_attr.vector_buf = 1;
-    return allgatherv_base_impl(send_buf, send_count, (buffer_type*)(recv_bufs.data()), recv_counts, stream, internal_attr, deps);
+    return allgatherv_base_impl(send_buf,
+                                send_count,
+                                reinterpret_cast<buffer_type*>(recv_bufs.data()),
+                                recv_counts,
+                                stream,
+                                internal_attr,
+                                deps);
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::allgatherv_impl(
-    const buffer_type& send_buf,
-    size_t send_count,
-    buffer_type& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allgatherv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event single_device_communicator::allgatherv_impl(const buffer_type& send_buf,
+                                                       size_t send_count,
+                                                       buffer_type& recv_buf,
+                                                       const ccl::vector_class<size_t>& recv_counts,
+                                                       const ccl::stream::impl_value_t& stream,
+                                                       const ccl::allgatherv_attr& attr,
+                                                       const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_allgatherv_impl(reinterpret_cast<const void*>(&send_buf),
                                            send_count,
                                            reinterpret_cast<void*>(&recv_buf),
                                            recv_counts.data(),
-                                           ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                           ccl::native_type_info<buffer_type>::dtype,
                                            attr,
                                            comm_impl.get(),
                                            stream.get());
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::allgatherv_impl(
+ccl::event single_device_communicator::allgatherv_impl(
     const buffer_type& send_buf,
     size_t send_count,
     ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -165,107 +111,35 @@ single_device_communicator::coll_request_t single_device_communicator::allgather
 
 /* allreduce */
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::allreduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    using namespace native;
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Single device communicator for group_id: " + ::to_string(group_id) +
-            ", class_id: " + ::to_string(class_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    LOG_DEBUG("device idx: ", get_device_path(), ", rank: (", rank(), "/", size(), ")");
-
-    std::unique_ptr<ccl::chargeable_event> scoped_req;
-    using namespace ::native::details;
-
-    usm_support_mode send_buf_result, recv_buf_result;
-    std::string send_buf_error, recv_buf_error;
-    std::tie(send_buf_result, std::ignore, send_buf_error) =
-        ::native::details::check_assoc_device_memory(send_buf, get_device(), get_context());
-    std::tie(recv_buf_result, std::ignore, recv_buf_error) =
-        ::native::details::check_assoc_device_memory(recv_buf, get_device(), get_context());
-    if ((send_buf_result == usm_support_mode::direct or
-         send_buf_result == usm_support_mode::shared) and
-        (recv_buf_result == usm_support_mode::direct or
-         recv_buf_result == usm_support_mode::shared)) {
-        LOG_TRACE("comm: ", to_string(), " - use USM direct pointers for both buffers");
-        scoped_req = ccl::details::make_unique_scoped_event<
-            ccl::host_event_impl>(ccl_allreduce_impl(
-            reinterpret_cast<const void*>(send_buf),
-            reinterpret_cast<void*>(recv_buf),
-            count,
-            ccl::native_type_info<buffer_type>::ccl_datatype_value,
-            reduction,
-            attr,
-            comm_impl.get(),
-            nullptr /*TODO fix core part, because stream existance use force cast to sycl::buffer*/
-            /*, stream.get()*/));
-    }
-    else if (send_buf_result == usm_support_mode::need_conversion and
-             recv_buf_result == usm_support_mode::need_conversion) {
-        ccl_request* req = nullptr;
-        LOG_TRACE("comm: ", to_string(), " - use USM pointers convertation for both buffers");
-#ifdef CCL_ENABLE_SYCL
-        auto scoped_req_sycl = ccl::details::make_unique_scoped_event<ccl::host_event_impl>(
-            nullptr,
-            /*send_buf*/
-            cl::sycl::buffer<buffer_type>{
-                send_buf, count, cl::sycl::property::buffer::use_host_ptr{} },
-            /*recv_buf*/
-            cl::sycl::buffer<buffer_type>{
-                recv_buf, count, cl::sycl::property::buffer::use_host_ptr{} });
-
-        req = ccl_allreduce_impl(
-            reinterpret_cast<const void*>(
-                &scoped_req_sycl->template get_arg_by_index<0>() /*send_buf*/),
-            reinterpret_cast<void*>(&scoped_req_sycl->template get_arg_by_index<1>() /*recv_buf*/),
-            count,
-            ccl::native_type_info<buffer_type>::ccl_datatype_value,
-            reduction,
-            attr,
-            comm_impl.get(),
-            stream.get());
-#else
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                        " - USM convertation is not supported for such configuration");
-#endif
-        scoped_req_sycl->charge(req);
-        scoped_req = std::move(scoped_req_sycl);
-    }
-    else {
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) + "\nsend_buf check result:\n" +
-                             send_buf_error + "\nrecv_buf check result:\n" + recv_buf_error);
-    }
-
-    return std::unique_ptr<ccl::event_impl>(scoped_req.release());
-    ;
+ccl::event single_device_communicator::allreduce_impl(const buffer_type* send_buf,
+                                                      buffer_type* recv_buf,
+                                                      size_t count,
+                                                      ccl::reduction reduction,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::allreduce_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
+    return allreduce_impl(send_buf,
+                          recv_buf,
+                          count,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          reduction,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::allreduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::allreduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event single_device_communicator::allreduce_impl(const buffer_type& send_buf,
+                                                      buffer_type& recv_buf,
+                                                      size_t count,
+                                                      ccl::reduction reduction,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::allreduce_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_allreduce_impl(reinterpret_cast<const void*>(&send_buf),
                                           reinterpret_cast<void*>(&recv_buf),
                                           count,
-                                          ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                          ccl::native_type_info<buffer_type>::dtype,
                                           reduction,
                                           attr,
                                           comm_impl.get(),
@@ -275,102 +149,18 @@ single_device_communicator::coll_request_t single_device_communicator::allreduce
 
 /* alltoall */
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::alltoall_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (unlikely(is_ready() == false)) {
-        throw ccl::exception(std::string(
-            "Single device communicator for group_id: " + ::to_string(group_id) +
-            ", class_id: " + ::to_string(class_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    LOG_DEBUG("device idx: ", get_device_path(), ", rank: (", rank(), "/", size(), ")");
-
-    std::unique_ptr<ccl::chargeable_event> scoped_req;
-    using namespace ::native::details;
-
-    // test USM arguments on validity
-    using alltoall_usm_check_result = multiple_assoc_result<2>;
-    alltoall_usm_check_result usm_assoc_results =
-        check_multiple_assoc_device_memory(get_device(), get_context(), send_buf, recv_buf);
-    usm_support_mode test_value = std::get<assoc_result_index::SUPPORT_MODE>(usm_assoc_results[0]);
-    bool ret = std::all_of(usm_assoc_results.begin(),
-                           usm_assoc_results.end(),
-                           [test_value](const typename alltoall_usm_check_result::value_type& v) {
-                               return test_value == std::get<assoc_result_index::SUPPORT_MODE>(v);
-                           });
-    if (!ret) {
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - invalid USM arguments:\n" +
-                             ::native::details::to_string(usm_assoc_results) +
-                             "\nMixed types are not supported as well");
-    }
-    switch (test_value) {
-        case usm_support_mode::shared: /*the same as `direct` at now*/
-            LOG_TRACE("comm: ", to_string(), " - use USM shared pointers for buffers");
-        case usm_support_mode::direct: {
-            LOG_TRACE("comm: ", to_string(), " - use USM direct pointers for buffers");
-            scoped_req = ccl::details::make_unique_scoped_event<
-                ccl::host_event_impl>(ccl_alltoall_impl(
-                reinterpret_cast<const void*>(send_buf),
-                reinterpret_cast<void*>(recv_buf),
-                count,
-                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                attr,
-                comm_impl.get(),
-                nullptr /*TODO fix core part, because stream existance use force cast to sycl::buffer*/
-                /*, stream.get()*/));
-            break;
-        }
-        case need_conversion: {
-            ccl_request* req = nullptr;
-#ifdef CCL_ENABLE_SYCL
-            LOG_TRACE(
-                "comm: ", to_string(), " - use USM pointers convertation to SYCL for both buffers");
-            auto scoped_req_sycl = ccl::details::make_unique_scoped_event<ccl::host_event_impl>(
-                nullptr,
-                /*send_buf*/
-                cl::sycl::buffer<buffer_type>{
-                    send_buf, count * size(), cl::sycl::property::buffer::use_host_ptr{} },
-                /*recv_buf*/
-                cl::sycl::buffer<buffer_type>{
-                    recv_buf, count * size(), cl::sycl::property::buffer::use_host_ptr{} });
-
-            req = ccl_alltoall_impl(
-                reinterpret_cast<const void*>(
-                    &scoped_req_sycl->template get_arg_by_index<0>() /*send_buf*/),
-                reinterpret_cast<void*>(
-                    &scoped_req_sycl->template get_arg_by_index<1>() /*recv_buf*/),
-                count,
-                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                attr,
-                comm_impl.get(),
-                stream.get());
-#else
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                            " - USM convertation is not supported for such configuration");
-#endif
-            scoped_req_sycl->charge(req);
-            scoped_req = std::move(scoped_req_sycl);
-            break;
-        }
-        default:
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                                 " - USM category is not supported for such configuration:\n" +
-                                 ::native::details::to_string(usm_assoc_results[0]));
-    }
-    return std::unique_ptr<ccl::event_impl>(scoped_req.release());
-    ;
+ccl::event single_device_communicator::alltoall_impl(const buffer_type* send_buf,
+                                                     buffer_type* recv_buf,
+                                                     size_t count,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::alltoall_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
+    return alltoall_impl(
+        send_buf, recv_buf, count, ccl::native_type_info<buffer_type>::dtype, stream, attr, deps);
 }
+
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::alltoall_impl(
+ccl::event single_device_communicator::alltoall_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<buffer_type*>& recv_buf,
     size_t count,
@@ -382,24 +172,24 @@ single_device_communicator::coll_request_t single_device_communicator::alltoall_
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::alltoall_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoall_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event single_device_communicator::alltoall_impl(const buffer_type& send_buf,
+                                                     buffer_type& recv_buf,
+                                                     size_t count,
+                                                     const ccl::stream::impl_value_t& stream,
+                                                     const ccl::alltoall_attr& attr,
+                                                     const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_alltoall_impl(reinterpret_cast<const void*>(&send_buf),
                                          reinterpret_cast<void*>(&recv_buf),
                                          count,
-                                         ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                         ccl::native_type_info<buffer_type>::dtype,
                                          attr,
                                          comm_impl.get(),
                                          stream.get());
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
+
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::alltoall_impl(
+ccl::event single_device_communicator::alltoall_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
     size_t count,
@@ -412,114 +202,25 @@ single_device_communicator::coll_request_t single_device_communicator::alltoall_
 
 /* alltoallv */
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::alltoallv_impl(
-    const buffer_type* send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    buffer_type* recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Single device communicator for group_id: " + ::to_string(group_id) +
-            ", class_id: " + ::to_string(class_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    LOG_DEBUG("device idx: ", get_device_path(), ", rank: (", rank(), "/", size(), ")");
-
-    std::unique_ptr<ccl::chargeable_event> scoped_req;
-    using namespace ::native::details;
-
-    // test USM arguments on validity
-    using alltoallv_usm_check_result = multiple_assoc_result<2>;
-    alltoallv_usm_check_result usm_assoc_results =
-        check_multiple_assoc_device_memory(get_device(), get_context(), send_buf, recv_buf);
-    usm_support_mode test_value = std::get<assoc_result_index::SUPPORT_MODE>(usm_assoc_results[0]);
-    bool ret = std::all_of(usm_assoc_results.begin(),
-                           usm_assoc_results.end(),
-                           [test_value](const typename alltoallv_usm_check_result::value_type& v) {
-                               return test_value == std::get<assoc_result_index::SUPPORT_MODE>(v);
-                           });
-    if (!ret) {
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - invalid USM arguments:\n" +
-                             ::native::details::to_string(usm_assoc_results));
-    }
-    switch (test_value) {
-        case usm_support_mode::shared: /*the same as `direct` at now*/
-            LOG_TRACE("comm: ", to_string(), " - use USM shared pointers for buffers");
-        case usm_support_mode::direct: {
-            LOG_TRACE("comm: ", to_string(), " - use USM direct pointers for buffers");
-            scoped_req = ccl::details::make_unique_scoped_event<
-                ccl::host_event_impl>(ccl_alltoallv_impl(
-                reinterpret_cast<const void*>(send_buf),
-                send_counts.data(),
-                reinterpret_cast<void*>(recv_buf),
-                recv_counts.data(),
-                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                attr,
-                comm_impl.get(),
-                nullptr /*TODO fix core part, because stream existance use force cast to sycl::buffer*/
-                /*, stream.get()*/));
-            break;
-        }
-        case need_conversion: {
-            ccl_request* req = nullptr;
-#ifdef CCL_ENABLE_SYCL
-            size_t send_total_size =
-                std::accumulate(send_counts.begin(), send_counts.end(), size_t{});
-            size_t recv_total_size =
-                std::accumulate(recv_counts.begin(), recv_counts.end(), size_t{});
-            LOG_TRACE(
-                "comm: ",
-                to_string(),
-                " - use USM pointers convertation to SYCL for both buffers, send_total_size: ",
-                send_total_size,
-                ", recv_total_size: ",
-                recv_total_size);
-            auto scoped_req_sycl = ccl::details::make_unique_scoped_event<ccl::host_event_impl>(
-                nullptr,
-                /*send_buf*/
-                cl::sycl::buffer<buffer_type>{
-                    send_buf, send_total_size, cl::sycl::property::buffer::use_host_ptr{} },
-                /*recv_buf*/
-                cl::sycl::buffer<buffer_type>{
-                    recv_buf, recv_total_size, cl::sycl::property::buffer::use_host_ptr{} });
-
-            req = ccl_alltoallv_impl(
-                reinterpret_cast<const void*>(
-                    &scoped_req_sycl->template get_arg_by_index<0>() /*send_buf*/),
-                send_counts.data(),
-                reinterpret_cast<void*>(
-                    &scoped_req_sycl->template get_arg_by_index<1>() /*recv_buf*/),
-                recv_counts.data(),
-                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                attr,
-                comm_impl.get(),
-                stream.get());
-#else
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                            " - USM convertation is not supported for such configuration");
-#endif
-            scoped_req_sycl->charge(req);
-            scoped_req = std::move(scoped_req_sycl);
-            break;
-        }
-        default:
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                                 " - USM category is not supported for such configuration:\n" +
-                                 ::native::details::to_string(usm_assoc_results[0]));
-    }
-    return std::unique_ptr<ccl::event_impl>(scoped_req.release());
-    ;
+ccl::event single_device_communicator::alltoallv_impl(const buffer_type* send_buf,
+                                                      const ccl::vector_class<size_t>& send_counts,
+                                                      buffer_type* recv_buf,
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::alltoallv_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
+    return alltoallv_impl(send_buf,
+                          send_counts,
+                          recv_buf,
+                          recv_counts,
+                          ccl::native_type_info<buffer_type>::dtype,
+                          stream,
+                          attr,
+                          deps);
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::alltoallv_impl(
+ccl::event single_device_communicator::alltoallv_impl(
     const ccl::vector_class<buffer_type*>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<buffer_type*>& recv_buf,
@@ -532,26 +233,26 @@ single_device_communicator::coll_request_t single_device_communicator::alltoallv
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::alltoallv_impl(
-    const buffer_type& send_buf,
-    const ccl::vector_class<size_t>& send_counts,
-    buffer_type& recv_buf,
-    const ccl::vector_class<size_t>& recv_counts,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::alltoallv_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event single_device_communicator::alltoallv_impl(const buffer_type& send_buf,
+                                                      const ccl::vector_class<size_t>& send_counts,
+                                                      buffer_type& recv_buf,
+                                                      const ccl::vector_class<size_t>& recv_counts,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::alltoallv_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_alltoallv_impl(reinterpret_cast<const void*>(&send_buf),
                                           send_counts.data(),
                                           reinterpret_cast<void*>(&recv_buf),
                                           recv_counts.data(),
-                                          ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                          ccl::native_type_info<buffer_type>::dtype,
                                           attr,
                                           comm_impl.get(),
                                           stream.get());
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
+
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::alltoallv_impl(
+ccl::event single_device_communicator::alltoallv_impl(
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf,
     const ccl::vector_class<size_t>& send_counts,
     const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& recv_buf,
@@ -565,97 +266,26 @@ single_device_communicator::coll_request_t single_device_communicator::alltoallv
 
 /* bcast */
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::broadcast_impl(
-    buffer_type* buf,
-    size_t count,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Single device communicator for group_id: " + ::to_string(group_id) +
-            ", class_id: " + ::to_string(class_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    LOG_DEBUG("device idx: ", get_device_path(), ", rank: (", rank(), "/", size(), ")");
-
-    std::unique_ptr<ccl::chargeable_event> scoped_req;
-    using namespace ::native::details;
-
-    // test USM arguments on validity
-    using broadcast_usm_check_result = multiple_assoc_result<1>;
-    broadcast_usm_check_result usm_assoc_results =
-        check_multiple_assoc_device_memory(get_device(), get_context(), buf);
-    usm_support_mode test_value = std::get<assoc_result_index::SUPPORT_MODE>(usm_assoc_results[0]);
-    switch (test_value) {
-        case usm_support_mode::shared: /*the same as `direct` at now*/
-            LOG_TRACE("comm: ", to_string(), " - use USM shared pointers for buffers");
-        case usm_support_mode::direct: {
-            LOG_TRACE("comm: ", to_string(), " - use USM direct pointers for buffers");
-            scoped_req = ccl::details::make_unique_scoped_event<
-                ccl::host_event_impl>(ccl_broadcast_impl(
-                reinterpret_cast<void*>(buf),
-                count,
-                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                root,
-                attr,
-                comm_impl.get(),
-                nullptr /*TODO fix core part, because stream existance use force cast to sycl::buffer*/
-                /*, stream.get()*/));
-            break;
-        }
-        case need_conversion: {
-            ccl_request* req = nullptr;
-#ifdef CCL_ENABLE_SYCL
-            LOG_TRACE(
-                "comm: ", to_string(), " - use USM pointers convertation to SYCL for both buffers");
-            auto scoped_req_sycl = ccl::details::make_unique_scoped_event<ccl::host_event_impl>(
-                nullptr,
-                /*buf*/
-                cl::sycl::buffer<buffer_type>{
-                    buf, count, cl::sycl::property::buffer::use_host_ptr{} });
-
-            req = ccl_broadcast_impl(
-                reinterpret_cast<void*>(&scoped_req_sycl->template get_arg_by_index<0>() /*buf*/),
-                count,
-                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                root,
-                attr,
-                comm_impl.get(),
-                stream.get());
-#else
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                            " - USM convertation is not supported for such configuration");
-#endif
-            scoped_req_sycl->charge(req);
-            scoped_req = std::move(scoped_req_sycl);
-            break;
-        }
-        default:
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                                 " - USM category is not supported for such configuration:\n" +
-                                 ::native::details::to_string(usm_assoc_results[0]));
-    }
-    return std::unique_ptr<ccl::event_impl>(scoped_req.release());
-    ;
+ccl::event single_device_communicator::broadcast_impl(buffer_type* buf,
+                                                      size_t count,
+                                                      int root,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::broadcast_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
+    return broadcast_impl(
+        buf, count, ccl::native_type_info<buffer_type>::dtype, root, stream, attr, deps);
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::broadcast_impl(
-    buffer_type& buf,
-    size_t count,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::broadcast_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event single_device_communicator::broadcast_impl(buffer_type& buf,
+                                                      size_t count,
+                                                      int root,
+                                                      const ccl::stream::impl_value_t& stream,
+                                                      const ccl::broadcast_attr& attr,
+                                                      const ccl::vector_class<ccl::event>& deps) {
     ccl_request* req = ccl_broadcast_impl(reinterpret_cast<void*>(&buf),
                                           count,
-                                          ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                          ccl::native_type_info<buffer_type>::dtype,
                                           root,
                                           attr,
                                           comm_impl.get(),
@@ -665,122 +295,40 @@ single_device_communicator::coll_request_t single_device_communicator::broadcast
 
 /* reduce */
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::reduce_impl(
-    const buffer_type* send_buf,
-    buffer_type* recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Single device communicator for group_id: " + ::to_string(group_id) +
-            ", class_id: " + ::to_string(class_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    LOG_DEBUG("device idx: ", get_device_path(), ", rank: (", rank(), "/", size(), ")");
-
-    std::unique_ptr<ccl::chargeable_event> scoped_req;
-    using namespace ::native::details;
-
-    // test USM arguments on validity
-    using reduce_usm_check_result = multiple_assoc_result<2>;
-    reduce_usm_check_result usm_assoc_results =
-        check_multiple_assoc_device_memory(get_device(), get_context(), send_buf, recv_buf);
-    usm_support_mode test_value = std::get<assoc_result_index::SUPPORT_MODE>(usm_assoc_results[0]);
-    bool ret = std::all_of(usm_assoc_results.begin(),
-                           usm_assoc_results.end(),
-                           [test_value](const typename reduce_usm_check_result::value_type& v) {
-                               return test_value == std::get<assoc_result_index::SUPPORT_MODE>(v);
-                           });
-    if (!ret) {
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - invalid USM arguments:\n" +
-                             ::native::details::to_string(usm_assoc_results));
-    }
-    switch (test_value) {
-        case usm_support_mode::shared: /*the same as `direct` at now*/
-            LOG_TRACE("comm: ", to_string(), " - use USM shared pointers for buffers");
-        case usm_support_mode::direct: {
-            LOG_TRACE("comm: ", to_string(), " - use USM direct pointers for buffers");
-            scoped_req = ccl::details::make_unique_scoped_event<
-                ccl::host_event_impl>(ccl_reduce_impl(
-                reinterpret_cast<const void*>(send_buf),
-                reinterpret_cast<void*>(recv_buf),
-                count,
-                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                reduction,
-                root,
-                attr,
-                comm_impl.get(),
-                nullptr /*TODO fix core part, because stream existance use force cast to sycl::buffer*/
-                /*, stream.get()*/));
-            break;
-        }
-        case need_conversion: {
-            ccl_request* req = nullptr;
-#ifdef CCL_ENABLE_SYCL
-            LOG_TRACE(
-                "comm: ", to_string(), " - use USM pointers convertation to SYCL for both buffers");
-            auto scoped_req_sycl = ccl::details::make_unique_scoped_event<ccl::host_event_impl>(
-                nullptr,
-                /*send_buf*/
-                cl::sycl::buffer<buffer_type>{
-                    send_buf, count, cl::sycl::property::buffer::use_host_ptr{} },
-                /*recv_buf*/
-                cl::sycl::buffer<buffer_type>{
-                    recv_buf, count, cl::sycl::property::buffer::use_host_ptr{} });
-
-            req =
-                ccl_reduce_impl(reinterpret_cast<const void*>(
-                                    &scoped_req_sycl->template get_arg_by_index<0>() /*send_buf*/),
-                                reinterpret_cast<void*>(
-                                    &scoped_req_sycl->template get_arg_by_index<1>() /*recv_buf*/),
-                                count,
-                                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                                reduction,
-                                root,
-                                attr,
-                                comm_impl.get(),
-                                stream.get());
-#else
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                            " - USM convertation is not supported for such configuration");
-#endif
-            scoped_req_sycl->charge(req);
-            scoped_req = std::move(scoped_req_sycl);
-            break;
-        }
-        default:
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                                 " - USM category is not supported for such configuration:\n" +
-                                 ::native::details::to_string(usm_assoc_results[0]));
-    }
-    return std::unique_ptr<ccl::event_impl>(scoped_req.release());
-    ;
+ccl::event single_device_communicator::reduce_impl(const buffer_type* send_buf,
+                                                   buffer_type* recv_buf,
+                                                   size_t count,
+                                                   ccl::reduction reduction,
+                                                   int root,
+                                                   const ccl::stream::impl_value_t& stream,
+                                                   const ccl::reduce_attr& attr,
+                                                   const ccl::vector_class<ccl::event>& deps) {
+    return reduce_impl(send_buf,
+                       recv_buf,
+                       count,
+                       ccl::native_type_info<buffer_type>::dtype,
+                       reduction,
+                       root,
+                       stream,
+                       attr,
+                       deps);
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::reduce_impl(
-    const buffer_type& send_buf,
-    buffer_type& recv_buf,
-    size_t count,
-    ccl::reduction reduction,
-    size_t root,
-    const ccl::stream::impl_value_t& stream,
-    const ccl::reduce_attr& attr,
-    const ccl::vector_class<ccl::event>& deps) {
+ccl::event single_device_communicator::reduce_impl(const buffer_type& send_buf,
+                                                   buffer_type& recv_buf,
+                                                   size_t count,
+                                                   ccl::reduction reduction,
+                                                   int root,
+                                                   const ccl::stream::impl_value_t& stream,
+                                                   const ccl::reduce_attr& attr,
+                                                   const ccl::vector_class<ccl::event>& deps) {
     const ccl_stream* stream_ptr = stream.get();
 
     ccl_request* req = ccl_reduce_impl(reinterpret_cast<const void*>(&send_buf),
                                        reinterpret_cast<void*>(&recv_buf),
                                        count,
-                                       ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                       ccl::native_type_info<buffer_type>::dtype,
                                        reduction,
                                        root,
                                        attr,
@@ -791,7 +339,7 @@ single_device_communicator::coll_request_t single_device_communicator::reduce_im
 
 /* reduce_scatter */
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::reduce_scatter_impl(
+ccl::event single_device_communicator::reduce_scatter_impl(
     const buffer_type* send_buf,
     buffer_type* recv_buf,
     size_t recv_count,
@@ -799,97 +347,18 @@ single_device_communicator::coll_request_t single_device_communicator::reduce_sc
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_scatter_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Single device communicator for group_id: " + ::to_string(group_id) +
-            ", class_id: " + ::to_string(class_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    LOG_DEBUG("device idx: ", get_device_path(), ", rank: (", rank(), "/", size(), ")");
-
-    std::unique_ptr<ccl::chargeable_event> scoped_req;
-    using namespace ::native::details;
-
-    // test USM arguments on validity
-    using reduce_scatter_usm_check_result = multiple_assoc_result<2>;
-    reduce_scatter_usm_check_result usm_assoc_results =
-        check_multiple_assoc_device_memory(get_device(), get_context(), send_buf, recv_buf);
-    usm_support_mode test_value = std::get<assoc_result_index::SUPPORT_MODE>(usm_assoc_results[0]);
-    bool ret = std::all_of(usm_assoc_results.begin(),
-                           usm_assoc_results.end(),
-                           [test_value](const typename reduce_scatter_usm_check_result::value_type& v) {
-                               return test_value == std::get<assoc_result_index::SUPPORT_MODE>(v);
-                           });
-    if (!ret) {
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - invalid USM arguments:\n" +
-                             ::native::details::to_string(usm_assoc_results));
-    }
-    switch (test_value) {
-        case usm_support_mode::shared: /*the same as `direct` at now*/
-            LOG_TRACE("comm: ", to_string(), " - use USM shared pointers for buffers");
-        case usm_support_mode::direct: {
-            LOG_TRACE("comm: ", to_string(), " - use USM direct pointers for buffers");
-            scoped_req = ccl::details::make_unique_scoped_event<
-                ccl::host_event_impl>(ccl_reduce_scatter_impl(
-                reinterpret_cast<const void*>(send_buf),
-                reinterpret_cast<void*>(recv_buf),
-                recv_count,
-                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                reduction,
-                attr,
-                comm_impl.get(),
-                nullptr /*TODO fix core part, because stream existance use force cast to sycl::buffer*/
-                /*, stream.get()*/));
-            break;
-        }
-        case need_conversion: {
-            ccl_request* req = nullptr;
-#ifdef CCL_ENABLE_SYCL
-            LOG_TRACE(
-                "comm: ", to_string(), " - use USM pointers convertation to SYCL for both buffers");
-            auto scoped_req_sycl = ccl::details::make_unique_scoped_event<ccl::host_event_impl>(
-                nullptr,
-                /*send_buf*/
-                cl::sycl::buffer<buffer_type>{
-                    send_buf, recv_count, cl::sycl::property::buffer::use_host_ptr{} },
-                /*recv_buf*/
-                cl::sycl::buffer<buffer_type>{
-                    recv_buf, recv_count, cl::sycl::property::buffer::use_host_ptr{} });
-
-            req =
-                ccl_reduce_scatter_impl(reinterpret_cast<const void*>(
-                                    &scoped_req_sycl->template get_arg_by_index<0>() /*send_buf*/),
-                                reinterpret_cast<void*>(
-                                    &scoped_req_sycl->template get_arg_by_index<1>() /*recv_buf*/),
-                                recv_count,
-                                ccl::native_type_info<buffer_type>::ccl_datatype_value,
-                                reduction,
-                                attr,
-                                comm_impl.get(),
-                                stream.get());
-#else
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                            " - USM convertation is not supported for such configuration");
-#endif
-            scoped_req_sycl->charge(req);
-            scoped_req = std::move(scoped_req_sycl);
-            break;
-        }
-        default:
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                                 " - USM category is not supported for such configuration:\n" +
-                                 ::native::details::to_string(usm_assoc_results[0]));
-    }
-    return std::unique_ptr<ccl::event_impl>(scoped_req.release());
+    return reduce_scatter_impl(send_buf,
+                               recv_buf,
+                               recv_count,
+                               ccl::native_type_info<buffer_type>::dtype,
+                               reduction,
+                               stream,
+                               attr,
+                               deps);
 }
 
 template <class buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::reduce_scatter_impl(
+ccl::event single_device_communicator::reduce_scatter_impl(
     const buffer_type& send_buf,
     buffer_type& recv_buf,
     size_t recv_count,
@@ -897,12 +366,11 @@ single_device_communicator::coll_request_t single_device_communicator::reduce_sc
     const ccl::stream::impl_value_t& stream,
     const ccl::reduce_scatter_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-
     const ccl_stream* stream_ptr = stream.get();
     ccl_request* req = ccl_reduce_scatter_impl(reinterpret_cast<const void*>(&send_buf),
                                                reinterpret_cast<void*>(&recv_buf),
                                                recv_count,
-                                               ccl::native_type_info<buffer_type>::ccl_datatype_value,
+                                               ccl::native_type_info<buffer_type>::dtype,
                                                reduction,
                                                attr,
                                                comm_impl.get(),
@@ -912,7 +380,7 @@ single_device_communicator::coll_request_t single_device_communicator::reduce_sc
 
 /* sparse_allreduce */
 template <class index_buffer_type, class value_buffer_type>
-single_device_communicator::coll_request_t single_device_communicator::sparse_allreduce_impl(
+ccl::event single_device_communicator::sparse_allreduce_impl(
     const index_buffer_type* send_ind_buf,
     size_t send_ind_count,
     const value_buffer_type* send_val_buf,
@@ -925,115 +393,24 @@ single_device_communicator::coll_request_t single_device_communicator::sparse_al
     const ccl::stream::impl_value_t& stream,
     const ccl::sparse_allreduce_attr& attr,
     const ccl::vector_class<ccl::event>& deps) {
-    static constexpr ccl::group_split_type group_id = base_t::topology_type();
-    static constexpr ccl::device_topology_type class_id = base_t::topology_class();
-
-    if (!is_ready()) {
-        throw ccl::exception(std::string(
-            "Single device communicator for group_id: " + ::to_string(group_id) +
-            ", class_id: " + ::to_string(class_id) +
-            " is not ready yet. Not all сommunicators are created in group. Please create them before usage"));
-    }
-
-    LOG_DEBUG("device idx: ", get_device_path(), ", rank: (", rank(), "/", size(), ")");
-
-    std::unique_ptr<ccl::chargeable_event> scoped_req;
-    using namespace ::native::details;
-
-    // test USM arguments on validity
-    using sparse_allreduce_usm_check_result = multiple_assoc_result<4>;
-    sparse_allreduce_usm_check_result usm_assoc_results = check_multiple_assoc_device_memory(
-        get_device(), get_context(), send_ind_buf, send_val_buf, recv_ind_buf, recv_val_buf);
-    usm_support_mode test_value = std::get<assoc_result_index::SUPPORT_MODE>(usm_assoc_results[0]);
-    bool ret =
-        std::all_of(usm_assoc_results.begin(),
-                    usm_assoc_results.end(),
-                    [test_value](const typename sparse_allreduce_usm_check_result::value_type& v) {
-                        return test_value == std::get<assoc_result_index::SUPPORT_MODE>(v);
-                    });
-    if (!ret) {
-        throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - invalid USM arguments:\n" +
-                             ::native::details::to_string(usm_assoc_results));
-    }
-    switch (test_value) {
-        case usm_support_mode::shared: /*the same as `direct` at now*/
-            LOG_TRACE("comm: ", to_string(), " - use USM shared pointers for buffers");
-        case usm_support_mode::direct: {
-            LOG_TRACE("comm: ", to_string(), " - use USM direct pointers for buffers");
-            scoped_req = ccl::details::make_unique_scoped_event<
-                ccl::host_event_impl>(ccl_sparse_allreduce_impl(
-                (const void*)send_ind_buf,
-                send_ind_count,
-                (const void*)send_val_buf,
-                send_val_count,
-                (void*)recv_ind_buf,
-                recv_ind_count,
-                (void*)recv_val_buf,
-                recv_val_count,
-                ccl::native_type_info<index_buffer_type>::ccl_datatype_value,
-                ccl::native_type_info<value_buffer_type>::ccl_datatype_value,
-                reduction,
-                attr,
-                comm_impl.get(),
-                nullptr /*TODO fix core part, because stream existance use force cast to sycl::buffer*/
-                /*, stream.get()*/));
-            break;
-        }
-        case need_conversion: {
-            ccl_request* req = nullptr;
-#ifdef CCL_ENABLE_SYCL
-            LOG_TRACE("comm: ",
-                      to_string(),
-                      " - use USM pointers convertation to SYCL for every buffers");
-            auto scoped_req_sycl = ccl::details::make_unique_scoped_event<ccl::host_event_impl>(
-                nullptr,
-                /*send_ind_buf*/
-                cl::sycl::buffer<index_buffer_type>{
-                    send_ind_buf, send_ind_count, cl::sycl::property::buffer::use_host_ptr{} },
-                /*send_val_buf*/
-                cl::sycl::buffer<value_buffer_type>{
-                    send_val_buf, send_val_count, cl::sycl::property::buffer::use_host_ptr{} },
-                /*recv_ind_buf*/
-                cl::sycl::buffer<index_buffer_type>{
-                    recv_ind_buf, recv_ind_count, cl::sycl::property::buffer::use_host_ptr{} },
-                /*recv_val_buf*/
-                cl::sycl::buffer<value_buffer_type>{
-                    recv_val_buf, recv_val_count, cl::sycl::property::buffer::use_host_ptr{} });
-            req = ccl_sparse_allreduce_impl(
-                reinterpret_cast<const void*>(&scoped_req_sycl->template get_arg_by_index<0>()),
-                send_ind_count,
-                reinterpret_cast<const void*>(&scoped_req_sycl->template get_arg_by_index<1>()),
-                send_val_count,
-                reinterpret_cast<void*>(&scoped_req_sycl->template get_arg_by_index<2>()),
-                recv_ind_count,
-                reinterpret_cast<void*>(&scoped_req_sycl->template get_arg_by_index<3>()),
-                recv_val_count,
-                ccl::native_type_info<index_buffer_type>::ccl_datatype_value,
-                ccl::native_type_info<value_buffer_type>::ccl_datatype_value,
-                reduction,
-                attr,
-                comm_impl.get(),
-                stream.get());
-
-#else
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                            " - USM convertation is not supported for such configuration");
-#endif
-            scoped_req_sycl->charge(req);
-            scoped_req = std::move(scoped_req_sycl);
-            break;
-        }
-        default:
-            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
-                                 " - USM category is not supported for such configuration:\n" +
-                                 ::native::details::to_string(usm_assoc_results[0]));
-    }
-    return std::unique_ptr<ccl::event_impl>(scoped_req.release());
-    ;
+    return sparse_allreduce_impl(send_ind_buf,
+                                 send_ind_count,
+                                 send_val_buf,
+                                 send_val_count,
+                                 recv_ind_buf,
+                                 recv_ind_count,
+                                 recv_val_buf,
+                                 recv_val_count,
+                                 ccl::native_type_info<index_buffer_type>::dtype,
+                                 ccl::native_type_info<value_buffer_type>::dtype,
+                                 reduction,
+                                 stream,
+                                 attr,
+                                 deps);
 }
 
 template <class index_buffer_container_type, class value_buffer_container_type>
-single_device_communicator::coll_request_t single_device_communicator::sparse_allreduce_impl(
+ccl::event single_device_communicator::sparse_allreduce_impl(
     const index_buffer_container_type& send_ind_buf,
     size_t send_ind_count,
     const value_buffer_container_type& send_val_buf,
@@ -1048,20 +425,20 @@ single_device_communicator::coll_request_t single_device_communicator::sparse_al
     const ccl::vector_class<ccl::event>& deps) {
     const ccl_stream* stream_ptr = stream.get();
 
-    ccl_request* req = ccl_sparse_allreduce_impl(
-        reinterpret_cast<const void*>(&send_ind_buf),
-        send_ind_count,
-        reinterpret_cast<const void*>(&send_val_buf),
-        send_val_count,
-        reinterpret_cast<void*>(&recv_ind_buf),
-        recv_ind_count,
-        reinterpret_cast<void*>(&recv_val_buf),
-        recv_val_count,
-        ccl::native_type_info<index_buffer_container_type>::ccl_datatype_value,
-        ccl::native_type_info<value_buffer_container_type>::ccl_datatype_value,
-        reduction,
-        attr,
-        comm_impl.get(),
-        stream_ptr);
+    ccl_request* req =
+        ccl_sparse_allreduce_impl(reinterpret_cast<const void*>(&send_ind_buf),
+                                  send_ind_count,
+                                  reinterpret_cast<const void*>(&send_val_buf),
+                                  send_val_count,
+                                  reinterpret_cast<void*>(&recv_ind_buf),
+                                  recv_ind_count,
+                                  reinterpret_cast<void*>(&recv_val_buf),
+                                  recv_val_count,
+                                  ccl::native_type_info<index_buffer_container_type>::dtype,
+                                  ccl::native_type_info<value_buffer_container_type>::dtype,
+                                  reduction,
+                                  attr,
+                                  comm_impl.get(),
+                                  stream_ptr);
     return std::unique_ptr<ccl::event_impl>(new ccl::host_event_impl(req));
 }
diff --git a/src/common/comm/usm_visitor/allgather_usm_visitor.hpp b/src/common/comm/usm_visitor/allgather_usm_visitor.hpp
index 765680d56..76671a238 100644
--- a/src/common/comm/usm_visitor/allgather_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/allgather_usm_visitor.hpp
@@ -16,12 +16,11 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 template <class communicator_impl>
 struct allgather_usm_visitor {
     using self_t = communicator_impl;
-    using coll_request_t = ccl::event;
 
     self_t* get_self() {
         return static_cast<self_t*>(this);
@@ -32,7 +31,7 @@ struct allgather_usm_visitor {
     }
 
     template <class... Args>
-    bool visit(coll_request_t& req,
+    bool visit(ccl::event& req,
                ccl::datatype dtype,
                const void* send_buf,
                size_t send_count,
@@ -45,13 +44,17 @@ struct allgather_usm_visitor {
                   ccl::to_string(dtype),
                   " , handle: ",
                   utils::enum_to_underlying(dtype));
+
+        CCL_THROW("unexpected path");
+
         switch (dtype) {
             case ccl::datatype::int8: {
                 using type = char;
-                req = get_self()->template allgatherv_base_impl<type>(static_cast<const type*>(send_buf),
-                                                                 send_count,
-                                                                 static_cast<type*>(recv_buf),
-                                                                 std::forward<Args>(args)...);
+                req = get_self()->template allgatherv_base_impl<type>(
+                    static_cast<const type*>(send_buf),
+                    send_count,
+                    static_cast<type*>(recv_buf),
+                    std::forward<Args>(args)...);
                 processed = true;
                 break;
             }
@@ -75,10 +78,11 @@ struct allgather_usm_visitor {
             }
             case ccl::datatype::int32: {
                 using type = int32_t;
-                req = get_self()->template allgatherv_base_impl<type>(static_cast<const type*>(send_buf),
-                                                                 send_count,
-                                                                 static_cast<type*>(recv_buf),
-                                                                 std::forward<Args>(args)...);
+                req = get_self()->template allgatherv_base_impl<type>(
+                    static_cast<const type*>(send_buf),
+                    send_count,
+                    static_cast<type*>(recv_buf),
+                    std::forward<Args>(args)...);
                 processed = true;
                 break;
             }
@@ -90,19 +94,21 @@ struct allgather_usm_visitor {
             }
             case ccl::datatype::int64: {
                 using type = int64_t;
-                req = get_self()->template allgatherv_base_impl<type>(static_cast<const type*>(send_buf),
-                                                                 send_count,
-                                                                 static_cast<type*>(recv_buf),
-                                                                 std::forward<Args>(args)...);
+                req = get_self()->template allgatherv_base_impl<type>(
+                    static_cast<const type*>(send_buf),
+                    send_count,
+                    static_cast<type*>(recv_buf),
+                    std::forward<Args>(args)...);
                 processed = true;
                 break;
             }
             case ccl::datatype::uint64: {
                 using type = uint64_t;
-                req = get_self()->template allgatherv_base_impl<type>(static_cast<const type*>(send_buf),
-                                                                 send_count,
-                                                                 static_cast<type*>(recv_buf),
-                                                                 std::forward<Args>(args)...);
+                req = get_self()->template allgatherv_base_impl<type>(
+                    static_cast<const type*>(send_buf),
+                    send_count,
+                    static_cast<type*>(recv_buf),
+                    std::forward<Args>(args)...);
                 processed = true;
                 break;
             }
@@ -114,19 +120,21 @@ struct allgather_usm_visitor {
             }
             case ccl::datatype::float32: {
                 using type = float;
-                req = get_self()->template allgatherv_base_impl<type>(static_cast<const type*>(send_buf),
-                                                                 send_count,
-                                                                 static_cast<type*>(recv_buf),
-                                                                 std::forward<Args>(args)...);
+                req = get_self()->template allgatherv_base_impl<type>(
+                    static_cast<const type*>(send_buf),
+                    send_count,
+                    static_cast<type*>(recv_buf),
+                    std::forward<Args>(args)...);
                 processed = true;
                 break;
             }
             case ccl::datatype::float64: {
                 using type = double;
-                req = get_self()->template allgatherv_base_impl<type>(static_cast<const type*>(send_buf),
-                                                                 send_count,
-                                                                 static_cast<type*>(recv_buf),
-                                                                 std::forward<Args>(args)...);
+                req = get_self()->template allgatherv_base_impl<type>(
+                    static_cast<const type*>(send_buf),
+                    send_count,
+                    static_cast<type*>(recv_buf),
+                    std::forward<Args>(args)...);
                 processed = true;
                 break;
             }
diff --git a/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp b/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp
index 8820b01f0..9154ece9e 100644
--- a/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/allreduce_usm_visitor.hpp
@@ -16,12 +16,11 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 template <class communicator_impl>
 struct allreduce_usm_visitor {
     using self_t = communicator_impl;
-    using coll_request_t = ccl::event;
 
     self_t* get_self() {
         return static_cast<self_t*>(this);
@@ -32,7 +31,7 @@ struct allreduce_usm_visitor {
     }
 
     template <class... Args>
-    bool visit(coll_request_t& req,
+    bool visit(ccl::event& req,
                ccl::datatype dtype,
                const void* send_buf,
                void* recv_buf,
@@ -45,6 +44,9 @@ struct allreduce_usm_visitor {
                   ccl::to_string(dtype),
                   " , handle: ",
                   utils::enum_to_underlying(dtype));
+
+        CCL_THROW("unexpected path");
+
         switch (dtype) {
             case ccl::datatype::int8: {
                 using type = char;
diff --git a/src/common/comm/usm_visitor/alltoall_usm_visitor.hpp b/src/common/comm/usm_visitor/alltoall_usm_visitor.hpp
index 0ad1bcdb5..468214b61 100644
--- a/src/common/comm/usm_visitor/alltoall_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/alltoall_usm_visitor.hpp
@@ -16,12 +16,11 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 template <class communicator_impl>
 struct alltoall_usm_visitor {
     using self_t = communicator_impl;
-    using coll_request_t = ccl::event;
 
     self_t* get_self() {
         return static_cast<self_t*>(this);
@@ -32,7 +31,7 @@ struct alltoall_usm_visitor {
     }
 
     template <class... Args>
-    bool visit(coll_request_t& req,
+    bool visit(ccl::event& req,
                ccl::datatype dtype,
                const void* send_buf,
                void* recv_buf,
@@ -45,6 +44,9 @@ struct alltoall_usm_visitor {
                   ccl::to_string(dtype),
                   " , handle: ",
                   utils::enum_to_underlying(dtype));
+
+        CCL_THROW("unexpected path");
+
         switch (dtype) {
             case ccl::datatype::int8: {
                 using type = char;
diff --git a/src/common/comm/usm_visitor/alltoallv_usm_visitor.hpp b/src/common/comm/usm_visitor/alltoallv_usm_visitor.hpp
index 43328adc7..e3c81014b 100644
--- a/src/common/comm/usm_visitor/alltoallv_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/alltoallv_usm_visitor.hpp
@@ -16,12 +16,11 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 template <class communicator_impl>
 struct alltoallv_usm_visitor {
     using self_t = communicator_impl;
-    using coll_request_t = ccl::event;
 
     self_t* get_self() {
         return static_cast<self_t*>(this);
@@ -32,7 +31,7 @@ struct alltoallv_usm_visitor {
     }
 
     template <class... Args>
-    bool visit(coll_request_t& req,
+    bool visit(ccl::event& req,
                ccl::datatype dtype,
                const void* send_buf,
                const ccl::vector_class<size_t>& send_count,
@@ -46,6 +45,9 @@ struct alltoallv_usm_visitor {
                   ccl::to_string(dtype),
                   " , handle: ",
                   utils::enum_to_underlying(dtype));
+
+        CCL_THROW("unexpected path");
+
         switch (dtype) {
             case ccl::datatype::int8: {
                 using type = char;
diff --git a/src/common/comm/usm_visitor/broadcast_usm_visitor.hpp b/src/common/comm/usm_visitor/broadcast_usm_visitor.hpp
index 2a1c3a4a4..1742cc479 100644
--- a/src/common/comm/usm_visitor/broadcast_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/broadcast_usm_visitor.hpp
@@ -16,12 +16,11 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 template <class communicator_impl>
 struct broadcast_usm_visitor {
     using self_t = communicator_impl;
-    using coll_request_t = ccl::event;
 
     self_t* get_self() {
         return static_cast<self_t*>(this);
@@ -32,7 +31,7 @@ struct broadcast_usm_visitor {
     }
 
     template <class... Args>
-    bool visit(coll_request_t& req, ccl::datatype dtype, void* buf, size_t count, Args&&... args) {
+    bool visit(ccl::event& req, ccl::datatype dtype, void* buf, size_t count, Args&&... args) {
         bool processed = false;
         LOG_TRACE("comm: ",
                   get_self()->to_string(),
@@ -40,6 +39,9 @@ struct broadcast_usm_visitor {
                   ccl::to_string(dtype),
                   " , handle: ",
                   utils::enum_to_underlying(dtype));
+
+        CCL_THROW("unexpected path");
+
         switch (dtype) {
             case ccl::datatype::int8: {
                 using type = char;
diff --git a/src/common/comm/usm_visitor/reduce_scatter_usm_visitor.hpp b/src/common/comm/usm_visitor/reduce_scatter_usm_visitor.hpp
index 580561938..38f7f3b21 100644
--- a/src/common/comm/usm_visitor/reduce_scatter_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/reduce_scatter_usm_visitor.hpp
@@ -16,12 +16,11 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 template <class communicator_impl>
 struct reduce_scatter_usm_visitor {
     using self_t = communicator_impl;
-    using coll_request_t = ccl::event;
 
     self_t* get_self() {
         return static_cast<self_t*>(this);
@@ -33,7 +32,7 @@ struct reduce_scatter_usm_visitor {
     }
 
     template <class... Args>
-    bool visit(coll_request_t& req,
+    bool visit(ccl::event& req,
                ccl::datatype dtype,
                const void* send_buf,
                void* recv_buf,
@@ -46,6 +45,9 @@ struct reduce_scatter_usm_visitor {
                   ccl::to_string(dtype),
                   " , handle: ",
                   utils::enum_to_underlying(dtype));
+
+        CCL_THROW("unexpected path");
+
         switch (dtype) {
             case ccl::datatype::int8: {
                 using type = char;
diff --git a/src/common/comm/usm_visitor/reduce_usm_visitor.hpp b/src/common/comm/usm_visitor/reduce_usm_visitor.hpp
index 02b49f7c7..5d8b66fa9 100644
--- a/src/common/comm/usm_visitor/reduce_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/reduce_usm_visitor.hpp
@@ -16,12 +16,11 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 template <class communicator_impl>
 struct reduce_usm_visitor {
     using self_t = communicator_impl;
-    using coll_request_t = ccl::event;
 
     self_t* get_self() {
         return static_cast<self_t*>(this);
@@ -32,7 +31,7 @@ struct reduce_usm_visitor {
     }
 
     template <class... Args>
-    bool visit(coll_request_t& req,
+    bool visit(ccl::event& req,
                ccl::datatype dtype,
                const void* send_buf,
                void* recv_buf,
@@ -45,6 +44,9 @@ struct reduce_usm_visitor {
                   ccl::to_string(dtype),
                   " , handle: ",
                   utils::enum_to_underlying(dtype));
+
+        CCL_THROW("unexpected path");
+
         switch (dtype) {
             case ccl::datatype::int8: {
                 using type = char;
diff --git a/src/common/comm/usm_visitor/sparse_allreduce_usm_visitor.hpp b/src/common/comm/usm_visitor/sparse_allreduce_usm_visitor.hpp
index a444b6d9c..d7533748a 100644
--- a/src/common/comm/usm_visitor/sparse_allreduce_usm_visitor.hpp
+++ b/src/common/comm/usm_visitor/sparse_allreduce_usm_visitor.hpp
@@ -16,12 +16,11 @@
 #pragma once
 
 #include "oneapi/ccl.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 template <class communicator_impl>
 struct sparse_allreduce_usm_visitor {
     using self_t = communicator_impl;
-    using coll_request_t = ccl::event;
 
     self_t* get_self() {
         return static_cast<self_t*>(this);
@@ -33,7 +32,7 @@ struct sparse_allreduce_usm_visitor {
     }
 
     template <class... Args>
-    bool visit(coll_request_t& req,
+    bool visit(ccl::event& req,
                ccl::datatype index_dtype,
                ccl::datatype value_dtype,
                const void* send_ind_buf,
@@ -52,6 +51,9 @@ struct sparse_allreduce_usm_visitor {
                   ccl::to_string(value_dtype),
                   " , handle: ",
                   utils::enum_to_underlying(value_dtype));
+
+        CCL_THROW("unexpected path");
+
         switch (value_dtype) //TODO -S- value only
         {
             case ccl::datatype::int8: {
diff --git a/src/common/context/context.cpp b/src/common/context/context.cpp
index 57c63bce1..85a4b4f82 100644
--- a/src/common/context/context.cpp
+++ b/src/common/context/context.cpp
@@ -17,23 +17,21 @@
 #include "common/context/context.hpp"
 #include "oneapi/ccl/native_device_api/export_api.hpp"
 
-ccl_context_impl::ccl_context_impl(device_context_native_t& ctx, const ccl::library_version& version)
-    : version(version),
-    native_device_context(ctx)
-{
-}
+ccl_context_impl::ccl_context_impl(context_native_t& ctx, const ccl::library_version& version)
+        : version(version),
+          native_context(ctx) {}
 
-ccl_context_impl::ccl_context_impl(device_context_native_t&& ctx, const ccl::library_version& version)
-    : version(version),
-    native_device_context(std::move(ctx))
-{
-}
+ccl_context_impl::ccl_context_impl(const context_native_t& ctx, const ccl::library_version& version)
+        : version(version),
+          native_context(ctx) {}
 
-ccl_context_impl::ccl_context_impl(device_context_native_handle_t ctx_handle,
-                    const ccl::library_version& version)
-    : version(version)
-{
-}
+ccl_context_impl::ccl_context_impl(context_native_t&& ctx, const ccl::library_version& version)
+        : version(version),
+          native_context(std::move(ctx)) {}
+
+ccl_context_impl::ccl_context_impl(context_native_handle_t ctx_handle,
+                                   const ccl::library_version& version)
+        : version(version) {}
 
 void ccl_context_impl::build_from_params() {
     if (!creation_is_postponed) {
@@ -63,21 +61,21 @@ typename ccl_context_impl::version_traits_t::type ccl_context_impl::set_attribut
     return version;
 }
 
-const typename ccl_context_impl::version_traits_t::return_type& ccl_context_impl::get_attribute_value(
-    const version_traits_t& id) const {
+const typename ccl_context_impl::version_traits_t::return_type&
+ccl_context_impl::get_attribute_value(const version_traits_t& id) const {
     return version;
 }
 
-const typename ccl_context_impl::cl_backend_traits_t::return_type& ccl_context_impl::get_attribute_value(
-    const cl_backend_traits_t& id) const {
-
+const typename ccl_context_impl::cl_backend_traits_t::return_type&
+ccl_context_impl::get_attribute_value(const cl_backend_traits_t& id) const {
     //TODO
-    throw ccl::exception("TODO - Get value for 'ccl::device_attr_id::cl_backend_traits_t' is not inmlemented");
-    static constexpr ccl::cl_backend_type ret{ccl::cl_backend_type::empty_backend};
+    throw ccl::exception(
+        "TODO - Get value for 'ccl::context_attr_id::cl_backend' is not implemented");
+    static constexpr ccl::cl_backend_type ret{ ccl::cl_backend_type::empty_backend }; // unreachable
     return ret;
 }
 
-typename ccl_context_impl::native_handle_traits_t::return_type& ccl_context_impl::get_attribute_value(
-    const native_handle_traits_t& id) {
-    return native_device_context;
+typename ccl_context_impl::native_handle_traits_t::return_type&
+ccl_context_impl::get_attribute_value(const native_handle_traits_t& id) {
+    return native_context;
 }
diff --git a/src/common/context/context.hpp b/src/common/context/context.hpp
index 93a9c45bc..b25464190 100644
--- a/src/common/context/context.hpp
+++ b/src/common/context/context.hpp
@@ -14,46 +14,45 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_context_attr_ids.hpp"
-#include "oneapi/ccl/ccl_context_attr_ids_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/context_attr_ids.hpp"
+#include "oneapi/ccl/context_attr_ids_traits.hpp"
 #include "common/utils/utils.hpp"
 
-
-class ccl_context_impl
-{
+class ccl_context_impl {
 public:
-    using device_context_native_handle_t = typename ccl::unified_device_context_type::handle_t;
-    using device_context_native_t = typename ccl::unified_device_context_type::ccl_native_t;
+    using context_native_handle_t = typename ccl::unified_context_type::handle_t;
+    using context_native_t = typename ccl::unified_context_type::ccl_native_t;
 
     ccl_context_impl() = delete;
     ccl_context_impl(const ccl_context_impl& other) = delete;
     ccl_context_impl& operator=(const ccl_context_impl& other) = delete;
 
-    ccl_context_impl(device_context_native_t& ctx, const ccl::library_version& version);
-    ccl_context_impl(device_context_native_t&& ctx, const ccl::library_version& version);
-    ccl_context_impl(device_context_native_handle_t ctx_handle,
-                    const ccl::library_version& version);
+    ccl_context_impl(context_native_t& ctx, const ccl::library_version& version);
+    ccl_context_impl(const context_native_t& ctx, const ccl::library_version& version);
+    ccl_context_impl(context_native_t&& ctx, const ccl::library_version& version);
+    ccl_context_impl(context_native_handle_t ctx_handle, const ccl::library_version& version);
     ~ccl_context_impl() = default;
 
     //Export Attributes
     using version_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::context_attr_id, ccl::context_attr_id::version>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::context_attr_id, ccl::context_attr_id::version>;
     typename version_traits_t::type set_attribute_value(typename version_traits_t::type val,
                                                         const version_traits_t& t);
     const typename version_traits_t::return_type& get_attribute_value(
         const version_traits_t& id) const;
 
-
     using cl_backend_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::context_attr_id, ccl::context_attr_id::cl_backend>;
-    const typename cl_backend_traits_t::return_type& get_attribute_value(const cl_backend_traits_t& id) const;
+        ccl::detail::ccl_api_type_attr_traits<ccl::context_attr_id,
+                                              ccl::context_attr_id::cl_backend>;
+    const typename cl_backend_traits_t::return_type& get_attribute_value(
+        const cl_backend_traits_t& id) const;
 
     using native_handle_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::context_attr_id,
-                                               ccl::context_attr_id::native_handle>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::context_attr_id,
+                                              ccl::context_attr_id::native_handle>;
     typename native_handle_traits_t::return_type& get_attribute_value(
         const native_handle_traits_t& id);
 
@@ -61,6 +60,6 @@ class ccl_context_impl
 
 private:
     const ccl::library_version version;
-    device_context_native_t native_device_context;
+    context_native_t native_context;
     bool creation_is_postponed{ false };
 };
diff --git a/src/common/datatype/datatype.cpp b/src/common/datatype/datatype.cpp
index cba25a96a..e34eb3a00 100644
--- a/src/common/datatype/datatype.cpp
+++ b/src/common/datatype/datatype.cpp
@@ -20,7 +20,27 @@
 #include "common/utils/enums.hpp"
 #include "exec/exec.hpp"
 
-ccl_datatype ccl_datatype_char;
+namespace ccl {
+
+// Name table sized to underlying(datatype::last_predefined) + 1 entries.
+using datatype_str_enum =
+    utils::enum_to_str<utils::enum_to_underlying(datatype::last_predefined) + 1>;
+
+// Returns the display name of `dt`. "CUSTOM_TYPE" is the default handed to choose();
+// presumably it is what comes back for values outside the table - confirm in enum_to_str.
+string_class to_string(const datatype& dt) {
+    datatype_str_enum predefined_names({ "INT8", "UINT8",
+                                         "INT16", "UINT16",
+                                         "INT32", "UINT32",
+                                         "INT64", "UINT64",
+                                         "FLOAT16", "FLOAT32",
+                                         "FLOAT64", "BFLOAT16" });
+    return predefined_names.choose(dt, "CUSTOM_TYPE");
+}
+
+} // namespace ccl
+
+ccl_datatype ccl_datatype_int8;
 
 ccl::datatype& operator++(ccl::datatype& d) {
     using IntType = typename std::underlying_type<ccl::datatype>::type;
@@ -34,6 +54,25 @@ ccl::datatype operator++(ccl::datatype& d, int) {
     return tmp;
 }
 
+// Writes the ccl::to_string() name of `dt` to `os` and hands the stream back for chaining.
+std::ostream& operator<<(std::ostream& os, const ccl::datatype& dt) {
+    return os << ccl::to_string(dt);
+}
+
+// CCL_API
+// std::string to_string(const bfloat16& v) {
+//     std::stringstream ss;
+//     ss << "bf16::data " << v.data;
+//     return ss.str();
+// }
+
+// CCL_API
+// std::string to_string(const float16& v) {
+//     std::stringstream ss;
+//     ss << "fp16::data " << v.data;
+//     return ss.str();
+// }
+
 ccl_datatype::ccl_datatype(ccl::datatype idx, size_t size) : m_idx(idx), m_size(size) {
     CCL_THROW_IF_NOT(m_size > 0, "unexpected datatype size ", m_size);
 }
@@ -42,14 +81,15 @@ ccl_datatype_storage::ccl_datatype_storage() {
     LOG_DEBUG("create datatype_storage");
 
     using IntType = typename std::underlying_type<ccl::datatype>::type;
-    custom_idx = static_cast<ccl::datatype>(static_cast<IntType>(ccl::datatype::last_predefined) + 1);
+    custom_idx =
+        static_cast<ccl::datatype>(static_cast<IntType>(ccl::datatype::last_predefined) + 1);
 
     size_t size = 0;
     std::string name_str;
 
     for (ccl::datatype idx = ccl::datatype::int8; idx <= ccl::datatype::last_predefined; idx++) {
         /* fill table with predefined datatypes */
-        size = (idx == ccl::datatype::int8)       ? sizeof(char)
+        size = (idx == ccl::datatype::int8)       ? sizeof(int8_t)
                : (idx == ccl::datatype::uint8)    ? sizeof(uint8_t)
                : (idx == ccl::datatype::int16)    ? sizeof(int16_t)
                : (idx == ccl::datatype::uint16)   ? sizeof(uint16_t)
@@ -72,10 +112,10 @@ ccl_datatype_storage::ccl_datatype_storage() {
                    : (idx == ccl::datatype::uint32)   ? "UINT32"
                    : (idx == ccl::datatype::int64)    ? "INT64"
                    : (idx == ccl::datatype::uint64)   ? "UINT64"
-                   : (idx == ccl::datatype::float16)  ? "FLOAT16"
-                   : (idx == ccl::datatype::float32)  ? "FLOAT"
-                   : (idx == ccl::datatype::float64)  ? "DOUBLE"
-                   : (idx == ccl::datatype::bfloat16) ? "BFLOAT16"
+                   : (idx == ccl::datatype::float16)  ? "FP16"
+                   : (idx == ccl::datatype::float32)  ? "FP32"
+                   : (idx == ccl::datatype::float64)  ? "FP64"
+                   : (idx == ccl::datatype::bfloat16) ? "BF16"
                                                       : 0;
 
         create_internal(predefined_table, idx, size, name_str);
@@ -94,7 +134,7 @@ ccl_datatype_storage::ccl_datatype_storage() {
                          name_str);
     }
 
-    ccl_datatype_char = get(ccl::datatype::int8);
+    ccl_datatype_int8 = get(ccl::datatype::int8);
 }
 
 ccl_datatype_storage::~ccl_datatype_storage() {
diff --git a/src/common/datatype/datatype.hpp b/src/common/datatype/datatype.hpp
index ca2143ee9..1bf5a984e 100644
--- a/src/common/datatype/datatype.hpp
+++ b/src/common/datatype/datatype.hpp
@@ -19,13 +19,13 @@
 #include <unordered_map>
 #include <utility>
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/log/log.hpp"
 #include "common/utils/spinlock.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_datatype_attr_ids.hpp"
-#include "oneapi/ccl/ccl_datatype_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_datatype_attr.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/datatype_attr_ids.hpp"
+#include "oneapi/ccl/datatype_attr_ids_traits.hpp"
+#include "oneapi/ccl/datatype_attr.hpp"
 
 class ccl_datatype {
 public:
@@ -51,13 +51,11 @@ class ccl_datatype {
 };
 
 /* frequently used in multiple places */
-extern ccl_datatype ccl_datatype_char;
+extern ccl_datatype ccl_datatype_int8;
 
-struct ccl_datatype_hasher
-{
+struct ccl_datatype_hasher {
     template <typename T>
-    std::size_t operator()(T t) const
-    {
+    std::size_t operator()(T t) const {
         return static_cast<std::size_t>(t);
     }
 };
@@ -65,9 +63,7 @@ struct ccl_datatype_hasher
 using ccl_datatype_lock_t = ccl_spinlock;
 
 using ccl_datatype_table_t =
-    std::unordered_map<ccl::datatype,
-                       std::pair<ccl_datatype, std::string>,
-                       ccl_datatype_hasher>;
+    std::unordered_map<ccl::datatype, std::pair<ccl_datatype, std::string>, ccl_datatype_hasher>;
 
 class ccl_datatype_storage {
 public:
@@ -99,5 +95,9 @@ class ccl_datatype_storage {
     ccl_datatype_table_t custom_table;
 };
 
+namespace ccl {
+string_class to_string(const datatype& dt);
+}
+
 ccl::datatype& operator++(ccl::datatype& d);
 ccl::datatype operator++(ccl::datatype& d, int);
diff --git a/src/common/datatype/datatype_attr.hpp b/src/common/datatype/datatype_attr.hpp
index 5072e94c0..2726373d3 100644
--- a/src/common/datatype/datatype_attr.hpp
+++ b/src/common/datatype/datatype_attr.hpp
@@ -13,59 +13,59 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_datatype_attr_ids_traits.hpp"
-
-namespace ccl {
-
-class ccl_datatype_attr_impl {
-public:
-    /**
-     * `version` operations
-     */
-    using version_traits_t =
-        details::ccl_api_type_attr_traits<datatype_attr_id, datatype_attr_id::version>;
-
-    const typename version_traits_t::return_type& get_attribute_value(
-        const version_traits_t& id) const {
-        return version;
-    }
-
-    typename version_traits_t::return_type set_attribute_value(typename version_traits_t::type val,
-                                                               const version_traits_t& t) {
-        (void)t;
-        throw ccl::exception("Set value for 'ccl::datatype_attr_id::version' is not allowed");
-        return version;
-    }
-
-    /**
-     * `size` operations
-     */
-    using size_traits_t =
-        details::ccl_api_type_attr_traits<datatype_attr_id, datatype_attr_id::size>;
-
-    const typename size_traits_t::return_type& get_attribute_value(const size_traits_t& id) const {
-        return datatype_size;
-    }
-
-    typename size_traits_t::return_type set_attribute_value(typename size_traits_t::return_type val,
-                                                            const size_traits_t& t) {
-        if (val <= 0) {
-            throw ccl::exception("Size value must be greater than 0");
-        }
-        auto old = datatype_size;
-        datatype_size = val;
-        return old;
-    }
-
-    ccl_datatype_attr_impl(const typename version_traits_t::return_type& version)
-            : version(version) {}
-
-protected:
-    typename version_traits_t::return_type version;
-    typename size_traits_t::return_type datatype_size = 1;
-};
-
-} // namespace ccl
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/datatype_attr_ids_traits.hpp"
+
+namespace ccl {
+
+// Holds the attributes of a datatype: a version and a size.
+class ccl_datatype_attr_impl {
+public:
+    using version_traits_t =
+        detail::ccl_api_type_attr_traits<datatype_attr_id, datatype_attr_id::version>;
+    using size_traits_t =
+        detail::ccl_api_type_attr_traits<datatype_attr_id, datatype_attr_id::size>;
+
+    // Stores the given version; the size keeps its in-class default of 1.
+    ccl_datatype_attr_impl(const typename version_traits_t::return_type& version)
+            : version(version) {}
+
+    // Read access to the stored version.
+    const typename version_traits_t::return_type& get_attribute_value(
+        const version_traits_t& id) const {
+        return version;
+    }
+
+    // The version cannot be changed: this overload always throws ccl::exception.
+    typename version_traits_t::return_type set_attribute_value(typename version_traits_t::type val,
+                                                               const version_traits_t& t) {
+        (void)t;
+        throw ccl::exception("Set value for 'ccl::datatype_attr_id::version' is not allowed");
+        return version; // unreachable
+    }
+
+    // Read access to the stored size.
+    const typename size_traits_t::return_type& get_attribute_value(const size_traits_t& id) const {
+        return datatype_size;
+    }
+
+    // Replaces the stored size with `val` and returns the previous size.
+    // Throws ccl::exception when `val <= 0`.
+    typename size_traits_t::return_type set_attribute_value(typename size_traits_t::return_type val,
+                                                            const size_traits_t& t) {
+        if (val <= 0) {
+            throw ccl::exception("Size value must be greater than 0");
+        }
+        auto previous = datatype_size;
+        datatype_size = val;
+        return previous;
+    }
+
+protected:
+    typename version_traits_t::return_type version;
+    typename size_traits_t::return_type datatype_size = 1;
+};
+
+} // namespace ccl
diff --git a/src/common/device/device.cpp b/src/common/device/device.cpp
index 30b135fa4..670466f7e 100644
--- a/src/common/device/device.cpp
+++ b/src/common/device/device.cpp
@@ -18,22 +18,20 @@
 #include "oneapi/ccl/native_device_api/export_api.hpp"
 
 ccl_device_impl::ccl_device_impl(device_native_t& dev, const ccl::library_version& version)
-    : version(version),
-    native_device(dev)
-{
-}
+        : version(version),
+          native_device(dev) {}
+
+ccl_device_impl::ccl_device_impl(const device_native_t& dev, const ccl::library_version& version)
+        : version(version),
+          native_device(dev) {}
 
 ccl_device_impl::ccl_device_impl(device_native_t&& dev, const ccl::library_version& version)
-    : version(version),
-    native_device(std::move(dev))
-{
-}
+        : version(version),
+          native_device(std::move(dev)) {}
 
 ccl_device_impl::ccl_device_impl(device_native_handle_t dev_handle,
-                    const ccl::library_version& version)
-    : version(version)
-{
-}
+                                 const ccl::library_version& version)
+        : version(version) {}
 
 void ccl_device_impl::build_from_params() {
     if (!creation_is_postponed) {
@@ -68,12 +66,12 @@ const typename ccl_device_impl::version_traits_t::return_type& ccl_device_impl::
     return version;
 }
 
-const typename ccl_device_impl::cl_backend_traits_t::return_type& ccl_device_impl::get_attribute_value(
-    const cl_backend_traits_t& id) const {
-
+const typename ccl_device_impl::cl_backend_traits_t::return_type&
+ccl_device_impl::get_attribute_value(const cl_backend_traits_t& id) const {
     //TODO
-    throw ccl::exception("TODO - Get value for 'ccl::device_attr_id::cl_backend_traits_t' is not inmlemented");
-    static constexpr ccl::cl_backend_type ret{ccl::cl_backend_type::empty_backend};
+    throw ccl::exception(
+        "TODO - Get value for 'ccl::device_attr_id::cl_backend' is not implemented");
+    static constexpr ccl::cl_backend_type ret{ ccl::cl_backend_type::empty_backend }; // unreachable
     return ret;
 }
 
diff --git a/src/common/device/device.hpp b/src/common/device/device.hpp
index 75867a3a4..62cdb21a0 100644
--- a/src/common/device/device.hpp
+++ b/src/common/device/device.hpp
@@ -14,16 +14,14 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_device_attr_ids.hpp"
-#include "oneapi/ccl/ccl_device_attr_ids_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/device_attr_ids.hpp"
+#include "oneapi/ccl/device_attr_ids_traits.hpp"
 #include "common/utils/utils.hpp"
 
-
-class ccl_device_impl
-{
+class ccl_device_impl {
 public:
     using device_native_handle_t = typename ccl::unified_device_type::handle_t;
     using device_native_t = typename ccl::unified_device_type::ccl_native_t;
@@ -33,27 +31,27 @@ class ccl_device_impl
     ccl_device_impl& operator=(const ccl_device_impl& other) = delete;
 
     ccl_device_impl(device_native_t& dev, const ccl::library_version& version);
+    ccl_device_impl(const device_native_t& dev, const ccl::library_version& version);
     ccl_device_impl(device_native_t&& dev, const ccl::library_version& version);
-    ccl_device_impl(device_native_handle_t dev_handle,
-                    const ccl::library_version& version);
+    ccl_device_impl(device_native_handle_t dev_handle, const ccl::library_version& version);
     ~ccl_device_impl() = default;
 
     //Export Attributes
     using version_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::device_attr_id, ccl::device_attr_id::version>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::device_attr_id, ccl::device_attr_id::version>;
     typename version_traits_t::type set_attribute_value(typename version_traits_t::type val,
                                                         const version_traits_t& t);
     const typename version_traits_t::return_type& get_attribute_value(
         const version_traits_t& id) const;
 
-
     using cl_backend_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::device_attr_id, ccl::device_attr_id::cl_backend>;
-    const typename cl_backend_traits_t::return_type& get_attribute_value(const cl_backend_traits_t& id) const;
+        ccl::detail::ccl_api_type_attr_traits<ccl::device_attr_id, ccl::device_attr_id::cl_backend>;
+    const typename cl_backend_traits_t::return_type& get_attribute_value(
+        const cl_backend_traits_t& id) const;
 
     using native_handle_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::device_attr_id,
-                                               ccl::device_attr_id::native_handle>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::device_attr_id,
+                                              ccl::device_attr_id::native_handle>;
     typename native_handle_traits_t::return_type& get_attribute_value(
         const native_handle_traits_t& id);
 
diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp
index edcc3e98a..bc35a4acf 100644
--- a/src/common/env/env.cpp
+++ b/src/common/env/env.cpp
@@ -22,7 +22,8 @@
 #include "common/global/global.hpp"
 #include "common/log/log.hpp"
 #include "exec/exec.hpp"
-#include "oneapi/ccl/ccl_environment.hpp"
+#include "oneapi/ccl/environment.hpp"
+#include "common/utils/version.hpp"
 
 namespace ccl {
 
@@ -63,10 +64,11 @@ env_data::env_data()
           priority_mode(ccl_priority_none),
           spin_count(100),
           yield_type(ccl_yield_pause),
-          max_short_size(4096),
+          max_short_size(0),
           bcast_part_count(CCL_ENV_SIZET_NOT_SPECIFIED),
           cache_key_type(ccl_cache_key_match_id),
           enable_cache_flush(1),
+          enable_strict_order(0),
 
           chunk_count(1),
           min_chunk_size(65536),
@@ -93,7 +95,7 @@ void env_data::parse() {
     CCL_THROW_IF_NOT(worker_count >= 1, "incorrect ", CCL_WORKER_COUNT, " ", worker_count);
     env_2_type(CCL_WORKER_OFFLOAD, worker_offload);
 
-    env_2_enum(CCL_ATL_TRANSPORT, atl_transport_names, atl_transport);
+    env_2_atl_transport();
     env_2_type(CCL_ATL_SHM, enable_shm);
     env_2_type(CCL_ATL_SYNC_COLL, sync_coll);
     env_2_type(CCL_ATL_EXTRA_EP, extra_ep);
@@ -138,6 +140,7 @@ void env_data::parse() {
     env_2_type(CCL_BCAST_PART_COUNT, (size_t&)bcast_part_count);
     env_2_enum(CCL_CACHE_KEY, ccl_sched_key::key_type_names, cache_key_type);
     env_2_type(CCL_CACHE_FLUSH, enable_cache_flush);
+    env_2_type(CCL_STRICT_ORDER, enable_strict_order);
 
     env_2_type(CCL_CHUNK_COUNT, chunk_count);
     CCL_THROW_IF_NOT(chunk_count >= 1, "incorrect ", CCL_CHUNK_COUNT, " ", chunk_count);
@@ -171,7 +174,6 @@ void env_data::parse() {
 }
 
 void env_data::print() {
-
     std::lock_guard<ccl_spinlock> lock{ print_guard };
 
     if (was_printed)
@@ -186,13 +188,7 @@ void env_data::print() {
 #endif
     LOG_INFO("build mode : ", build_mode);
 
-    ccl::library_version version;
-    version.major = CCL_MAJOR_VERSION;
-    version.minor = CCL_MINOR_VERSION;
-    version.update = CCL_UPDATE_VERSION;
-    version.product_status = CCL_PRODUCT_STATUS;
-    version.build_date = CCL_PRODUCT_BUILD_DATE;
-    version.full = CCL_PRODUCT_FULL;
+    auto version = utils::get_library_version();
 
     LOG_INFO("version : ", version.full);
 
@@ -236,7 +232,9 @@ void env_data::print() {
     LOG_INFO(
         CCL_REDUCE, ": ", (reduce_algo_raw.length()) ? reduce_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(
-        CCL_REDUCE_SCATTER, ": ", (reduce_scatter_algo_raw.length()) ? reduce_scatter_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
+        CCL_REDUCE_SCATTER,
+        ": ",
+        (reduce_scatter_algo_raw.length()) ? reduce_scatter_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(CCL_SPARSE_ALLREDUCE,
              ": ",
              (sparse_allreduce_algo_raw.length()) ? sparse_allreduce_algo_raw
@@ -260,6 +258,7 @@ void env_data::print() {
                                                                : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(CCL_CACHE_KEY, ": ", str_by_enum(ccl_sched_key::key_type_names, cache_key_type));
     LOG_INFO(CCL_CACHE_FLUSH, ": ", enable_cache_flush);
+    LOG_INFO(CCL_STRICT_ORDER, ": ", enable_strict_order);
 
     LOG_INFO(CCL_CHUNK_COUNT, ": ", chunk_count);
     LOG_INFO(CCL_MIN_CHUNK_SIZE, ": ", min_chunk_size);
@@ -294,20 +293,18 @@ void env_data::print() {
     if (bf16_impl_type != ccl_bf16_none) {
         LOG_INFO("\n\nBF16 is enabled through ",
                  (bf16_impl_type == ccl_bf16_avx512bf) ? "AVX512-BF" : "AVX512-F",
-                "\n");
+                 "\n");
     }
     else {
 #ifdef CCL_BF16_COMPILER
-       LOG_INFO("\n\nBF16 is disabled on HW level\n");
+        LOG_INFO("\n\nBF16 is disabled on HW level\n");
 #else
-       LOG_INFO("\n\nBF16 is disabled on compiler level\n");
+        LOG_INFO("\n\nBF16 is disabled on compiler level\n");
 #endif
     }
-
 }
 
-void env_data::set_internal_env()
-{
+void env_data::set_internal_env() {
     auto attr = ccl_executor::generate_atl_attr(*this);
     atl_wrapper::set_internal_env(attr);
 }
@@ -417,4 +414,22 @@ int env_data::env_2_worker_affinity(size_t local_proc_idx, size_t local_proc_cou
     return read_env;
 }
 
+void env_data::env_2_atl_transport() {
+    // Explicit CCL_ATL_TRANSPORT or an MPI-launcher environment: take the regular enum path.
+    if (getenv(CCL_ATL_TRANSPORT) || with_mpirun()) {
+        env_2_enum(CCL_ATL_TRANSPORT, atl_transport_names, atl_transport);
+        return;
+    }
+    LOG_INFO("\n\nDid not find MPI-launcher specific variables, switch to ATL/OFI"
+             "\nTo force enable ATL/MPI set CCL_ATL_TRANSPORT=mpi\n");
+    atl_transport = ccl_atl_ofi; // neither is present: fall back to the OFI transport
+}
+
+bool env_data::with_mpirun() {
+    // Reports whether the process looks like it was started by an MPI launcher, i.e. whether
+    // any of the launcher-specific variables below is set (its value is not inspected).
+    return getenv("MPI_LOCALRANKID") || getenv("MPI_LOCALNRANKS") || getenv("PMI_RANK") ||
+           getenv("PMI_SIZE");
+}
+
 } /* namespace ccl */
diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp
index b74cb6292..237638ab0 100644
--- a/src/common/env/env.hpp
+++ b/src/common/env/env.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "coll/coll.hpp"
 #include "common/utils/utils.hpp"
 #include "common/utils/yield.hpp"
@@ -65,6 +65,7 @@ constexpr const char* CCL_MAX_SHORT_SIZE = "CCL_MAX_SHORT_SIZE";
 constexpr const char* CCL_BCAST_PART_COUNT = "CCL_BCAST_PART_COUNT";
 constexpr const char* CCL_CACHE_KEY = "CCL_CACHE_KEY";
 constexpr const char* CCL_CACHE_FLUSH = "CCL_CACHE_FLUSH";
+constexpr const char* CCL_STRICT_ORDER = "CCL_STRICT_ORDER";
 
 constexpr const char* CCL_CHUNK_COUNT = "CCL_CHUNK_COUNT";
 constexpr const char* CCL_MIN_CHUNK_SIZE = "CCL_MIN_CHUNK_SIZE";
@@ -160,6 +161,7 @@ class env_data {
     ssize_t bcast_part_count;
     ccl_cache_key_type cache_key_type;
     int enable_cache_flush;
+    int enable_strict_order;
 
     size_t chunk_count;
     size_t min_chunk_size;
@@ -223,10 +225,13 @@ class env_data {
         }
     }
 
+    static bool with_mpirun();
+
     static std::map<ccl_priority_mode, std::string> priority_mode_names;
     static std::map<ccl_atl_transport, std::string> atl_transport_names;
 
     int env_2_worker_affinity(size_t local_proc_idx, size_t local_proc_count);
+    void env_2_atl_transport();
 
 private:
     int env_2_worker_affinity_auto(size_t local_proc_idx, size_t workers_per_process);
diff --git a/src/common/event/ccl_event.hpp b/src/common/event/ccl_event.hpp
index 218862ac4..9185e8912 100644
--- a/src/common/event/ccl_event.hpp
+++ b/src/common/event/ccl_event.hpp
@@ -14,25 +14,27 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "common/event/event_internal/event_internal_attr_ids.hpp"
-#include "common/event/event_internal/event_internal_attr_ids_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "common/event/ccl_event_attr_ids.hpp"
+#include "common/event/ccl_event_attr_ids_traits.hpp"
 #include "common/utils/utils.hpp"
 
 namespace ccl {
-class environment; //friend-zone
+namespace detail {
+class environment;
 }
+} // namespace ccl
 
 class alignas(CACHELINE_SIZE) ccl_event {
 public:
-    friend class ccl::environment;
+    friend class ccl::detail::environment;
     using event_native_handle_t = typename ccl::unified_event_type::handle_t;
     using event_native_t = typename ccl::unified_event_type::ccl_native_t;
 
-    using event_native_context_handle_t = typename ccl::unified_device_context_type::handle_t;
-    using event_native_context_t = typename ccl::unified_device_context_type::ccl_native_t;
+    using event_native_context_handle_t = typename ccl::unified_context_type::handle_t;
+    using event_native_context_t = typename ccl::unified_context_type::ccl_native_t;
 
     ccl_event() = delete;
     ccl_event(const ccl_event& other) = delete;
@@ -46,7 +48,7 @@ class alignas(CACHELINE_SIZE) ccl_event {
 
     //Export Attributes
     using version_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::version>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::version>;
     typename version_traits_t::type set_attribute_value(typename version_traits_t::type val,
                                                         const version_traits_t& t);
 
@@ -54,18 +56,17 @@ class alignas(CACHELINE_SIZE) ccl_event {
         const version_traits_t& id) const;
 
     using native_handle_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::event_attr_id,
-                                               ccl::event_attr_id::native_handle>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id,
+                                              ccl::event_attr_id::native_handle>;
     typename native_handle_traits_t::return_type& get_attribute_value(
         const native_handle_traits_t& id);
 
     using context_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::context>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::context>;
     typename context_traits_t::return_type& get_attribute_value(const context_traits_t& id);
 
     using command_type_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::event_attr_id,
-                                               ccl::event_attr_id::command_type>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::command_type>;
     typename command_type_traits_t::return_type set_attribute_value(
         typename command_type_traits_t::type val,
         const command_type_traits_t& t);
@@ -74,8 +75,8 @@ class alignas(CACHELINE_SIZE) ccl_event {
         const command_type_traits_t& id) const;
 
     using command_execution_status_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::event_attr_id,
-                                               ccl::event_attr_id::command_execution_status>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::event_attr_id,
+                                              ccl::event_attr_id::command_execution_status>;
     typename command_execution_status_traits_t::return_type set_attribute_value(
         typename command_execution_status_traits_t::type val,
         const command_execution_status_traits_t& t);
diff --git a/src/common/event/event_internal/event_internal_attr_ids.hpp b/src/common/event/ccl_event_attr_ids.hpp
similarity index 100%
rename from src/common/event/event_internal/event_internal_attr_ids.hpp
rename to src/common/event/ccl_event_attr_ids.hpp
diff --git a/src/common/event/event_internal/event_internal_attr_ids_traits.hpp b/src/common/event/ccl_event_attr_ids_traits.hpp
similarity index 88%
rename from src/common/event/event_internal/event_internal_attr_ids_traits.hpp
rename to src/common/event/ccl_event_attr_ids_traits.hpp
index 9b768c82a..02f319911 100644
--- a/src/common/event/event_internal/event_internal_attr_ids_traits.hpp
+++ b/src/common/event/ccl_event_attr_ids_traits.hpp
@@ -16,7 +16,8 @@
 #pragma once
 
 namespace ccl {
-namespace details {
+
+namespace detail {
 
 /**
  * Traits for stream attributes specializations
@@ -35,8 +36,8 @@ struct ccl_api_type_attr_traits<event_attr_id, event_attr_id::native_handle> {
 
 template <>
 struct ccl_api_type_attr_traits<event_attr_id, event_attr_id::context> {
-    using type = typename unified_device_context_type::ccl_native_t;
-    using handle_t = typename unified_device_context_type::ccl_native_t;
+    using type = typename unified_context_type::ccl_native_t;
+    using handle_t = typename unified_context_type::ccl_native_t;
     using return_type = type;
 };
 
@@ -52,5 +53,5 @@ struct ccl_api_type_attr_traits<event_attr_id, event_attr_id::command_execution_
     using return_type = type;
 };
 
-} // namespace details
+} // namespace detail
 } // namespace ccl
diff --git a/src/common/event/event_internal/event_internal.cpp b/src/common/event/event_internal/event_internal.cpp
deleted file mode 100644
index dd454db17..000000000
--- a/src/common/event/event_internal/event_internal.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#include "common/event/event_internal/event_internal_impl.hpp"
-
-namespace ccl {
-
-event_internal::event_internal(event_internal&& src) : base_t(std::move(src)) {}
-
-event_internal::event_internal(impl_value_t&& impl) : base_t(std::move(impl)) {}
-
-event_internal::~event_internal() {}
-
-event_internal& event_internal::operator=(event_internal&& src) {
-    if (src.get_impl() != this->get_impl()) {
-        src.get_impl().swap(this->get_impl());
-        src.get_impl().reset();
-    }
-    return *this;
-}
-
-void event_internal::build_from_params() {
-    get_impl()->build_from_params();
-}
-} // namespace ccl
diff --git a/src/common/event/event_internal/event_internal.hpp b/src/common/event/event_internal/event_internal.hpp
deleted file mode 100644
index c98aa401a..000000000
--- a/src/common/event/event_internal/event_internal.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-class ccl_event;
-namespace ccl {
-
-/**
- * A event object is an abstraction over CPU/GPU events
- * Has no defined public constructor. Use ccl::environment::create_event
- * for event objects creation
- */
-/**
- * Stream class
- */
-class event_internal : public ccl_api_base_movable<event_internal, direct_access_policy, ccl_event> {
-public:
-    using base_t = ccl_api_base_movable<event_internal, direct_access_policy, ccl_event>;
-
-    /**
-     * Declare PIMPL type
-     */
-    using impl_value_t = typename base_t::impl_value_t;
-
-    /**
-     * Declare implementation type
-     */
-    using impl_t = typename impl_value_t::element_type;
-
-    event_internal(event_internal&& src);
-    event_internal& operator=(event_internal&& src);
-    ~event_internal();
-
-    /**
-     * Get specific attribute value by @attrId
-     */
-    template <event_attr_id attrId>
-    const typename details::ccl_api_type_attr_traits<event_attr_id, attrId>::return_type& get()
-        const;
-
-private:
-    friend class environment;
-    friend class communicator;
-    event_internal(impl_value_t&& impl);
-
-    /**
-     *Parametrized event_internal creation helper
-     */
-    template <event_attr_id attrId,
-              class Value/*,
-              class = typename std::enable_if<is_attribute_value_supported<attrId, Value>()>::type*/>
-    typename ccl::details::ccl_api_type_attr_traits<ccl::event_attr_id, attrId>::return_type set(const Value& v);
-
-    void build_from_params();
-    event_internal(const typename details::ccl_api_type_attr_traits<event_attr_id,
-                                                           event_attr_id::version>::type& version);
-
-    /**
-     * Factory methods
-     */
-    template <class event_type,
-              class = typename std::enable_if<is_event_supported<event_type>()>::type>
-    static event_internal create_event(event_type& native_event);
-
-    template <class event_handle_type,
-              class = typename std::enable_if<is_event_supported<event_handle_type>()>::type>
-    static event_internal create_event(event_handle_type native_event_handle,
-                              typename unified_device_context_type::ccl_native_t context);
-
-    template <class event_type, class... attr_value_pair_t>
-    static event_internal create_event_from_attr(event_type& native_event_handle,
-                                        typename unified_device_context_type::ccl_native_t context,
-                                        attr_value_pair_t&&... avps);
-};
-
-template <event_attr_id t, class value_type>
-constexpr auto attr_val(value_type v) -> details::attr_value_tripple<event_attr_id, t, value_type> {
-    return details::attr_value_tripple<event_attr_id, t, value_type>(v);
-}
-
-} // namespace ccl
diff --git a/src/common/event/event_internal/event_internal_impl.hpp b/src/common/event/event_internal/event_internal_impl.hpp
deleted file mode 100644
index f766b7659..000000000
--- a/src/common/event/event_internal/event_internal_impl.hpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-
-#include "common/event/event_internal/event_internal_attr_ids.hpp"
-#include "common/event/event_internal/event_internal_attr_ids_traits.hpp"
-#include "common/event/event_internal/event_internal.hpp"
-
-#include "common/event/ccl_event.hpp"
-
-namespace ccl {
-
-template <class event_type, class... attr_value_pair_t>
-event_internal event_internal::create_event_from_attr(event_type& native_event_handle,
-                                    typename unified_device_context_type::ccl_native_t context,
-                                    attr_value_pair_t&&... avps) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    event_internal str{ event_internal::impl_value_t(new event_internal::impl_t(native_event_handle, context, ret)) };
-    int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
-    (void)expander;
-    str.build_from_params();
-
-    return str;
-}
-
-template <class event_handle_type, typename T>
-event_internal event_internal::create_event(event_handle_type native_event_handle,
-                          typename unified_device_context_type::ccl_native_t context) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    event_internal str{ event_internal::impl_value_t(new event_internal::impl_t(native_event_handle, context, ret)) };
-    str.build_from_params();
-
-    return str;
-}
-
-template <class event_type, typename T>
-event_internal event_internal::create_event(event_type& native_event) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    return { event_internal::impl_value_t(new event_internal::impl_t(native_event, ret)) };
-}
-
-template <event_attr_id attrId>
-const typename details::ccl_api_type_attr_traits<event_attr_id, attrId>::return_type&
-event_internal::get() const {
-    return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<event_attr_id, attrId>{});
-}
-
-template<event_attr_id attrId,
-             class Value/*,
-             typename T*/>
-typename ccl::details::ccl_api_type_attr_traits<ccl::event_attr_id, attrId>::return_type event_internal::set(const Value& v)
-{
-    return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<event_attr_id, attrId>{});
-}
-
-} // namespace ccl
diff --git a/src/common/event/impls/empty_event.hpp b/src/common/event/impls/empty_event.hpp
index fe1bb7368..04ffcebce 100644
--- a/src/common/event/impls/empty_event.hpp
+++ b/src/common/event/impls/empty_event.hpp
@@ -13,34 +13,32 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "common/event/impls/event_impl.hpp"
-
-namespace ccl {
-
-class empty_event_impl final : public event_impl {
-public:
-    empty_event_impl() = default;
-
-    void wait() override {
-
-    }
-
-    bool test() override {
-        return true;
-    }
-
-    bool cancel() override {
-        return true;
-    }
-
-    event::native_t& get_native() override {
-        throw ccl::exception(std::string(__FUNCTION__) + " - no native event for empty event");
-    }
-
-    ~empty_event_impl() override = default;
-};
-
-} // namespace ccl
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "common/event/impls/event_impl.hpp"
+
+namespace ccl {
+
+class empty_event_impl final : public event_impl { // event_impl with no native event behind it
+public:
+    empty_event_impl() = default;
+
+    void wait() override {} // nothing to wait for: returns immediately
+
+    bool test() override { // unconditionally returns true
+        return true;
+    }
+
+    bool cancel() override { // unconditionally returns true
+        return true;
+    }
+
+    event::native_t& get_native() override { // always throws: there is no native event
+        throw ccl::exception(std::string(__FUNCTION__) + " - no native event for empty event");
+    }
+
+    ~empty_event_impl() override = default;
+};
+
+} // namespace ccl
diff --git a/src/common/event/impls/event_impl.hpp b/src/common/event/impls/event_impl.hpp
index ae3b6117c..1dccf8ef5 100644
--- a/src/common/event/impls/event_impl.hpp
+++ b/src/common/event/impls/event_impl.hpp
@@ -13,20 +13,20 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#include "oneapi/ccl/native_device_api/export_api.hpp"
-#include "oneapi/ccl/ccl_event.hpp"
-
-namespace ccl {
-
-class event_impl {
-public:
-    virtual void wait() = 0;
-    virtual bool test() = 0;
-    virtual bool cancel() = 0;
-    virtual event::native_t& get_native() = 0;
-    virtual ~event_impl() = default;
-};
-
-} // namespace ccl
+#pragma once
+
+#include "oneapi/ccl/native_device_api/export_api.hpp"
+#include "oneapi/ccl/event.hpp"
+
+namespace ccl {
+
+class event_impl { // abstract interface implemented by the concrete event classes
+public:
+    virtual void wait() = 0;
+    virtual bool test() = 0;
+    virtual bool cancel() = 0;
+    virtual event::native_t& get_native() = 0; // may throw when there is no native event
+    virtual ~event_impl() = default; // virtual so deletion through event_impl* is well-defined
+};
+
+} // namespace ccl
diff --git a/src/common/event/impls/host_event.hpp b/src/common/event/impls/host_event.hpp
index d6834034b..523c762ac 100644
--- a/src/common/event/impls/host_event.hpp
+++ b/src/common/event/impls/host_event.hpp
@@ -14,8 +14,8 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
 #include "common/event/impls/event_impl.hpp"
 
 class ccl_request;
diff --git a/src/common/event/impls/native_event.cpp b/src/common/event/impls/native_event.cpp
index 6454dceab..f5effe2e2 100644
--- a/src/common/event/impls/native_event.cpp
+++ b/src/common/event/impls/native_event.cpp
@@ -13,44 +13,41 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "common/event/impls/native_event.hpp"
-#include "common/log/log.hpp"
-
-namespace ccl {
-
-native_event_impl::native_event_impl(event::native_t& native_event, ccl::library_version version)
-    : ev(new ccl_event(native_event, version)) {
-}
-
-void native_event_impl::wait() {
-    if (!completed) {
-        #ifdef CCL_ENABLE_SYCL
-            auto native_event = ev->get_attribute_value(
-                                        details::ccl_api_type_attr_traits<ccl::event_attr_id,
-                                        ccl::event_attr_id::native_handle>{});
-            native_event.wait();
-        #else
-            throw ccl::exception(std::string(__FUNCTION__) + " - is not implemented");
-        #endif
-        completed = true;
-    }
-}
-
-bool native_event_impl::test() {
-    if (!completed) {
-        throw ccl::exception(std::string(__FUNCTION__) + " - is not implemented");
-    }
-    return completed;
-}
-
-bool native_event_impl::cancel() {
-    throw ccl::exception(std::string(__FUNCTION__) + " - is not implemented");
-}
-
-event::native_t& native_event_impl::get_native() {
-    return ev->get_attribute_value(
-                    details::ccl_api_type_attr_traits<ccl::event_attr_id,
-                    ccl::event_attr_id::native_handle>{});
-}
-
-} // namespace ccl
+#include "common/event/impls/native_event.hpp"
+#include "common/log/log.hpp"
+
+namespace ccl {
+// Takes ownership of the ccl_event that backs this implementation.
+native_event_impl::native_event_impl(std::unique_ptr<ccl_event> ev) : ev(std::move(ev)) {}
+
+void native_event_impl::wait() {
+    if (completed) { // a previous wait() already finished; nothing left to do
+        return;
+    }
+#ifdef CCL_ENABLE_SYCL
+    auto native_ev = ev->get_attribute_value(
+        detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::native_handle>{});
+    native_ev.wait();
+#else
+    throw ccl::exception(std::string(__FUNCTION__) + " - is not implemented");
+#endif
+    completed = true; // reached only when the native wait returned without throwing
+}
+
+bool native_event_impl::test() {
+    if (completed) { // wait() has already finished for this event
+        return true;
+    }
+    throw ccl::exception(std::string(__FUNCTION__) + " - is not implemented");
+}
+
+bool native_event_impl::cancel() { // not supported for native events: always throws
+    throw ccl::exception(std::string(__FUNCTION__) + " - is not implemented");
+}
+
+event::native_t& native_event_impl::get_native() { // exposes ev's native_handle attribute
+    return ev->get_attribute_value(
+        detail::ccl_api_type_attr_traits<ccl::event_attr_id, ccl::event_attr_id::native_handle>{});
+}
+
+} // namespace ccl
diff --git a/src/common/event/impls/native_event.hpp b/src/common/event/impls/native_event.hpp
index 1bdea0c47..2a589eb3f 100644
--- a/src/common/event/impls/native_event.hpp
+++ b/src/common/event/impls/native_event.hpp
@@ -13,27 +13,27 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "common/event/impls/event_impl.hpp"
-#include "common/event/ccl_event.hpp"
-
-namespace ccl {
-
-class native_event_impl final : public event_impl {
-public:
-    explicit native_event_impl(event::native_t& native_event, ccl::library_version version);
-    ~native_event_impl() override = default;
-
-    void wait() override;
-    bool test() override;
-    bool cancel() override;
-    event::native_t& get_native() override;
-
-private:
-    std::unique_ptr<ccl_event> ev = nullptr;
-    bool completed = false;
-};
-
-} // namespace ccl
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "common/event/impls/event_impl.hpp"
+#include "common/event/ccl_event.hpp"
+
+namespace ccl {
+
+class native_event_impl final : public event_impl { // event_impl backed by an owned ccl_event
+public:
+    explicit native_event_impl(std::unique_ptr<ccl_event> ev); // takes ownership of ev
+    ~native_event_impl() override = default;
+
+    void wait() override;
+    bool test() override;
+    bool cancel() override;
+    event::native_t& get_native() override;
+
+private:
+    std::unique_ptr<ccl_event> ev = nullptr;
+    bool completed = false; // set by wait() once the native event has been waited on
+};
+
+} // namespace ccl
diff --git a/src/common/event/impls/scoped_event.hpp b/src/common/event/impls/scoped_event.hpp
index 1c655a854..edcb7c32d 100644
--- a/src/common/event/impls/scoped_event.hpp
+++ b/src/common/event/impls/scoped_event.hpp
@@ -14,8 +14,8 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_event.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/event.hpp"
 #include "common/utils/tuple.hpp"
 #include "common/event/impls/event_impl.hpp"
 
@@ -98,14 +98,13 @@ class scoped_event_impl final : public chargeable_event {
     impl_t impl;
 };
 
-namespace details {
+namespace detail {
 template <class event_impl_t, class... scoped_args>
 std::unique_ptr<scoped_event_impl<event_impl_t, scoped_args...>> make_unique_scoped_event(
     ccl_request* r,
     scoped_args&&... args) {
     return std::unique_ptr<scoped_event_impl<event_impl_t, scoped_args...>>(
-        new scoped_event_impl<event_impl_t, scoped_args...>(
-            r, std::forward<scoped_args>(args)...));
+        new scoped_event_impl<event_impl_t, scoped_args...>(r, std::forward<scoped_args>(args)...));
 }
 
 template <class event_impl_t, class... scoped_args>
@@ -118,15 +117,14 @@ std::unique_ptr<scoped_event_impl<event_impl_t, scoped_args...>> make_unique_sco
 }
 
 template <class event_impl_t, class operation, class... scoped_args, class... non_scoped_args>
-std::unique_ptr<chargeable_event> make_and_charge_scoped_event(
-    operation op,
-    std::tuple<scoped_args...>&& args,
-    non_scoped_args&&... elapsed_args) {
+std::unique_ptr<chargeable_event> make_and_charge_scoped_event(operation op,
+                                                               std::tuple<scoped_args...>&& args,
+                                                               non_scoped_args&&... elapsed_args) {
     auto typed_arg = make_unique_scoped_event<event_impl_t>(
         nullptr, std::forward<std::tuple<scoped_args...>>(args));
     typed_arg->charge_by_op(op, std::forward<non_scoped_args>(elapsed_args)...);
     return typed_arg;
 }
 
-} // namespace details
+} // namespace detail
 } // namespace ccl
diff --git a/src/common/global/global.cpp b/src/common/global/global.cpp
index af43c0c02..e970a84af 100644
--- a/src/common/global/global.cpp
+++ b/src/common/global/global.cpp
@@ -51,7 +51,7 @@ env_data& global_data::env() {
     return get().env_object;
 }
 
-ccl_status_t global_data::reset() {
+ccl::status global_data::reset() {
     /*
         executor is resize_dependent object but out of regular reset procedure
         executor is responsible for resize logic and has own multi-step reset
@@ -60,17 +60,17 @@ ccl_status_t global_data::reset() {
     reset_resize_dependent_objects();
     reset_resize_independent_objects();
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t global_data::init() {
+ccl::status global_data::init() { // parse env, then create both groups of global objects
     env_object.parse();
     env_object.set_internal_env();
 
     init_resize_dependent_objects();
     init_resize_independent_objects();
 
-    return ccl_status_success;
+    return ccl::status::success; // the only status this function itself returns
 }
 
 void global_data::init_resize_dependent_objects() {
@@ -97,9 +97,6 @@ void global_data::init_resize_independent_objects() {
 
     algorithm_selector->init();
 
-    default_coll_attr.reset(new ccl_coll_attr_t{});
-    memset(default_coll_attr.get(), 0, sizeof(ccl_coll_attr_t));
-
     bf16_impl_type = ccl_bf16_get_impl_type();
 }
 
@@ -113,7 +110,6 @@ void global_data::reset_resize_dependent_objects() {
 void global_data::reset_resize_independent_objects() {
     parallelizer.reset();
     algorithm_selector.reset();
-    default_coll_attr.reset();
 }
 
 } /* namespace ccl */
diff --git a/src/common/global/global.hpp b/src/common/global/global.hpp
index e3b6d9146..1f4aada25 100644
--- a/src/common/global/global.hpp
+++ b/src/common/global/global.hpp
@@ -21,6 +21,7 @@
 #include "common/env/env.hpp"
 #include "common/utils/utils.hpp"
 #include "common/comm/l0/comm_context_storage.hpp"
+#include "internal_types.hpp"
 
 #include <memory>
 #include <thread>
@@ -28,15 +29,15 @@
 #define COMMON_CATCH_BLOCK() \
     catch (ccl::exception & ccl_e) { \
         LOG_ERROR("ccl internal error: ", ccl_e.what()); \
-        return ccl_status_invalid_arguments; \
+        return ccl::status::invalid_arguments; \
     } \
     catch (std::exception & e) { \
         LOG_ERROR("error: ", e.what()); \
-        return ccl_status_runtime_error; \
+        return ccl::status::runtime_error; \
     } \
     catch (...) { \
         LOG_ERROR("general error"); \
-        return ccl_status_runtime_error; \
+        return ccl::status::runtime_error; \
     }
 
 class ccl_comm;
@@ -82,8 +83,8 @@ class global_data {
 
     ~global_data();
 
-    ccl_status_t init();
-    ccl_status_t reset();
+    ccl::status init();
+    ccl::status reset();
 
     static global_data& get();
     static env_data& env();
@@ -96,7 +97,6 @@ class global_data {
     std::shared_ptr<ccl_comm> comm;
     std::unique_ptr<ccl_datatype_storage> dtypes;
     std::unique_ptr<ccl_executor> executor;
-    std::unique_ptr<ccl_coll_attr_t> default_coll_attr; // TODO: use ccl_coll_attr
     std::unique_ptr<ccl_sched_cache> sched_cache;
     std::unique_ptr<ccl_parallelizer> parallelizer;
     std::unique_ptr<ccl_fusion_manager> fusion_manager;
@@ -123,7 +123,7 @@ class global_data {
     { \
         do { \
             if (unlikely(ccl::global_data::get().executor->is_locked)) { \
-                return ccl_status_blocked_due_to_resize; \
+                return ccl::status::blocked_due_to_resize; \
             } \
         } while (0); \
     }
diff --git a/src/common/log/log.hpp b/src/common/log/log.hpp
index 7994d5b0f..5912008ae 100644
--- a/src/common/log/log.hpp
+++ b/src/common/log/log.hpp
@@ -23,10 +23,12 @@
 #include <mutex>
 #include <sstream>
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/utils/spinlock.hpp"
 #include "common/utils/utils.hpp"
 
+std::ostream& operator<<(std::ostream& os, const ccl::datatype& dt);
+
 #define __FILENAME__ \
     ({ \
         const char* ptr = strrchr(__FILE__, '/'); \
diff --git a/src/common/stream/stream.cpp b/src/common/stream/stream.cpp
index 9c9fea05b..d313c7575 100644
--- a/src/common/stream/stream.cpp
+++ b/src/common/stream/stream.cpp
@@ -13,92 +13,143 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#include "common/global/global.hpp"
 #include "common/log/log.hpp"
 #include "common/stream/stream.hpp"
 #include "common/stream/stream_provider_dispatcher_impl.hpp"
 #include "oneapi/ccl/native_device_api/export_api.hpp"
 #include "unified_context_impl.hpp"
 
-#ifdef MULTI_GPU_SUPPORT
-#ifdef CCL_ENABLE_SYCL
-template std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    cl::sycl::queue& native_stream,
-    const ccl::library_version& version);
-template std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    cl_command_queue& native_stream_handle,
-    const ccl::library_version& version);
-#else
-template std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    std::shared_ptr<native::ccl_device::device_queue>& native_stream,
-    const ccl::library_version& version);
-template std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    ze_command_queue_handle_t& native_stream_handle,
-    const ccl::library_version& version);
-#endif
-#else
+ccl_stream::ccl_stream(stream_type type, // build a stream around an existing native stream
+                       stream_native_t& stream,
+                       const ccl::library_version& version)
+        : type(type),
+          version(version) {
+    native_stream = stream; // store the caller's native stream
+
 #ifdef CCL_ENABLE_SYCL
-template std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    cl::sycl::queue& native_stream,
-    const ccl::library_version& version);
-template std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    cl_command_queue& native_stream,
-    const ccl::library_version& version);
-#else
-template std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    typename ccl::unified_stream_type::ccl_native_t& native_stream,
-    const ccl::library_version& version);
-#endif
-#endif
+    native_streams.resize(ccl::global_data::env().worker_count); // one stream per worker
+    for (size_t idx = 0; idx < native_streams.size(); idx++) { // same context + device as stream
+        native_streams[idx] = stream_native_t(stream.get_context(), stream.get_device());
+    }
+#endif /* CCL_ENABLE_SYCL */
+}
+
+ccl_stream::ccl_stream(stream_type type, // from a raw native handle: unsupported, always throws
+                       stream_native_handle_t handle,
+                       const ccl::library_version& version)
+        : type(type),
+          version(version) {
+    creation_is_postponed = true; // has no lasting effect: the throw below aborts construction
+    (void)handle; // handle is intentionally unused
+    throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + " - unsupported ");
+}
+
+ccl_stream::ccl_stream(stream_type type, const ccl::library_version& version) // deferred creation
+        : type(type),
+          version(version) {
+    creation_is_postponed = true; // build_from_params() throws unless this flag is set
+    LOG_DEBUG("Scheduled postponed stream creation");
+}
 
 void ccl_stream::build_from_params() {
     if (!creation_is_postponed) {
-        throw ccl::exception("error");
+        throw ccl::exception(std::string(__FUNCTION__) +
+                             " - incorrect usage, stream is not sheduled for postponed creation");
     }
 
-    type = ccl_stream_host;
+    type = stream_type::host;
     try {
 #ifdef CCL_ENABLE_SYCL
-        if (is_context_enabled) {
-            stream_native_t stream_candidate{ native_context, native_device };
+        if (native_context.first) {
+            if (!native_device.first) {
+                throw ccl::exception(
+                    std::string(__FUNCTION__) +
+                    " - incorrect usage, not enough parameters for stream creation: "
+                    " context is available, but device is not. Both required");
+            }
+
+            LOG_DEBUG("create stream from device & context");
+            stream_native_t stream_candidate{ native_context.second, native_device.second };
             std::swap(stream_candidate,
-                      native_stream); //TODO USE attributes fro sycl queue construction
+                      native_stream); //TODO USE attributes from sycl queue construction
         }
-        else {
-            stream_native_t stream_candidate{ native_device };
+        else if (native_device.first) {
+            LOG_DEBUG("create stream from device only");
+            stream_native_t stream_candidate{ native_device.second };
             std::swap(stream_candidate,
-                      native_stream); //TODO USE attributes fro sycl queue construction
+                      native_stream); //TODO USE attributes from sycl queue construction
+
+            native_context.second = native_stream.get_context();
+            native_context.first = true;
+        }
+        else {
+            throw ccl::exception(std::string(__FUNCTION__) +
+                                 " - incorrect usage, not enough parameters for stream creation: "
+                                 " context is empty and device is empty too.");
         }
 
         //override type
-        if (native_stream.get_device().is_host())
-        {
-            type = ccl_stream_host;
+        if (native_stream.get_device().is_host()) {
+            type = stream_type::host;
         }
-        else if(native_stream.get_device().is_cpu())
-        {
-            type = ccl_stream_cpu;
+        else if (native_stream.get_device().is_cpu()) {
+            type = stream_type::cpu;
         }
-        else if(native_stream.get_device().is_gpu())
-        {
-            type = ccl_stream_gpu;
+        else if (native_stream.get_device().is_gpu()) {
+            type = stream_type::gpu;
         }
-        else
-        {
-            throw ccl::invalid_argument("CORE", "create_stream", std::string("Unsupported SYCL queue's device type for postponed creation:\n") +
-                                        native_stream.get_device().template get_info<cl::sycl::info::device::name>() +
-                                        std::string("Supported types: host, cpu, gpu"));
+        else {
+            throw ccl::invalid_argument(
+                "CORE",
+                "create_stream",
+                std::string("Unsupported SYCL queue's device type for postponed creation:\n") +
+                    native_stream.get_device().template get_info<cl::sycl::info::device::name>() +
+                    std::string("Supported types: host, cpu, gpu"));
         }
-    LOG_INFO("SYCL queue type from postponed creation: ", static_cast<int>(type), " device: ",
-             native_stream.get_device().template get_info<cl::sycl::info::device::name>());
+        LOG_INFO("SYCL queue type from postponed creation: ",
+                 static_cast<int>(type),
+                 " device: ",
+                 native_stream.get_device().template get_info<cl::sycl::info::device::name>());
 #else
-    #ifdef MULTI_GPU_SUPPORT
+#ifdef MULTI_GPU_SUPPORT
         ze_command_queue_desc_t descr =
             stream_native_device_t::element_type::get_default_queue_desc();
 
-        //TODO use attributes
-        native_device->create_cmd_queue(descr);
-        type = ccl_stream_gpu;
-    #endif
+        //TODO use attributes....
+        //Create from device & context
+        if (native_context.first) {
+            if (!native_device.first) {
+                throw ccl::exception(
+                    std::string(__FUNCTION__) +
+                    " - incorrect usage, not enough parameters for stream creation: "
+                    " context is available, but device is not. Both required");
+            }
+
+            LOG_DEBUG("create stream from device & context");
+            auto stream_candidate =
+                native_device.second->create_cmd_queue(native_context.second, descr);
+            native_stream = std::make_shared<typename ccl::unified_stream_type::impl_t>(
+                std::move(stream_candidate));
+        }
+        else if (native_device.first) {
+            LOG_DEBUG("create stream from device only");
+
+            auto stream_candidate = native_device.second->create_cmd_queue({}, descr);
+            native_stream = std::make_shared<typename ccl::unified_stream_type::impl_t>(
+                std::move(stream_candidate));
+
+            native_context.second = native_stream->get_ctx().lock();
+            native_context.first = true;
+        }
+        else {
+            throw ccl::exception(std::string(__FUNCTION__) +
+                                 " - incorrect usage, not enough parameters for stream creation: "
+                                 " context is empty and device is empty too.");
+        }
+
+        type = stream_type::gpu;
+#endif
 #endif
     }
     catch (const std::exception& ex) {
@@ -123,23 +174,27 @@ const typename ccl_stream::version_traits_t::return_type& ccl_stream::get_attrib
 
 typename ccl_stream::native_handle_traits_t::return_type& ccl_stream::get_attribute_value(
     const native_handle_traits_t& id) {
-    /*
-    if (!native_stream_set)
-    {
-        throw  ccl::exception("native stream is not set");
+    if (creation_is_postponed) { // stream is still flagged for postponed creation
+        throw ccl::exception(std::string(__FUNCTION__) + " - stream is not properly created yet");
     }
-*/
+
     return native_stream;
 }
 
 typename ccl_stream::device_traits_t::return_type& ccl_stream::get_attribute_value(
     const device_traits_t& id) {
-    return native_device;
+    if (!native_device.first) { // .first is the "value is set" flag of the bool/value pair
+        throw ccl::exception(std::string(__FUNCTION__) + " - stream has no native device");
+    }
+    return native_device.second; // .second is the stored device
 }
 
 typename ccl_stream::context_traits_t::return_type& ccl_stream::get_attribute_value(
     const context_traits_t& id) {
-    return native_context;
+    if (!native_context.first) { // .first is the "value is set" flag of the bool/value pair
+        throw ccl::exception(std::string(__FUNCTION__) + " - stream has no native context");
+    }
+    return native_context.second; // .second is the stored context
 }
 
 typename ccl_stream::context_traits_t::return_type& ccl_stream::set_attribute_value(
@@ -148,20 +203,21 @@ typename ccl_stream::context_traits_t::return_type& ccl_stream::set_attribute_va
     if (!creation_is_postponed) {
         throw ccl::exception("Cannot set 'ccl::stream_attr_id::context'`for constructed stream");
     }
-    std::swap(native_context, val);
-    return native_context;
+    std::swap(native_context.second, val);
+    native_context.first = true;
+    return native_context.second;
 }
-
+/*
 typename ccl_stream::context_traits_t::return_type& ccl_stream::set_attribute_value(
     typename context_traits_t::handle_t val,
     const context_traits_t& t) {
     if (!creation_is_postponed) {
         throw ccl::exception("Cannot set 'ccl::stream_attr_id::context'`for constructed stream");
     }
-    native_context = ccl::unified_device_context_type{ val }.get(); //context_traits_t::type
-    is_context_enabled = true;
-    return native_context;
-}
+    native_context.second = ccl::unified_context_type{ val }.get(); //context_traits_t::type
+    native_context.first = true;
+    return native_context.second;
+}*/
 
 typename ccl_stream::ordinal_traits_t::type ccl_stream::set_attribute_value(
     typename ordinal_traits_t::type val,
diff --git a/src/common/stream/stream.hpp b/src/common/stream/stream.hpp
index 3822e9c3b..65dc6e1cb 100644
--- a/src/common/stream/stream.hpp
+++ b/src/common/stream/stream.hpp
@@ -14,33 +14,38 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
 #include "common/utils/utils.hpp"
 #include "common/stream/stream_provider_dispatcher.hpp"
 
 #include "coll/coll_common_attributes.hpp"
+#include "internal_types.hpp"
 
 namespace ccl {
-class environment; //friend-zone
+namespace detail {
+class environment;
 }
+} // namespace ccl
+
 /*
-ccl_status_t CCL_API ccl_stream_create(ccl_stream_type_t type,
+ccl::status CCL_API ccl_stream_create(stream_type type,
                                           void* native_stream,
                                           ccl_stream_t* stream);
 */
 class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
 public:
     friend class stream_provider_dispatcher;
-    friend class ccl::environment;
+    friend class ccl::detail::environment;
     /*
-    friend ccl_status_t CCL_API ccl_stream_create(ccl_stream_type_t type,
+    friend ccl::status CCL_API ccl_stream_create(stream_type type,
                                void* native_stream,
                                ccl_stream_t* stream);*/
     using stream_native_t = stream_provider_dispatcher::stream_native_t;
+    using stream_native_handle_t = stream_provider_dispatcher::stream_native_handle_t;
 
     ccl_stream() = delete;
     ccl_stream(const ccl_stream& other) = delete;
@@ -50,11 +55,12 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
 
     using stream_provider_dispatcher::get_native_stream;
 
-    ccl_stream_type_t get_type() const {
+    stream_type get_type() const {
         return type;
     }
+
     bool is_sycl_device_stream() const {
-        return (type == ccl_stream_cpu || type == ccl_stream_gpu);
+        return (type == stream_type::cpu || type == stream_type::gpu);
     }
 
     static std::unique_ptr<ccl_stream> create(stream_native_t& native_stream,
@@ -62,7 +68,7 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
 
     //Export Attributes
     using version_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::version>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::version>;
     typename version_traits_t::return_type set_attribute_value(typename version_traits_t::type val,
                                                                const version_traits_t& t);
 
@@ -70,28 +76,28 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
         const version_traits_t& id) const;
 
     using native_handle_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id,
-                                               ccl::stream_attr_id::native_handle>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id,
+                                              ccl::stream_attr_id::native_handle>;
     typename native_handle_traits_t::return_type& get_attribute_value(
         const native_handle_traits_t& id);
 
     using device_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::device>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::device>;
     typename device_traits_t::return_type& get_attribute_value(const device_traits_t& id);
 
     using context_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::context>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::context>;
     typename context_traits_t::return_type& get_attribute_value(const context_traits_t& id);
 
     typename context_traits_t::return_type& set_attribute_value(typename context_traits_t::type val,
                                                                 const context_traits_t& t);
-
+    /*
     typename context_traits_t::return_type& set_attribute_value(
         typename context_traits_t::handle_t val,
         const context_traits_t& t);
-
+*/
     using ordinal_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::ordinal>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::ordinal>;
     typename ordinal_traits_t::return_type set_attribute_value(typename ordinal_traits_t::type val,
                                                                const ordinal_traits_t& t);
 
@@ -99,28 +105,28 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
         const ordinal_traits_t& id) const;
 
     using index_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::index>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::index>;
     typename index_traits_t::return_type set_attribute_value(typename index_traits_t::type val,
                                                              const index_traits_t& t);
 
     const typename index_traits_t::return_type& get_attribute_value(const index_traits_t& id) const;
 
     using flags_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::flags>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::flags>;
     typename flags_traits_t::return_type set_attribute_value(typename flags_traits_t::type val,
                                                              const flags_traits_t& t);
 
     const typename flags_traits_t::return_type& get_attribute_value(const flags_traits_t& id) const;
 
     using mode_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::mode>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::mode>;
     typename mode_traits_t::return_type set_attribute_value(typename mode_traits_t::type val,
                                                             const mode_traits_t& t);
 
     const typename mode_traits_t::return_type& get_attribute_value(const mode_traits_t& id) const;
 
     using priority_traits_t =
-        ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::priority>;
+        ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, ccl::stream_attr_id::priority>;
     typename priority_traits_t::return_type set_attribute_value(
         typename priority_traits_t::type val,
         const priority_traits_t& t);
@@ -131,33 +137,17 @@ class alignas(CACHELINE_SIZE) ccl_stream : public stream_provider_dispatcher {
     void build_from_params();
 
 private:
-    template <
-        class NativeStream,
-        typename std::enable_if<std::is_class<typename std::remove_cv<NativeStream>::type>::value,
-                                int>::type = 0>
-    ccl_stream(ccl_stream_type_t stream_type,
-               NativeStream& native_stream,
-               const ccl::library_version& version)
-            : stream_provider_dispatcher(native_stream),
-              type(stream_type),
-              version(version) {}
-    template <class NativeStreamHandle,
-              typename std::enable_if<
-                  not std::is_class<typename std::remove_cv<NativeStreamHandle>::type>::value,
-                  int>::type = 0>
-    ccl_stream(ccl_stream_type_t stream_type,
-               NativeStreamHandle native_stream,
-               const ccl::library_version& version)
-            : stream_provider_dispatcher(native_stream),
-              type(stream_type),
-              version(version) {}
-
-    ccl_stream(ccl_stream_type_t stream_type, const ccl::library_version& version)
-            : stream_provider_dispatcher(),
-              type(stream_type),
-              version(version) {}
-
-    ccl_stream_type_t type;
+    ccl_stream(stream_type type,
+               stream_native_t& native_stream,
+               const ccl::library_version& version);
+
+    ccl_stream(stream_type type,
+               stream_native_handle_t native_stream,
+               const ccl::library_version& version);
+
+    ccl_stream(stream_type type, const ccl::library_version& version);
+
+    stream_type type;
     const ccl::library_version version;
     typename ordinal_traits_t::return_type ordinal_val;
     typename index_traits_t::return_type index_val;
diff --git a/src/common/stream/stream_provider_dispatcher.hpp b/src/common/stream/stream_provider_dispatcher.hpp
index bf7957998..e8a508524 100644
--- a/src/common/stream/stream_provider_dispatcher.hpp
+++ b/src/common/stream/stream_provider_dispatcher.hpp
@@ -22,81 +22,62 @@
 #include <CL/sycl.hpp>
 #endif
 
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
-class ccl_stream;
-/*
-#ifdef MULTI_GPU_SUPPORT
-namespace native
-{
-    class ccl_device;
-}
-#endif
-*/
+/**
+ * Supported stream types; stored in ccl_stream and reported by ccl_stream::get_type()
+ */
+enum class stream_type : int {
+    host = 0, // initial value used by the stream factories; also chosen for SYCL host devices
+    cpu, // chosen when the SYCL device of the native stream reports is_cpu()
+    gpu, // chosen when the SYCL device reports is_gpu(), or for MULTI_GPU_SUPPORT queues
+
+    last_value // NOTE(review): presumably an end-of-enum sentinel, not a real type - confirm
+};
 
+class ccl_stream;
 class stream_provider_dispatcher {
 public:
-#ifdef MULTI_GPU_SUPPORT
-    using stream_native_device_t = typename ccl::unified_device_type::ccl_native_t;
-    using stream_native_context_t = typename ccl::unified_device_context_type::ccl_native_t;
-    using stream_native_t = typename ccl::unified_stream_type::ccl_native_t;
-    using stream_native_handle_t = typename ccl::unified_stream_type::handle_t;
-#else
-#ifdef CCL_ENABLE_SYCL
-    using stream_native_t = cl::sycl::queue;
-    using stream_native_device_t = cl::sycl::device;
-    using stream_native_context_t = typename ccl::unified_device_context_type::ccl_native_t;
     using stream_native_handle_t = typename ccl::unified_stream_type::handle_t;
-#else
-    using stream_native_handle_t = typename ccl::unified_stream_type::handle_t;
-    using stream_native_t =typename ccl::unified_stream_type::ccl_native_t;
+    using stream_native_t = typename ccl::unified_stream_type::ccl_native_t;
     using stream_native_device_t = typename ccl::unified_device_type::ccl_native_t;
-    using stream_native_context_t = typename ccl::unified_device_context_type::ccl_native_t;
-#endif
-#endif
+    // native context type taken from ccl::unified_context_type
+    using stream_native_context_t = typename ccl::unified_context_type::ccl_native_t;
+
     stream_native_t get_native_stream() const;
 
+#ifdef CCL_ENABLE_SYCL
+    stream_native_t get_native_stream(size_t idx) const; // NOTE(review): idx presumably selects from native_streams
+#endif /* CCL_ENABLE_SYCL */
+
     const stream_native_device_t& get_native_device() const;
     stream_native_device_t& get_native_device();
 
     std::string to_string() const;
 
-    template <
-        class NativeStream,
-        typename std::enable_if<std::is_class<typename std::remove_cv<NativeStream>::type>::value,
-                                int>::type = 0>
-    static std::unique_ptr<ccl_stream> create(NativeStream& native_stream,
+    // supported ways to create a stream: native stream, native handle, device, device + context
+    static std::unique_ptr<ccl_stream> create(stream_native_t& native_stream,
                                               const ccl::library_version& version);
-
-    template <class NativeStreamHandle,
-              typename std::enable_if<
-                  not std::is_class<typename std::remove_cv<NativeStreamHandle>::type>::value,
-                  int>::type = 0>
-    static std::unique_ptr<ccl_stream> create(NativeStreamHandle& native_stream,
+    static std::unique_ptr<ccl_stream> create(stream_native_handle_t native_handle,
                                               const ccl::library_version& version);
-
     static std::unique_ptr<ccl_stream> create(stream_native_device_t device,
                                               const ccl::library_version& version);
     static std::unique_ptr<ccl_stream> create(stream_native_device_t device,
                                               stream_native_context_t context,
                                               const ccl::library_version& version);
+    template <class T>
+    using optional = std::pair<bool, T>; // first: "value is set" flag, second: the value
 
 protected:
-    template <
-        class NativeStream,
-        typename std::enable_if<std::is_class<typename std::remove_cv<NativeStream>::type>::value,
-                                int>::type = 0>
-    stream_provider_dispatcher(NativeStream& stream);
-
-    template <class NativeStreamHandle,
-              typename std::enable_if<
-                  not std::is_class<typename std::remove_cv<NativeStreamHandle>::type>::value,
-                  int>::type = 0>
-    stream_provider_dispatcher(NativeStreamHandle stream);
-    stream_provider_dispatcher();
+    optional<stream_native_device_t> native_device; // .first is false until a device is set
+    optional<stream_native_context_t> native_context; // .first is false until a context is set
 
-    stream_native_device_t native_device;
-    stream_native_context_t native_context;
     bool creation_is_postponed{ false };
+
     stream_native_t native_stream;
+
+#ifdef CCL_ENABLE_SYCL
+    /* FIXME: tmp w/a for MT support in queue */
+    std::vector<stream_native_t> native_streams; // sized to env().worker_count by ccl_stream
+#endif /* CCL_ENABLE_SYCL */
 };
diff --git a/src/common/stream/stream_provider_dispatcher_impl.hpp b/src/common/stream/stream_provider_dispatcher_impl.hpp
index 9b97ceb2b..9c265d526 100644
--- a/src/common/stream/stream_provider_dispatcher_impl.hpp
+++ b/src/common/stream/stream_provider_dispatcher_impl.hpp
@@ -22,65 +22,70 @@
 #endif /* CCL_ENABLE_SYCL */
 
 // Creation from class-type: cl::sycl::queue or native::ccl_device::devie_queue
-template <class NativeStream,
-          typename std::enable_if<std::is_class<typename std::remove_cv<NativeStream>::type>::value,
-                                  int>::type>
 std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    NativeStream& native_stream,
+    stream_native_t& native_stream,
     const ccl::library_version& version) {
-    static_assert(std::is_same<NativeStream, stream_native_t>::value, "Unsupported 'NativeStream'");
-
-    ccl_stream_type_t type = ccl_stream_host;
+    stream_type type = stream_type::host; // stays host unless a backend branch below overrides it
 #ifdef CCL_ENABLE_SYCL
-    if (native_stream.get_device().is_host())
-    {
-        type = ccl_stream_host;
+    if (native_stream.get_device().is_host()) {
+        type = stream_type::host;
     }
-    else if(native_stream.get_device().is_cpu())
-    {
-        type = ccl_stream_cpu;
+    else if (native_stream.get_device().is_cpu()) {
+        type = stream_type::cpu;
     }
-    else if(native_stream.get_device().is_gpu())
-    {
-        type = ccl_stream_gpu;
+    else if (native_stream.get_device().is_gpu()) {
+        type = stream_type::gpu;
     }
-    else
-    {
-        throw ccl::invalid_argument("CORE", "create_stream", std::string("Unsupported SYCL queue's device type:\n") +
-                                    native_stream.get_device().template get_info<cl::sycl::info::device::name>() +
-                                    std::string("Supported types: host, cpu, gpu"));
+    else {
+        throw ccl::invalid_argument(
+            "CORE",
+            "create_stream",
+            std::string("Unsupported SYCL queue's device type:\n") +
+                native_stream.get_device().template get_info<cl::sycl::info::device::name>() +
+                std::string("\nSupported types: host, cpu, gpu"));
     }
-    LOG_INFO("SYCL queue type: ", static_cast<int>(type), " device: ",
+
+    std::unique_ptr<ccl_stream> ret(new ccl_stream(type, native_stream, version));
+    ret->native_device.second = native_stream.get_device();
+    ret->native_device.first = true; // flag checked by get_native_device() before returning it
+    ret->native_context.second = native_stream.get_context();
+    ret->native_context.first = true;
+    LOG_INFO("SYCL queue type: ",
+             static_cast<int>(type),
+             " device: ",
              native_stream.get_device().template get_info<cl::sycl::info::device::name>());
+
+#else
+#ifdef MULTI_GPU_SUPPORT
+    LOG_INFO("L0 queue type: gpu - supported only");
+    type = stream_type::gpu;
+    std::unique_ptr<ccl_stream> ret(new ccl_stream(type, native_stream, version));
+    ret->native_device.second = native_stream->get_owner().lock();
+    ret->native_device.first = true;
+    ret->native_context.second = native_stream->get_ctx().lock();
+    ret->native_context.first = true;
 #else
-    #if MULTI_GPU_SUPPORT
-        LOG_INFO("L0 queue type: gpu - supported only");
-        type = ccl_stream_gpu;
-    #endif
+    std::unique_ptr<ccl_stream> ret(new ccl_stream(type, native_stream, version));
+#endif
 #endif /* CCL_ENABLE_SYCL */
 
-    return std::unique_ptr<ccl_stream>(new ccl_stream(type, native_stream, version));
+    return ret;
 }
 
 // Creation from handles: cl_queue or ze_device_queue_handle_t
-template <class NativeStreamHandle,
-          typename std::enable_if<
-              not std::is_class<typename std::remove_cv<NativeStreamHandle>::type>::value,
-              int>::type>
 std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
-    NativeStreamHandle& native_stream,
+    stream_native_handle_t native_stream,
     const ccl::library_version& version) {
-    static_assert(std::is_same<NativeStreamHandle, stream_native_handle_t>::value,
-                  "Unsupported 'NativeStream'");
-    return std::unique_ptr<ccl_stream>(new ccl_stream(ccl_stream_gpu, native_stream, version));
+    return std::unique_ptr<ccl_stream>(new ccl_stream(stream_type::gpu, native_stream, version));
 }
 
 // Postponed creation from device
 std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
     stream_native_device_t device,
     const ccl::library_version& version) {
-    auto ret = std::unique_ptr<ccl_stream>(new ccl_stream(ccl_stream_gpu, version));
-    ret->native_device = device;
+    auto ret = std::unique_ptr<ccl_stream>(new ccl_stream(stream_type::gpu, version));
+    ret->native_device.second = device;
+    ret->native_device.first = true;
     return ret;
 }
 
@@ -90,50 +95,40 @@ std::unique_ptr<ccl_stream> stream_provider_dispatcher::create(
     stream_native_context_t context,
     const ccl::library_version& version) {
     auto ret = stream_provider_dispatcher::create(device, version);
-    ret->native_context = context;
+    ret->native_context.second = context;
+    ret->native_context.first = true;
     return ret;
 }
 
-template <class NativeStream,
-          typename std::enable_if<std::is_class<typename std::remove_cv<NativeStream>::type>::value,
-                                  int>::type>
-stream_provider_dispatcher::stream_provider_dispatcher(NativeStream& stream)
-        : native_stream(stream) {}
-
-template <class NativeStreamHandle,
-          typename std::enable_if<
-              not std::is_class<typename std::remove_cv<NativeStreamHandle>::type>::value,
-              int>::type>
-stream_provider_dispatcher::stream_provider_dispatcher(NativeStreamHandle stream) {
-    creation_is_postponed = true;
-    LOG_INFO("Requested postponed stream creation from native handle");
-    /*
-#ifdef CCL_ENABLE_SYCL
-    native_stream = stream_native_t{stream};
-#else
-    native_stream = ccl::unified_stream_type{stream}.get();
-#endif*/
-}
+stream_provider_dispatcher::stream_native_t stream_provider_dispatcher::get_native_stream() const {
+    if (creation_is_postponed) {
+        throw ccl::exception("native stream is not set");
+    }
 
-stream_provider_dispatcher::stream_provider_dispatcher() {
-    creation_is_postponed = true;
-    LOG_INFO("Requested postponed stream creation from empty");
+    return native_stream;
 }
 
-stream_provider_dispatcher::stream_native_t stream_provider_dispatcher::get_native_stream() const {
+#ifdef CCL_ENABLE_SYCL
+stream_provider_dispatcher::stream_native_t stream_provider_dispatcher::get_native_stream(
+    size_t idx) const {
     if (creation_is_postponed) {
         throw ccl::exception("native stream is not set");
     }
 
-    return native_stream;
+    if (idx >= native_streams.size()) {
+        throw ccl::exception("unexpected stream idx");
+    }
+
+    return native_streams[idx];
 }
+#endif /* CCL_ENABLE_SYCL */
 
 const stream_provider_dispatcher::stream_native_device_t&
 stream_provider_dispatcher::get_native_device() const {
-    if (creation_is_postponed) {
-        throw ccl::exception("native device is not set");
+    if (!native_device.first) {
+        throw ccl::exception(std::string(__FUNCTION__) + " - stream has no native device");
     }
-    return native_device;
+    return native_device.second;
 }
 
 stream_provider_dispatcher::stream_native_device_t&
@@ -144,7 +139,7 @@ stream_provider_dispatcher::get_native_device() {
 
 std::string stream_provider_dispatcher::to_string() const {
     if (creation_is_postponed) {
-        throw ccl::exception("native device is not set");
+        throw ccl::exception("stream is not properly created yet");
     }
     std::stringstream ss;
 #ifdef CCL_ENABLE_SYCL
diff --git a/src/common/utils/buffer.hpp b/src/common/utils/buffer.hpp
index 81aed9d4a..44fcb2e93 100644
--- a/src/common/utils/buffer.hpp
+++ b/src/common/utils/buffer.hpp
@@ -13,195 +13,195 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#include <iostream>
-#include <stddef.h>
-
-#include "common/log/log.hpp"
-
-enum class ccl_buffer_type { DIRECT, INDIRECT };
-
-inline std::ostream& operator<<(std::ostream& os, const ccl_buffer_type& type) {
-    os << static_cast<std::underlying_type<ccl_buffer_type>::type>(type);
-    return os;
-}
-
-class ccl_buffer {
-private:
-    void* src;
-    ssize_t size; /* max available size, for sanity checks */
-    size_t offset;
-    ccl_buffer_type type;
-
-    bool check_offset(size_t access_size = 0) const {
-        bool result = true;
-
-        if ((std::numeric_limits<size_t>::max() - offset) < access_size) {
-            result = false;
-            LOG_ERROR("unexpected (offset + access_size): ",
-                      ", offset ",
-                      offset,
-                      ", access_size ",
-                      access_size);
-        }
-
-        if ((size != -1) && (offset + access_size > (size_t)size)) {
-            result = false;
-            LOG_ERROR("unexpected (offset + access_size): ",
-                      "size ",
-                      size,
-                      ", offset ",
-                      offset,
-                      ", access_size ",
-                      access_size);
-        }
-
-        return result;
-    }
-
-public:
-    ccl_buffer(void* src) = delete;
-
-    ccl_buffer(void* src, ssize_t size, size_t offset, ccl_buffer_type type)
-            : src(src),
-              size(size),
-              offset(offset),
-              type(type) {
-        LOG_DEBUG("create: src ", src, ", size ", size, ", offset ", offset, ", type ", type);
-        CCL_ASSERT(check_offset());
-    }
-
-    ccl_buffer() : ccl_buffer(nullptr, -1, 0, ccl_buffer_type::DIRECT) {}
-    ccl_buffer(void* src, ssize_t size) : ccl_buffer(src, size, 0, ccl_buffer_type::DIRECT) {}
-    ccl_buffer(void* src, ssize_t size, size_t offset)
-            : ccl_buffer(src, size, offset, ccl_buffer_type::DIRECT) {}
-    ccl_buffer(void* src, ssize_t size, ccl_buffer_type type) : ccl_buffer(src, size, 0, type) {}
-
-    ccl_buffer(const ccl_buffer& buf)
-            : src(buf.src),
-              size(buf.size),
-              offset(buf.offset),
-              type(buf.type) {
-        CCL_ASSERT(check_offset());
-    }
-
-    void set(void* src, ssize_t size, size_t offset, ccl_buffer_type type) {
-        LOG_DEBUG("set: src ",
-                  src,
-                  ", size ",
-                  size,
-                  ", offset ",
-                  offset,
-                  ", type ",
-                  type,
-                  ", old src: ",
-                  this->src);
-        CCL_ASSERT(src, "new src is null");
-
-        this->src = src;
-        this->size = size;
-        this->offset = offset;
-        this->type = type;
-
-        CCL_ASSERT(check_offset());
-    }
-
-    void set(void* src) {
-        set(src, -1, 0, ccl_buffer_type::DIRECT);
-    }
-    void set(void* src, ssize_t size) {
-        set(src, size, 0, ccl_buffer_type::DIRECT);
-    }
-    void set(void* src, ssize_t size, ccl_buffer_type type) {
-        set(src, size, 0, type);
-    }
-    void set(void* src, ssize_t size, size_t offset) {
-        set(src, size, offset, ccl_buffer_type::DIRECT);
-    }
-
-    void* get_src() const {
-        return src;
-    }
-    ssize_t get_size() const {
-        return size;
-    }
-    size_t get_offset() const {
-        return offset;
-    }
-    ccl_buffer_type get_type() const {
-        return type;
-    }
-
-    ccl_buffer operator+(size_t val) {
-        return ccl_buffer(src, size, offset + val, type);
-    }
-
-    ccl_buffer operator+(size_t val) const {
-        return ccl_buffer(src, size, offset + val, type);
-    }
-
-    ccl_buffer operator-(size_t val) {
-        CCL_ASSERT(offset >= val, "unexpected decrement value ", val);
-        return ccl_buffer(src, size, offset - val, type);
-    }
-
-    ccl_buffer operator+(int val) {
-        return ccl_buffer(src, size, offset + val, type);
-    }
-
-    ccl_buffer operator-(int val) {
-        CCL_ASSERT(offset >= (size_t)val, "unexpected decrement value ", val);
-        return ccl_buffer(src, size, offset - val, type);
-    }
-
-    ccl_buffer& operator+=(size_t val) {
-        offset += val;
-        CCL_ASSERT(check_offset());
-        return *this;
-    }
-
-    size_t get_difference(ccl_buffer buf) {
-        CCL_ASSERT((get_ptr() >= buf.get_ptr()), "difference between pointers < 0");
-        return (static_cast<char*>(get_ptr()) - static_cast<char*>(buf.get_ptr()));
-    }
-
-    void* get_ptr(size_t access_size = 0) const {
-        CCL_ASSERT(check_offset(access_size));
-
-        if (!src)
-            return nullptr;
-
-        if (type == ccl_buffer_type::DIRECT)
-            return ((char*)src + offset);
-        else {
-            return (*((char**)src)) ? (*((char**)src) + offset) : nullptr;
-        }
-    }
-
-    operator bool() const {
-        if (type == ccl_buffer_type::DIRECT)
-            return src;
-        else
-            return (src && (*(void**)src));
-    }
-
-    bool operator==(ccl_buffer const& other) const {
-        return ((get_ptr() == other.get_ptr()) && (get_type() == other.get_type()));
-    }
-
-    bool operator!=(ccl_buffer const& other) const {
-        return !(*this == other);
-    }
-
-    bool operator>(ccl_buffer const& other) const {
-        CCL_ASSERT(get_type() == other.get_type(), "types should match");
-        return (get_ptr() > other.get_ptr());
-    }
-
-    friend std::ostream& operator<<(std::ostream& out, const ccl_buffer& buf) {
-        out << "(src: " << buf.get_src() << ", size " << buf.get_size() << ", off "
-            << buf.get_offset() << ", type: " << buf.get_type() << ")";
-        return out;
-    }
-};
+#pragma once
+
+#include <iostream>
+#include <stddef.h>
+
+#include "common/log/log.hpp"
+
+enum class ccl_buffer_type { DIRECT, INDIRECT };
+
+inline std::ostream& operator<<(std::ostream& os, const ccl_buffer_type& type) {
+    os << static_cast<std::underlying_type<ccl_buffer_type>::type>(type);
+    return os;
+}
+
+class ccl_buffer {
+private:
+    void* src;
+    ssize_t size; /* max available size, for sanity checks */
+    size_t offset;
+    ccl_buffer_type type;
+
+    bool check_offset(size_t access_size = 0) const {
+        bool result = true;
+
+        if ((std::numeric_limits<size_t>::max() - offset) < access_size) {
+            result = false;
+            LOG_ERROR("unexpected (offset + access_size): ",
+                      ", offset ",
+                      offset,
+                      ", access_size ",
+                      access_size);
+        }
+
+        if ((size != -1) && (offset + access_size > (size_t)size)) {
+            result = false;
+            LOG_ERROR("unexpected (offset + access_size): ",
+                      "size ",
+                      size,
+                      ", offset ",
+                      offset,
+                      ", access_size ",
+                      access_size);
+        }
+
+        return result;
+    }
+
+public:
+    ccl_buffer(void* src) = delete;
+
+    ccl_buffer(void* src, ssize_t size, size_t offset, ccl_buffer_type type)
+            : src(src),
+              size(size),
+              offset(offset),
+              type(type) {
+        LOG_DEBUG("create: src ", src, ", size ", size, ", offset ", offset, ", type ", type);
+        CCL_ASSERT(check_offset());
+    }
+
+    ccl_buffer() : ccl_buffer(nullptr, -1, 0, ccl_buffer_type::DIRECT) {}
+    ccl_buffer(void* src, ssize_t size) : ccl_buffer(src, size, 0, ccl_buffer_type::DIRECT) {}
+    ccl_buffer(void* src, ssize_t size, size_t offset)
+            : ccl_buffer(src, size, offset, ccl_buffer_type::DIRECT) {}
+    ccl_buffer(void* src, ssize_t size, ccl_buffer_type type) : ccl_buffer(src, size, 0, type) {}
+
+    ccl_buffer(const ccl_buffer& buf)
+            : src(buf.src),
+              size(buf.size),
+              offset(buf.offset),
+              type(buf.type) {
+        CCL_ASSERT(check_offset());
+    }
+
+    void set(void* src, ssize_t size, size_t offset, ccl_buffer_type type) {
+        LOG_DEBUG("set: src ",
+                  src,
+                  ", size ",
+                  size,
+                  ", offset ",
+                  offset,
+                  ", type ",
+                  type,
+                  ", old src: ",
+                  this->src);
+        CCL_ASSERT(src, "new src is null");
+
+        this->src = src;
+        this->size = size;
+        this->offset = offset;
+        this->type = type;
+
+        CCL_ASSERT(check_offset());
+    }
+
+    void set(void* src) {
+        set(src, -1, 0, ccl_buffer_type::DIRECT);
+    }
+    void set(void* src, ssize_t size) {
+        set(src, size, 0, ccl_buffer_type::DIRECT);
+    }
+    void set(void* src, ssize_t size, ccl_buffer_type type) {
+        set(src, size, 0, type);
+    }
+    void set(void* src, ssize_t size, size_t offset) {
+        set(src, size, offset, ccl_buffer_type::DIRECT);
+    }
+
+    void* get_src() const {
+        return src;
+    }
+    ssize_t get_size() const {
+        return size;
+    }
+    size_t get_offset() const {
+        return offset;
+    }
+    ccl_buffer_type get_type() const {
+        return type;
+    }
+
+    ccl_buffer operator+(size_t val) {
+        return ccl_buffer(src, size, offset + val, type);
+    }
+
+    ccl_buffer operator+(size_t val) const {
+        return ccl_buffer(src, size, offset + val, type);
+    }
+
+    ccl_buffer operator-(size_t val) {
+        CCL_ASSERT(offset >= val, "unexpected decrement value ", val);
+        return ccl_buffer(src, size, offset - val, type);
+    }
+
+    ccl_buffer operator+(int val) {
+        return ccl_buffer(src, size, offset + val, type);
+    }
+
+    ccl_buffer operator-(int val) {
+        CCL_ASSERT(offset >= (size_t)val, "unexpected decrement value ", val);
+        return ccl_buffer(src, size, offset - val, type);
+    }
+
+    ccl_buffer& operator+=(size_t val) {
+        offset += val;
+        CCL_ASSERT(check_offset());
+        return *this;
+    }
+
+    size_t get_difference(ccl_buffer buf) {
+        CCL_ASSERT((get_ptr() >= buf.get_ptr()), "difference between pointers < 0");
+        return (static_cast<char*>(get_ptr()) - static_cast<char*>(buf.get_ptr()));
+    }
+
+    void* get_ptr(size_t access_size = 0) const {
+        CCL_ASSERT(check_offset(access_size));
+
+        if (!src)
+            return nullptr;
+
+        if (type == ccl_buffer_type::DIRECT)
+            return ((char*)src + offset);
+        else {
+            return (*((char**)src)) ? (*((char**)src) + offset) : nullptr;
+        }
+    }
+
+    operator bool() const {
+        if (type == ccl_buffer_type::DIRECT)
+            return src;
+        else
+            return (src && (*(void**)src));
+    }
+
+    bool operator==(ccl_buffer const& other) const {
+        return ((get_ptr() == other.get_ptr()) && (get_type() == other.get_type()));
+    }
+
+    bool operator!=(ccl_buffer const& other) const {
+        return !(*this == other);
+    }
+
+    bool operator>(ccl_buffer const& other) const {
+        CCL_ASSERT(get_type() == other.get_type(), "types should match");
+        return (get_ptr() > other.get_ptr());
+    }
+
+    friend std::ostream& operator<<(std::ostream& out, const ccl_buffer& buf) {
+        out << "(src: " << buf.get_src() << ", size " << buf.get_size() << ", off "
+            << buf.get_offset() << ", type: " << buf.get_type() << ")";
+        return out;
+    }
+};
diff --git a/src/common/utils/enums.hpp b/src/common/utils/enums.hpp
index fd9ede8de..d6250fc70 100644
--- a/src/common/utils/enums.hpp
+++ b/src/common/utils/enums.hpp
@@ -19,7 +19,7 @@
 #include <type_traits>
 
 namespace utils {
-namespace details {
+namespace detail {
 struct failure_callback {
     template <class T>
     static const char *invoke(T val, const char *message) noexcept {
@@ -27,7 +27,7 @@ struct failure_callback {
         return message;
     }
 };
-} // namespace details
+} // namespace detail
 
 template <int Limit>
 struct enum_to_str {
diff --git a/src/common/utils/tree.hpp b/src/common/utils/tree.hpp
index 51156e6df..5cd3aabf6 100644
--- a/src/common/utils/tree.hpp
+++ b/src/common/utils/tree.hpp
@@ -21,7 +21,7 @@
 
 class ccl_bin_tree {
 public:
-    ccl_bin_tree(size_t comm_size, size_t rank, bool is_main = true)
+    ccl_bin_tree(int comm_size, int rank, bool is_main = true)
             : comm_size(comm_size),
               rank(rank),
               is_main(is_main) {
@@ -34,7 +34,7 @@ class ccl_bin_tree {
                 r = height > 0 ? 1 << (height - 1) : -1;
             }
             else {
-                if (comm_size == 1u << height) {
+                if (comm_size == 1 << height) {
                     r = height > 0 ? (1 << height) - 1 : -1;
                 }
                 else {
@@ -54,26 +54,26 @@ class ccl_bin_tree {
     ccl_bin_tree(const ccl_bin_tree& other) = default;
     ccl_bin_tree& operator=(const ccl_bin_tree& other) = default;
 
-    ssize_t left() const {
+    int left() const {
         return l;
     }
 
-    ssize_t right() const {
+    int right() const {
         return r;
     }
 
-    ssize_t parent() const {
+    int parent() const {
         return p;
     }
 
-    ccl_bin_tree copy_with_new_root(size_t new_root) const {
+    ccl_bin_tree copy_with_new_root(int new_root) const {
         ccl_bin_tree copy(*this);
-        ssize_t root = static_cast<ssize_t>(new_root);
+        int root = static_cast<int>(new_root);
 
         //if current node will become a new root or node was a default root - the tree must be reconstruced
         if (copy.rank == root || copy.rank == default_root) {
             //create part of tree with the default root
-            copy = ccl_bin_tree(static_cast<size_t>(comm_size),
+            copy = ccl_bin_tree(static_cast<int>(comm_size),
                                 copy.rank == default_root ? root : default_root,
                                 is_main);
             copy.rank = root;
@@ -92,13 +92,13 @@ class ccl_bin_tree {
     }
 
 private:
-    void reset_connections(ssize_t new_root) {
+    void reset_connections(int new_root) {
         swap_if_any_of(p, default_root, new_root);
         swap_if_any_of(l, default_root, new_root);
         swap_if_any_of(r, default_root, new_root);
     }
 
-    static void swap_if_any_of(ssize_t& node, ssize_t val1, ssize_t val2) {
+    static void swap_if_any_of(int& node, int val1, int val2) {
         if (node == val1) {
             node = val2;
         }
@@ -109,12 +109,12 @@ class ccl_bin_tree {
 
     void calc_height(bool main_tree) {
         if (main_tree || rank == default_root) {
-            while ((rank & (1u << height)) == 0 && (1u << height) < comm_size) {
+            while ((rank & (1 << height)) == 0 && (1 << height) < comm_size) {
                 ++height;
             }
         }
         else {
-            while ((rank & (1u << height)) != 0 && (1u << height) < comm_size) {
+            while ((rank & (1 << height)) != 0 && (1 << height) < comm_size) {
                 ++height;
             }
         }
@@ -122,7 +122,7 @@ class ccl_bin_tree {
 
     void calc_parent() {
         //find a parent using height, assume that rank is a right child
-        ssize_t possible_parent_as_left = rank + (1 << height);
+        int possible_parent_as_left = rank + (1 << height);
         //right child has a bit `1` a the position `height + 1` due to it is calculated as `parent + 2^(heightP-1)`
         //where heightP is parent's height i.e height + 1
 
@@ -149,7 +149,7 @@ class ccl_bin_tree {
 
     void calc_right() {
         r = rank + (1 << (height - 1));
-        ssize_t limit = comm_size - 1;
+        int limit = comm_size - 1;
 
         if (r > limit) {
             auto height_tmp = height;
@@ -166,20 +166,20 @@ class ccl_bin_tree {
         }
     }
 
-    ssize_t comm_size;
-    ssize_t rank;
-    ssize_t height = 0;
-    ssize_t p = -1;
-    ssize_t l = -1;
-    ssize_t r = -1;
+    int comm_size;
+    int rank;
+    int height = 0;
+    int p = -1;
+    int l = -1;
+    int r = -1;
     bool is_main;
 
-    static const ssize_t default_root = 0;
+    static const int default_root = 0;
 };
 
 class ccl_double_tree {
 public:
-    ccl_double_tree(size_t comm_size, size_t rank)
+    ccl_double_tree(int comm_size, int rank)
             : t1(comm_size, rank, true),
               t2(comm_size, rank, false) {
         //LOG_DEBUG("T1: ", t1);
@@ -204,7 +204,7 @@ class ccl_double_tree {
         return t2;
     }
 
-    ccl_double_tree copy_with_new_root(size_t new_root) const {
+    ccl_double_tree copy_with_new_root(int new_root) const {
         return ccl_double_tree(t1.copy_with_new_root(new_root), t2.copy_with_new_root(new_root));
     }
 
diff --git a/src/common/utils/utils.hpp b/src/common/utils/utils.hpp
index fc8fd6f52..4f7cfa92d 100644
--- a/src/common/utils/utils.hpp
+++ b/src/common/utils/utils.hpp
@@ -13,256 +13,257 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#if defined(__INTEL_COMPILER) || defined(__ICC)
-#include <immintrin.h>
-#endif
-
-#include <algorithm>
-#include <chrono>
-#include <functional>
-#include <malloc.h>
-#include <map>
-#include <mutex>
-#include <stddef.h>
-#include <stdlib.h>
-#include <time.h>
-#include <sstream>
-#include <vector>
-
-#include "common/utils/spinlock.hpp"
-
-/* common */
-
-#ifndef gettid
-#include <sys/syscall.h>
-#include <sys/types.h>
-#include <unistd.h>
-#define gettid() syscall(SYS_gettid)
-#endif
-
-#define CCL_CALL(expr) \
-    do { \
-        status = (expr); \
-        CCL_ASSERT(status == ccl_status_success, "bad status ", status); \
-    } while (0)
-
-#define unlikely(x_) __builtin_expect(!!(x_), 0)
-#define likely(x_)   __builtin_expect(!!(x_), 1)
-
-#ifndef container_of
-#define container_of(ptr, type, field) ((type*)((char*)ptr - offsetof(type, field)))
-#endif
-
-#define CACHELINE_SIZE 64
-#define ONE_MB         1048576
-#define TWO_MB         2097152
-
-#define CCL_MEMCPY(dest, src, n) std::copy((char*)(src), (char*)(src) + (n), (char*)(dest))
-
-/* malloc/realloc/free */
-
-#if 0 // defined(__INTEL_COMPILER) || defined(__ICC)
-#define CCL_MEMALIGN_IMPL(size, align) _mm_malloc(size, align)
-#define CCL_REALLOC_IMPL(old_ptr, old_size, new_size, align) \
-    ({ \
-        void* new_ptr = NULL; \
-        if (!old_ptr) \
-            new_ptr = _mm_malloc(new_size, align); \
-        else if (!old_size) \
-            _mm_free(old_ptr); \
-        else { \
-            new_ptr = _mm_malloc(new_size, align); \
-            memcpy(new_ptr, old_ptr, std::min(old_size, new_size)); \
-            _mm_free(old_ptr); \
-        } \
-        new_ptr; \
-    })
-#define CCL_CALLOC_IMPL(size, align) \
-    ({ \
-        void* ptr = _mm_malloc(size, align); \
-        memset(ptr, 0, size); \
-        ptr; \
-    })
-#define CCL_FREE_IMPL(ptr) _mm_free(ptr)
-#elif defined(__GNUC__)
-#define CCL_MEMALIGN_IMPL(size, align) \
-    ({ \
-        void* ptr = NULL; \
-        int pm_ret __attribute__((unused)) = posix_memalign((void**)(&ptr), align, size); \
-        ptr; \
-    })
-#define CCL_REALLOC_IMPL(old_ptr, old_size, new_size, align) realloc(old_ptr, new_size)
-#define CCL_CALLOC_IMPL(size, align)                         calloc(size, 1)
-#define CCL_FREE_IMPL(ptr)                                   free(ptr)
-#else
-#error "this compiler is not supported"
-#endif
-
-#define CCL_MEMALIGN_WRAPPER(size, align, name) \
-    ({ \
-        void* ptr = CCL_MEMALIGN_IMPL(size, align); \
-        CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \
-        ptr; \
-    })
-
-#define CCL_REALLOC_WRAPPER(old_ptr, old_size, new_size, align, name) \
-    ({ \
-        void* ptr = CCL_REALLOC_IMPL(old_ptr, old_size, new_size, align); \
-        CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", new_size, ", out of memory, ", name); \
-        ptr; \
-    })
-
-#define CCL_CALLOC_WRAPPER(size, align, name) \
-    ({ \
-        void* ptr = CCL_CALLOC_IMPL(size, align); \
-        CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \
-        ptr; \
-    })
-
-#define CCL_MALLOC(size, name)          CCL_MEMALIGN_WRAPPER(size, CACHELINE_SIZE, name)
-#define CCL_MEMALIGN(size, align, name) CCL_MEMALIGN_WRAPPER(size, align, name)
-#define CCL_CALLOC(size, name)          CCL_CALLOC_WRAPPER(size, CACHELINE_SIZE, name)
-#define CCL_REALLOC(old_ptr, old_size, new_size, align, name) \
-    CCL_REALLOC_WRAPPER(old_ptr, old_size, new_size, align, name)
-#define CCL_FREE(ptr) CCL_FREE_IMPL(ptr)
-
-/* other */
-
-static inline size_t ccl_pof2(size_t number) {
-    size_t last_bit_mask = ((size_t)1 << (8 * sizeof(size_t) - 1));
-    if (number & last_bit_mask) {
-        return last_bit_mask;
-    }
-
-    size_t pof2 = 1;
-    while (pof2 <= number) {
-        pof2 <<= 1;
-    }
-    pof2 >>= 1;
-    return pof2;
-}
-
-static inline size_t ccl_aligned_sz(size_t size, size_t alignment) {
-    return ((size % alignment) == 0) ? size : ((size / alignment) + 1) * alignment;
-}
-
-static inline timespec ccl_from_time_point(
-    const std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> point) {
-    auto sec = std::chrono::time_point_cast<std::chrono::seconds>(point);
-    auto ns = std::chrono::time_point_cast<std::chrono::nanoseconds>(point) -
-              std::chrono::time_point_cast<std::chrono::nanoseconds>(sec);
-
-    return timespec{ .tv_sec = sec.time_since_epoch().count(), .tv_nsec = ns.count() };
-}
-
-template <class container>
-container tokenize(const std::string& input, char delimeter) {
-    std::istringstream ss(input);
-    container ret;
-    std::string str;
-    while (std::getline(ss, str, delimeter)) {
-        //use c++14 regex
-        std::stringstream converter;
-        converter << str;
-        typename container::value_type value;
-        converter >> value;
-        ret.push_back(value);
-    }
-    return ret;
-}
-
-template <typename T>
-void ccl_str_to_array(const char* input, std::vector<T>& output, char delimiter) {
-    std::stringstream ss(input);
-    T temp{};
-    while (ss >> temp) {
-        output.push_back(temp);
-        if (ss.peek() == delimiter) {
-            ss.ignore();
-        }
-    }
-}
-
-//TODO naite implementation, use TBB
-template <class Key,
-          class Value,
-          class = typename std::enable_if<std::is_pointer<Value>::value>::type>
-class concurrent_map {
-public:
-    using implementation = std::map<Key, Value>;
-    using value_type = typename implementation::value_type;
-    using lock_t = std::unique_lock<ccl_spinlock>;
-
-    template <class Impl>
-    using accessor = std::tuple<Impl, lock_t>;
-
-    using read_accessor =
-        std::tuple<std::reference_wrapper<typename std::add_const<implementation>::type>, lock_t>;
-    using write_accessor = std::tuple<std::reference_wrapper<implementation>, lock_t>;
-
-    concurrent_map() = default;
-    concurrent_map(concurrent_map<Key, Value>&& src) {
-        src.swap(get_write());
-    }
-
-    concurrent_map<Key, Value>& operator=(const concurrent_map<Key, Value>&& src) {
-        src.swap(get_write());
-        return *this;
-    }
-
-    concurrent_map(const concurrent_map<Key, Value>&) = delete;
-    concurrent_map<Key, Value>& operator=(const concurrent_map<Key, Value>&) = delete;
-
-    std::pair<Value, bool> insert(value_type&& value) {
-        Value ret = nullptr;
-        bool find = false;
-        {
-            std::unique_lock<ccl_spinlock> lock(guard);
-            auto pair = map.insert(std::move(value));
-            find = pair.second;
-            ret = pair.first->second;
-        }
-        return { ret, find };
-    }
-
-    Value find(const Key& key) {
-        Value ret = nullptr;
-        {
-            std::unique_lock<ccl_spinlock> lock(guard);
-            auto it = map.find(key);
-            if (it != map.end()) {
-                ret = it->second;
-            }
-        }
-        return ret;
-    }
-
-    read_accessor get_read() const {
-        return { std::cref(map), locker() };
-    }
-
-    write_accessor get_write() {
-        return { std::ref(map), locker() };
-    }
-
-    void swap(write_accessor&& rhs) {
-        {
-            std::unique_lock<ccl_spinlock> lock(guard);
-            std::swap(map, std::get<0>(rhs).get());
-        }
-    }
-
-    void swap(write_accessor& rhs) {
-        swap(rhs);
-    }
-
-private:
-    std::unique_lock<ccl_spinlock> locker() const {
-        return std::unique_lock<ccl_spinlock>(guard);
-    }
-
-    mutable ccl_spinlock guard;
-    implementation map;
-};
+#pragma once
+
+#if defined(__INTEL_COMPILER) || defined(__ICC)
+#include <immintrin.h>
+#endif
+
+#include <algorithm>
+#include <chrono>
+#include <functional>
+#include <malloc.h>
+#include <map>
+#include <mutex>
+#include <stddef.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sstream>
+#include <vector>
+
+#include "common/utils/spinlock.hpp"
+#include "internal_types.hpp"
+
+/* common */
+
+#ifndef gettid
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+#define gettid() syscall(SYS_gettid)
+#endif
+
+#define CCL_CALL(expr) \
+    do { \
+        status = (expr); \
+        CCL_ASSERT(status == ccl::status::success, "bad status ", status); \
+    } while (0)
+
+#define unlikely(x_) __builtin_expect(!!(x_), 0)
+#define likely(x_)   __builtin_expect(!!(x_), 1)
+
+#ifndef container_of
+#define container_of(ptr, type, field) ((type*)((char*)(ptr) - offsetof(type, field)))
+#endif /* container_of: pointer to member `field` -> pointer to its enclosing `type` */
+
+#define CACHELINE_SIZE 64
+#define ONE_MB         1048576
+#define TWO_MB         2097152
+
+#define CCL_MEMCPY(dest, src, n) std::copy((char*)(src), (char*)(src) + (n), (char*)(dest))
+
+/* malloc/realloc/free */
+
+#if 0 // defined(__INTEL_COMPILER) || defined(__ICC)
+#define CCL_MEMALIGN_IMPL(size, align) _mm_malloc(size, align)
+#define CCL_REALLOC_IMPL(old_ptr, old_size, new_size, align) \
+    ({ \
+        void* new_ptr = NULL; \
+        if (!old_ptr) \
+            new_ptr = _mm_malloc(new_size, align); \
+        else if (!old_size) \
+            _mm_free(old_ptr); \
+        else { \
+            new_ptr = _mm_malloc(new_size, align); \
+            memcpy(new_ptr, old_ptr, std::min(old_size, new_size)); \
+            _mm_free(old_ptr); \
+        } \
+        new_ptr; \
+    })
+#define CCL_CALLOC_IMPL(size, align) \
+    ({ \
+        void* ptr = _mm_malloc(size, align); \
+        memset(ptr, 0, size); \
+        ptr; \
+    })
+#define CCL_FREE_IMPL(ptr) _mm_free(ptr)
+#elif defined(__GNUC__)
+#define CCL_MEMALIGN_IMPL(size, align) \
+    ({ \
+        void* ptr = NULL; \
+        int pm_ret __attribute__((unused)) = posix_memalign((void**)(&ptr), align, size); \
+        ptr; \
+    })
+#define CCL_REALLOC_IMPL(old_ptr, old_size, new_size, align) realloc(old_ptr, new_size)
+#define CCL_CALLOC_IMPL(size, align)                         calloc(size, 1)
+#define CCL_FREE_IMPL(ptr)                                   free(ptr)
+#else
+#error "this compiler is not supported"
+#endif
+
+#define CCL_MEMALIGN_WRAPPER(size, align, name) \
+    ({ \
+        void* ptr = CCL_MEMALIGN_IMPL(size, align); \
+        CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \
+        ptr; \
+    })
+
+#define CCL_REALLOC_WRAPPER(old_ptr, old_size, new_size, align, name) \
+    ({ \
+        void* ptr = CCL_REALLOC_IMPL(old_ptr, old_size, new_size, align); \
+        CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", new_size, ", out of memory, ", name); \
+        ptr; \
+    })
+
+#define CCL_CALLOC_WRAPPER(size, align, name) \
+    ({ \
+        void* ptr = CCL_CALLOC_IMPL(size, align); \
+        CCL_THROW_IF_NOT(ptr, "CCL cannot allocate bytes: ", size, ", out of memory, ", name); \
+        ptr; \
+    })
+
+#define CCL_MALLOC(size, name)          CCL_MEMALIGN_WRAPPER(size, CACHELINE_SIZE, name)
+#define CCL_MEMALIGN(size, align, name) CCL_MEMALIGN_WRAPPER(size, align, name)
+#define CCL_CALLOC(size, name)          CCL_CALLOC_WRAPPER(size, CACHELINE_SIZE, name)
+#define CCL_REALLOC(old_ptr, old_size, new_size, align, name) \
+    CCL_REALLOC_WRAPPER(old_ptr, old_size, new_size, align, name)
+#define CCL_FREE(ptr) CCL_FREE_IMPL(ptr)
+
+/* other */
+
+static inline size_t ccl_pof2(size_t number) { // largest power of two <= number; 0 for 0
+    size_t last_bit_mask = ((size_t)1 << (8 * sizeof(size_t) - 1));
+    if (number & last_bit_mask) {
+        return last_bit_mask;
+    }
+    // The early return above keeps the loop below from shifting pof2 past the top bit.
+    size_t pof2 = 1;
+    while (pof2 <= number) {
+        pof2 <<= 1;
+    }
+    pof2 >>= 1;
+    return pof2;
+}
+
+static inline size_t ccl_aligned_sz(size_t size, size_t alignment) { // round up; alignment != 0
+    return ((size % alignment) == 0) ? size : ((size / alignment) + 1) * alignment;
+}
+
+static inline timespec ccl_from_time_point(
+    const std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> point) {
+    auto sec = std::chrono::time_point_cast<std::chrono::seconds>(point);
+    auto ns = std::chrono::time_point_cast<std::chrono::nanoseconds>(point) -
+              std::chrono::time_point_cast<std::chrono::nanoseconds>(sec);
+
+    return timespec{ .tv_sec = sec.time_since_epoch().count(), .tv_nsec = ns.count() };
+}
+
+template <class container>
+container tokenize(const std::string& input, char delimeter) { // split input, convert each piece
+    std::istringstream ss(input);
+    container ret;
+    std::string str;
+    while (std::getline(ss, str, delimeter)) {
+        // original note: use C++14 regex; each token is currently converted via a stringstream
+        std::stringstream converter;
+        converter << str;
+        typename container::value_type value;
+        converter >> value;
+        ret.push_back(value);
+    }
+    return ret;
+}
+
+template <typename T> // appends every T parsed from `input` to `output`
+void ccl_str_to_array(const char* input, std::vector<T>& output, char delimiter) {
+    std::stringstream ss(input);
+    T temp{};
+    while (ss >> temp) { // ends at end of input or at the first value that fails to parse
+        output.push_back(temp);
+        if (ss.peek() == delimiter) {
+            ss.ignore(); // consume exactly one delimiter character
+        }
+    }
+}
+
+// TODO: naive implementation, use TBB
+/**
+ * Map from Key to a pointer-typed Value; member functions only touch the underlying std::map
+ * while `guard` is locked. Movable but not copyable.
+ */
+template <class Key,
+          class Value,
+          class = typename std::enable_if<std::is_pointer<Value>::value>::type>
+class concurrent_map {
+public:
+    using implementation = std::map<Key, Value>;
+    using value_type = typename implementation::value_type;
+    using lock_t = std::unique_lock<ccl_spinlock>;
+
+    template <class Impl>
+    using accessor = std::tuple<Impl, lock_t>;
+
+    using read_accessor =
+        std::tuple<std::reference_wrapper<typename std::add_const<implementation>::type>, lock_t>;
+    using write_accessor = std::tuple<std::reference_wrapper<implementation>, lock_t>;
+
+    concurrent_map() = default;
+    concurrent_map(concurrent_map<Key, Value>&& src) {
+        src.swap(get_write());
+    }
+
+    // `src` is a non-const rvalue: swap() is a non-const member and modifies `src`.
+    concurrent_map<Key, Value>& operator=(concurrent_map<Key, Value>&& src) {
+        // Self-assignment would try to lock `guard` twice (in get_write() and again in swap()).
+        if (this != &src) {
+            src.swap(get_write());
+        }
+        return *this;
+    }
+
+    concurrent_map(const concurrent_map<Key, Value>&) = delete;
+    concurrent_map<Key, Value>& operator=(const concurrent_map<Key, Value>&) = delete;
+
+    // Inserts `value` under the lock. Returns the pointer stored for its key and whether the
+    // insertion took place (false: the key was already present).
+    std::pair<Value, bool> insert(value_type&& value) {
+        lock_t lock(guard);
+        auto result = map.insert(std::move(value));
+        return { result.first->second, result.second };
+    }
+
+    // Returns the pointer stored for `key`, or nullptr when the key is absent.
+    Value find(const Key& key) {
+        lock_t lock(guard);
+        auto it = map.find(key);
+        return (it != map.end()) ? it->second : nullptr;
+    }
+
+    // get_read()/get_write(): the returned accessor carries a lock acquired on `guard`.
+    read_accessor get_read() const {
+        return { std::cref(map), locker() };
+    }
+
+    write_accessor get_write() {
+        return { std::ref(map), locker() };
+    }
+
+    // Exchanges this map's contents with the map referenced by `rhs`, locking `guard`.
+    void swap(write_accessor&& rhs) {
+        lock_t lock(guard);
+        std::swap(map, std::get<0>(rhs).get());
+    }
+
+    void swap(write_accessor& rhs) {
+        // `rhs` is an lvalue: a plain swap(rhs) selects this overload again and never returns.
+        swap(std::move(rhs));
+    }
+
+private:
+    std::unique_lock<ccl_spinlock> locker() const {
+        return std::unique_lock<ccl_spinlock>(guard);
+    }
+
+    mutable ccl_spinlock guard;
+    implementation map;
+};
diff --git a/src/common/utils/version.cpp b/src/common/utils/version.cpp
new file mode 100644
index 000000000..554befb87
--- /dev/null
+++ b/src/common/utils/version.cpp
@@ -0,0 +1,37 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "oneapi/ccl/config.h"
+#include "oneapi/ccl/types.hpp"
+#include "common/utils/version.hpp"
+#include "oneapi/ccl/native_device_api/export_api.hpp"
+
+namespace utils {
+
+ccl::library_version get_library_version() {
+    ccl::library_version version{};
+    // All fields come from CCL_* constants except cl_backend_name, taken from backend_traits.
+    version.major = CCL_MAJOR_VERSION;
+    version.minor = CCL_MINOR_VERSION;
+    version.update = CCL_UPDATE_VERSION;
+    version.product_status = CCL_PRODUCT_STATUS;
+    version.build_date = CCL_PRODUCT_BUILD_DATE;
+    version.full = CCL_PRODUCT_FULL;
+    version.cl_backend_name = ccl::backend_traits::name();
+
+    return version;
+}
+
+} // namespace utils
diff --git a/src/common/utils/version.hpp b/src/common/utils/version.hpp
new file mode 100644
index 000000000..662d437bf
--- /dev/null
+++ b/src/common/utils/version.hpp
@@ -0,0 +1,24 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "oneapi/ccl/types.hpp"
+
+namespace utils {
+
+ccl::library_version get_library_version();
+
+} // namespace utils
diff --git a/src/communicator_impl.hpp b/src/communicator_impl.hpp
index dca0d6515..d0a5a4af7 100644
--- a/src/communicator_impl.hpp
+++ b/src/communicator_impl.hpp
@@ -13,234 +13,212 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr.hpp"
-#include "oneapi/ccl/ccl_communicator.hpp"
-
-#include "common/comm/l0/comm_context_id.hpp"
-#include "communicator_impl_details.hpp"
-
-//TODO
-/*
-namespace ccl
-{
-struct comm_split_attr_impl
-{
-    constexpr static int color_default()
-    {
-        return 0;
-    }
-    ccl::library_version version;
-};
-
-struct device_attr_impl
-{
-    constexpr static device_topology_type class_default()
-    {
-        return device_topology_type::ring;
-    }
-    constexpr static group_split_type group_default()
-    {
-        return group_split_type::process;
-    }
-    device_topology_type current_preferred_topology_class = class_default();
-    group_split_type current_preferred_topology_group = group_default();
-};
-}*/
-
-
-namespace ccl {
-/* TODO temporary function for UT compilation: would be part of ccl::environment in final
-template <class event_type,
-          class ...attr_value_pair_t>
-event create_event_from_attr(event_type& native_event_handle,
-                             typename unified_device_context_type::ccl_native_t context,
-                             attr_value_pair_t&&...avps)
-{
-    ccl::library_version ret {};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    event str {event::impl_value_t(new event::impl_t(native_event_handle, context, ret))};
-    int expander [] {(str.template set<attr_value_pair_t::idx()>(avps.val()), 0)...};
-    str.build_from_params();
-
-    return str;
-}
-*/
-
-template <class DeviceType, class ContextType>
-CCL_API vector_class<communicator> communicator::create_communicators(
-    const size_t cluster_devices_size,
-    const vector_class<DeviceType>& local_devices,
-    ContextType& context,
-    shared_ptr_class<kvs_interface> kvs) {
-    vector_class<communicator> ret;
-    throw std::runtime_error(std::string(__FUNCTION__) + " - not implemented");
-    return ret;
-}
-
-using rank_t = size_t;
-
-template <class DeviceType, class ContextType>
-CCL_API vector_class<communicator> communicator::create_communicators(
-    const size_t cluster_devices_size, /*global devics count*/
-    const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-    ContextType& context,
-    shared_ptr_class<kvs_interface> kvs) {
-
-    return comm_impl_dispatch_selector<CL_BACKEND_TYPE>::create_communicators_selector(cluster_devices_size, local_rank_device_map, context, kvs);
-#if 0
-    vector_class<rank_t> local_thread_ranks;
-    local_thread_ranks.reserve(local_rank_device_map.size());
-    std::transform(
-        local_rank_device_map.begin(),
-        local_rank_device_map.end(),
-        std::back_inserter(local_thread_ranks),
-        [](const typename vector_class<pair_class<rank_t, DeviceType>>::value_type& val) {
-            return val.first;
-        });
-    group_context::comm_group_t thread_group =
-        group_context::instance().group_by_kvs(local_thread_ranks, cluster_devices_size, kvs);
-
-    vector_class<DeviceType> local_thread_devices;
-    local_thread_devices.reserve(local_rank_device_map.size());
-    std::transform(
-        local_rank_device_map.begin(),
-        local_rank_device_map.end(),
-        std::back_inserter(local_thread_devices),
-        [](const typename vector_class<pair_class<rank_t, DeviceType>>::value_type& val) {
-            return val.second;
-        });
-
-    auto ret = thread_group->create_communicators(local_thread_devices);
-    return ret;
-#endif
-}
-
-template <class DeviceType, class ContextType>
-CCL_API vector_class<communicator> communicator::create_communicators(
-    const size_t cluster_devices_size, /*global devics count*/
-    const map_class<rank_t, DeviceType>& local_rank_device_map,
-    ContextType& context,
-    shared_ptr_class<kvs_interface> kvs)
-
-{
-    return comm_impl_dispatch_selector<CL_BACKEND_TYPE>::create_communicators_selector(cluster_devices_size, local_rank_device_map, context, kvs);
-#if 0
-    vector_class<rank_t> local_thread_ranks;
-    local_thread_ranks.reserve(local_rank_device_map.size());
-    std::transform(local_rank_device_map.begin(),
-                   local_rank_device_map.end(),
-                   std::back_inserter(local_thread_ranks),
-                   [](const typename map_class<rank_t, DeviceType>::value_type& val) {
-                       return val.first;
-                   });
-    group_context::comm_group_t thread_group =
-        group_context::instance().group_by_kvs(local_thread_ranks, cluster_devices_size, kvs);
-
-    vector_class<DeviceType> local_thread_devices;
-    local_thread_devices.reserve(local_rank_device_map.size());
-    std::transform(local_rank_device_map.begin(),
-                   local_rank_device_map.end(),
-                   std::back_inserter(local_thread_devices),
-                   [](const typename map_class<rank_t, DeviceType>::value_type& val) {
-                       return val.second;
-                   });
-
-    auto ret = thread_group->create_communicators(local_thread_devices);
-    return ret;
-#endif
-}
-
-/*CCL_API bool communicator::is_ready() const
-{
-    return get_impl()->is_ready();
-}*/
-
-/**
- * Creates a new host communicator with externally provided size, rank and kvs.
- * Implementation is platform specific and non portable.
- * @return host communicator
- */
-communicator communicator::create_communicator() {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-
-    LOG_DEBUG("Create host communicator");
-
-    communicator_interface_ptr impl =
-        communicator_interface::create_communicator_impl();
-
-    return communicator(std::move(impl));
-}
-
-/**
- * Creates a new host communicator with user supplied size and kvs.
- * Rank will be assigned automatically.
- * @param size user-supplied total number of ranks
- * @param kvs key-value store for ranks wire-up
- * @return host communicator
- */
-communicator communicator::create_communicator(const size_t size,
-                                               shared_ptr_class<kvs_interface> kvs) {
-    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
-
-    LOG_DEBUG("Create host communicator");
-
-    communicator_interface_ptr impl =
-        communicator_interface::create_communicator_impl(size, kvs);
-
-    return communicator(std::move(impl));
-}
-
-/**
- * Creates a new host communicator with user supplied size, rank and kvs.
- * @param size user-supplied total number of ranks
- * @param rank user-supplied rank
- * @param kvs key-value store for ranks wire-up
- * @return host communicator
- */
-communicator communicator::create_communicator(const size_t size,
-                                               const size_t rank,
-                                               shared_ptr_class<kvs_interface> kvs) {
-    
-    LOG_DEBUG("Create host communicator: size ", size, ", rank ", rank);
-
-    communicator_interface_ptr impl =
-        communicator_interface::create_communicator_impl(size, rank, kvs);
-
-    return communicator(std::move(impl));
-}
-
-} // namespace ccl
-
-/***************************TypeGenerations*********************************************************/
-#define API_DEVICE_COMM_CREATE_WO_RANK_EXPLICIT_INSTANTIATION(DeviceType, ContextType) \
-    template ccl::vector_class<ccl::communicator> CCL_API \
-    ccl::communicator::create_communicators( \
-        const size_t comm_size, \
-        const ccl::vector_class<DeviceType>& local_devices, \
-        ContextType& context, \
-        ccl::shared_ptr_class<ccl::kvs_interface> kvs);
-
-#define API_DEVICE_COMM_CREATE_WITH_RANK_IN_VECTOR_EXPLICIT_INSTANTIATION(DeviceType, ContextType) \
-    template ccl::vector_class<ccl::communicator> CCL_API \
-    ccl::communicator::create_communicators( \
-        const size_t comm_size, \
-        const ccl::vector_class<ccl::pair_class<ccl::rank_t, DeviceType>>& local_rank_device_map, \
-        ContextType& context, \
-        ccl::shared_ptr_class<ccl::kvs_interface> kvs);
-
-#define API_DEVICE_COMM_CREATE_WITH_RANK_IN_MAP_EXPLICIT_INSTANTIATION(DeviceType, ContextType) \
-    template ccl::vector_class<ccl::communicator> CCL_API \
-    ccl::communicator::create_communicators( \
-        const size_t comm_size, \
-        const ccl::map_class<ccl::rank_t, DeviceType>& local_rank_device_map, \
-        ContextType& context, \
-        ccl::shared_ptr_class<ccl::kvs_interface> kvs);
+#pragma once
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
+#include "oneapi/ccl/comm_split_attr_ids_traits.hpp"
+#include "oneapi/ccl/comm_split_attr.hpp"
+#include "oneapi/ccl/communicator.hpp"
+
+#include "common/comm/l0/comm_context_id.hpp"
+#include "communicator_impl_details.hpp"
+
+//TODO
+/*
+namespace ccl
+{
+struct comm_split_attr_impl
+{
+    constexpr static int color_default()
+    {
+        return 0;
+    }
+    ccl::library_version version;
+};
+
+struct device_attr_impl
+{
+    constexpr static device_topology_type class_default()
+    {
+        return device_topology_type::ring;
+    }
+    constexpr static group_split_type group_default()
+    {
+        return group_split_type::process;
+    }
+    device_topology_type current_preferred_topology_class = class_default();
+    group_split_type current_preferred_topology_group = group_default();
+};
+}*/
+
+namespace ccl {
+
+namespace v1 {
+
+template <class DeviceType, class ContextType>
+CCL_API vector_class<communicator> communicator::create_communicators(
+    const int size,
+    const vector_class<DeviceType>& devices,
+    const ContextType& context,
+    shared_ptr_class<kvs_interface> kvs) {
+    vector_class<communicator> ret;
+    throw std::runtime_error(std::string(__FUNCTION__) + " - not implemented"); // always throws
+    return ret; // unreachable
+}
+
+using rank_t = int;
+
+template <class DeviceType, class ContextType>
+CCL_API vector_class<communicator> communicator::create_communicators(
+    const int size,
+    const vector_class<pair_class<int, DeviceType>>& devices,
+    const ContextType& context,
+    shared_ptr_class<kvs_interface> kvs) {
+    return comm_impl_dispatch_selector<CL_BACKEND_TYPE>::create_communicators_selector(
+        size, devices, context, kvs);
+#if 0
+    vector_class<int> local_thread_ranks;
+    local_thread_ranks.reserve(devices.size());
+    std::transform(
+        devices.begin(),
+        devices.end(),
+        std::back_inserter(local_thread_ranks),
+        [](const typename vector_class<pair_class<int, DeviceType>>::value_type& val) {
+            return val.first;
+        });
+    group_context::comm_group_t thread_group =
+        group_context::instance().group_by_kvs(local_thread_ranks, size, kvs);
+
+    vector_class<DeviceType> local_thread_devices;
+    local_thread_devices.reserve(devices.size());
+    std::transform(
+        devices.begin(),
+        devices.end(),
+        std::back_inserter(local_thread_devices),
+        [](const typename vector_class<pair_class<int, DeviceType>>::value_type& val) {
+            return val.second;
+        });
+
+    auto ret = thread_group->create_communicators(local_thread_devices);
+    return ret;
+#endif
+}
+
+template <class DeviceType, class ContextType>
+CCL_API vector_class<communicator> communicator::create_communicators(
+    const int size,
+    const map_class<int, DeviceType>& devices,
+    const ContextType& context,
+    shared_ptr_class<kvs_interface> kvs)
+
+{
+    return comm_impl_dispatch_selector<CL_BACKEND_TYPE>::create_communicators_selector(
+        size, devices, context, kvs);
+#if 0
+    vector_class<int> local_thread_ranks;
+    local_thread_ranks.reserve(devices.size());
+    std::transform(devices.begin(),
+                   devices.end(),
+                   std::back_inserter(local_thread_ranks),
+                   [](const typename map_class<int, DeviceType>::value_type& val) {
+                       return val.first;
+                   });
+    group_context::comm_group_t thread_group =
+        group_context::instance().group_by_kvs(local_thread_ranks, size, kvs);
+
+    vector_class<DeviceType> local_thread_devices;
+    local_thread_devices.reserve(devices.size());
+    std::transform(devices.begin(),
+                   devices.end(),
+                   std::back_inserter(local_thread_devices),
+                   [](const typename map_class<int, DeviceType>::value_type& val) {
+                       return val.second;
+                   });
+
+    auto ret = thread_group->create_communicators(local_thread_devices);
+    return ret;
+#endif
+}
+
+/*CCL_API bool communicator::is_ready() const
+{
+    return get_impl()->is_ready();
+}*/
+
+/**
+ * Creates a new host communicator with externally provided size, rank and kvs.
+ * Implementation is platform specific and non portable.
+ * @return host communicator
+ */
+communicator communicator::create_communicator(const comm_attr& attr) {
+    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
+    // NOTE(review): the unconditional throw above makes the rest of this function unreachable.
+    LOG_DEBUG("Create host communicator");
+
+    communicator_interface_ptr impl = communicator_interface::create_communicator_impl();
+
+    return communicator(std::move(impl));
+}
+
+/**
+ * Creates a new host communicator with user supplied size and kvs.
+ * Rank will be assigned automatically.
+ * @param size user-supplied total number of ranks
+ * @param kvs key-value store for ranks wire-up
+ * @return host communicator
+ */
+communicator communicator::create_communicator(const int size,
+                                               shared_ptr_class<kvs_interface> kvs,
+                                               const comm_attr& attr) {
+    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - is not implemented");
+    // NOTE(review): the unconditional throw above makes the rest of this function unreachable.
+    LOG_DEBUG("Create host communicator");
+
+    communicator_interface_ptr impl = communicator_interface::create_communicator_impl(size, kvs);
+
+    return communicator(std::move(impl));
+}
+
+/**
+ * Creates a new host communicator with user supplied size, rank and kvs.
+ * @param size user-supplied total number of ranks
+ * @param rank user-supplied rank
+ * @param kvs key-value store for ranks wire-up
+ * @return host communicator
+ */
+communicator communicator::create_communicator(const int size,
+                                               const int rank,
+                                               shared_ptr_class<kvs_interface> kvs,
+                                               const comm_attr& attr) {
+    LOG_DEBUG("Create host communicator: size ", size, ", rank ", rank);
+    // NOTE(review): `attr` is accepted but not passed on to create_communicator_impl().
+    communicator_interface_ptr impl =
+        communicator_interface::create_communicator_impl(size, rank, kvs);
+
+    return communicator(std::move(impl));
+}
+
+} // namespace v1
+
+} // namespace ccl
+
+/***************************TypeGenerations*********************************************************/
+#define API_COMM_CREATE_WO_RANK_EXPLICIT_INSTANTIATION(DeviceType, ContextType) \
+    template ccl::vector_class<ccl::communicator> CCL_API ccl::communicator::create_communicators( \
+        const int comm_size, \
+        const ccl::vector_class<DeviceType>& local_devices, \
+        const ContextType& context, \
+        ccl::shared_ptr_class<ccl::kvs_interface> kvs);
+
+#define API_COMM_CREATE_WITH_RANK_IN_VECTOR_EXPLICIT_INSTANTIATION(DeviceType, ContextType) \
+    template ccl::vector_class<ccl::communicator> CCL_API ccl::communicator::create_communicators( \
+        const int comm_size, \
+        const ccl::vector_class<ccl::pair_class<int, DeviceType>>& local_rank_device_map, \
+        const ContextType& context, \
+        ccl::shared_ptr_class<ccl::kvs_interface> kvs);
+
+#define API_COMM_CREATE_WITH_RANK_IN_MAP_EXPLICIT_INSTANTIATION(DeviceType, ContextType) \
+    template ccl::vector_class<ccl::communicator> CCL_API ccl::communicator::create_communicators( \
+        const int comm_size, \
+        const ccl::map_class<int, DeviceType>& local_rank_device_map, \
+        const ContextType& context, \
+        ccl::shared_ptr_class<ccl::kvs_interface> kvs);
diff --git a/src/communicator_impl_details.hpp b/src/communicator_impl_details.hpp
index 17970a555..76e305a55 100644
--- a/src/communicator_impl_details.hpp
+++ b/src/communicator_impl_details.hpp
@@ -13,10 +13,12 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_exception.hpp"
+#include "oneapi/ccl/exception.hpp"
 #include "common/comm/comm_interface.hpp"
 #include "common/comm/single_device_communicator/single_device_communicator.hpp"
 
+#include "common/env/env.hpp"
+
 namespace ccl {
 
 /**
@@ -24,219 +26,243 @@ namespace ccl {
  */
 
 template <class Context>
-struct context_extractor
-{
-    static Context& extract(Context &ctx) {return ctx; }
+struct context_extractor {
+    static const Context& extract(const Context& ctx) {
+        return ctx;
+    }
 };
 
-template<>
-struct context_extractor<ccl::context>
-{
-    static typename unified_device_context_type::ccl_native_t& extract(ccl::context &ctx) {return ctx.get_native(); }
+template <>
+struct context_extractor<ccl::context> {
+    static const typename unified_context_type::ccl_native_t& extract(const ccl::context& ctx) {
+        return ctx.get_native();
+    }
 };
 
-template<class impl>
-struct comm_impl_base_dispatch
-{
-    static void validate_contract(const size_t cluster_devices_size,
-                                  const size_t table_size)
-    {
-        if (table_size == 0)
-        {
-            throw ccl::invalid_argument("API", "create_communicators", "`local_rank_device_map` cannot be empty");
+template <class impl>
+struct comm_impl_base_dispatch {
+    static void validate_contract(const size_t cluster_devices_size, const size_t table_size) {
+        if (table_size == 0) {
+            throw ccl::invalid_argument(
+                "API", "create_communicators", "`local_rank_device_map` cannot be empty");
+        }
+
+        if (table_size > cluster_devices_size) {
+            throw ccl::invalid_argument(
+                "API",
+                "create_communicators",
+                "`local_rank_device_map` size: " + std::to_string(table_size) +
+                    " must not exceed total size: " + std::to_string(cluster_devices_size));
         }
 
-        if (table_size > cluster_devices_size)
-        {
-            throw ccl::invalid_argument("API", "create_communicators", "`local_rank_device_map` size: " +
-                                    std::to_string(table_size) + " must not exceed total size: " +
-                                    std::to_string(cluster_devices_size));
+        /* Multiple devices are not supported: throw for more than one device, unless the
+         * kernel_path env variable is set, which enables our testing with partial functionality. */
+        if (table_size > 1 && ccl::global_data::env().kernel_path.empty()) {
+            throw ccl::unimplemented("API", "create_communicators", "for multiple devices");
         }
     }
 
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devics count*/
-                const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
-        map_class<rank_t, DeviceType> converted_map;
-        std::transform(local_rank_device_map.begin(), local_rank_device_map.end(),
-                       std::inserter(converted_map, converted_map.end()), [](const pair_class<rank_t, DeviceType>& val)
-                       {
+        const size_t cluster_devices_size, /*global devices count*/
+        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        map_class<int, DeviceType> converted_map;
+        std::transform(local_rank_device_map.begin(),
+                       local_rank_device_map.end(),
+                       std::inserter(converted_map, converted_map.end()),
+                       [](const pair_class<int, DeviceType>& val) {
                            return std::make_pair(val.first, val.second);
                        });
-        if (local_rank_device_map.size() != converted_map.size())
-        {
+        if (local_rank_device_map.size() != converted_map.size()) {
             std::stringstream ss;
             ss << "found duplicated ranks in `local_rank_device_map`:\n";
-            for (const auto& v : local_rank_device_map)
-            {
+            for (const auto& v : local_rank_device_map) {
                 ss << std::to_string(v.first) << ", ";
             }
             throw ccl::invalid_argument("API", "create_communicators", ss.str());
         }
-        return impl::template create_communicators_selector(cluster_devices_size, converted_map, context_extractor<ContextType>::extract(context), kvs);
+        return impl::template create_communicators_selector(
+            cluster_devices_size,
+            converted_map,
+            context_extractor<ContextType>::extract(context),
+            kvs);
     }
 };
 
-template<cl_backend_type type>
-struct comm_impl_dispatch_selector {
-};
+template <cl_backend_type type>
+struct comm_impl_dispatch_selector {};
 
 #if !defined(CCL_ENABLE_SYCL) and !defined(MULTI_GPU_SUPPORT)
-template<>
-struct comm_impl_dispatch_selector<cl_backend_type::empty_backend> :
- public comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::empty_backend>>
-{
-    using base_t = comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::empty_backend>>;
+template <>
+struct comm_impl_dispatch_selector<cl_backend_type::empty_backend>
+        : public comm_impl_base_dispatch<
+              comm_impl_dispatch_selector<cl_backend_type::empty_backend>> {
+    using base_t =
+        comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::empty_backend>>;
 
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devics count*/
-                const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs)
-    {
-        return base_t::template create_communicators_selector<DeviceType>(cluster_devices_size, local_rank_device_map, context_extractor<ContextType>::extract(context), kvs);
+        const size_t cluster_devices_size, /*global devices count*/
+        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        return base_t::template create_communicators_selector<DeviceType>(
+            cluster_devices_size,
+            local_rank_device_map,
+            context_extractor<ContextType>::extract(context),
+            kvs);
     }
 
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devics count*/
-                const map_class<rank_t, DeviceType>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, DeviceType>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
         base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
-        if (local_rank_device_map.size() != 1)
-        {
-            throw ccl::unsupported("API", "create_communicators", "`local_rank_device_map` size: " +
-                                   std::to_string(local_rank_device_map.size()) + " but must be: 1 for your configuration: " +
-                                   backend_traits::name());
+        if (local_rank_device_map.size() != 1) {
+            throw ccl::unsupported(
+                "API",
+                "create_communicators",
+                "`local_rank_device_map` size: " + std::to_string(local_rank_device_map.size()) +
+                    " but must be: 1 for your configuration: " + backend_traits::name());
         }
 
         LOG_TRACE("Create host communicator");
 
         ccl::communicator_interface_ptr impl =
-            ccl::communicator_interface::create_communicator_impl(cluster_devices_size, local_rank_device_map.begin()->first, kvs);
+            ccl::communicator_interface::create_communicator_impl(
+                cluster_devices_size, local_rank_device_map.begin()->first, kvs);
         ccl::vector_class<ccl::communicator> ret;
-        ret.push_back(ccl::communicator(std::move(impl)) );
+        ret.push_back(ccl::communicator(std::move(impl)));
         return ret;
     }
 };
 #endif
 
 #if defined(CCL_ENABLE_SYCL) and !defined(MULTI_GPU_SUPPORT)
-template<>
-struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl> :
- public comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>>
-{
-    using base_t = comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>>;
+template <>
+struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>
+        : public comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>> {
+    using base_t =
+        comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl>>;
 
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devics count*/
-                const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs)
-    {
-        return base_t::template create_communicators_selector<DeviceType>(cluster_devices_size, local_rank_device_map, context_extractor<ContextType>::extract(context), kvs);
+        const size_t cluster_devices_size, /*global devices count*/
+        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        return base_t::template create_communicators_selector<DeviceType>(
+            cluster_devices_size,
+            local_rank_device_map,
+            context_extractor<ContextType>::extract(context),
+            kvs);
     }
 
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devices count*/
-                const map_class<rank_t, ccl::device_index_type>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, ccl::device_index_type>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
         base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
 
-        map_class<rank_t, cl::sycl::device> converted_device_map;
-        std::transform(local_rank_device_map.begin(), local_rank_device_map.end(),
+        map_class<int, cl::sycl::device> converted_device_map;
+        std::transform(local_rank_device_map.begin(),
+                       local_rank_device_map.end(),
                        std::inserter(converted_device_map, converted_device_map.end()),
-                       [](const typename map_class<rank_t, ccl::device_index_type>::value_type& val)
-                       {
-                           return std::make_pair(val.first, ccl::create_from_index(val.second).get());
+                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
+                           return std::make_pair(val.first,
+                                                 ccl::create_from_index(val.second).get());
                        });
-        return create_communicators_selector(cluster_devices_size, converted_device_map, context_extractor<ContextType>::extract(context), kvs);
+        return create_communicators_selector(cluster_devices_size,
+                                             converted_device_map,
+                                             context_extractor<ContextType>::extract(context),
+                                             kvs);
     }
 
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devices count*/
-                const map_class<rank_t,ccl::device>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
-        map_class<rank_t, cl::sycl::device> converted_device_map;
-        std::transform(local_rank_device_map.begin(), local_rank_device_map.end(),
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, ccl::device>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        map_class<int, cl::sycl::device> converted_device_map;
+        std::transform(local_rank_device_map.begin(),
+                       local_rank_device_map.end(),
                        std::inserter(converted_device_map, converted_device_map.end()),
-                       [](const typename map_class<rank_t,ccl::device>::value_type& val)
-                       {
+                       [](const typename map_class<int, ccl::device>::value_type& val) {
                            return std::make_pair(val.first, val.second.get_native());
                        });
-        return create_communicators_selector(cluster_devices_size, converted_device_map, context_extractor<ContextType>::extract(context), kvs);
+        return create_communicators_selector(cluster_devices_size,
+                                             converted_device_map,
+                                             context_extractor<ContextType>::extract(context),
+                                             kvs);
     }
 
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devices count*/
-                const map_class<rank_t, cl::sycl::device>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, cl::sycl::device>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
         base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
 
         auto it = local_rank_device_map.begin();
         const cl::sycl::device& dev = it->second;
 
-        if (dev.is_host())
-        {
-            it = std::find_if_not(local_rank_device_map.begin(), local_rank_device_map.end(),
-                                  [](const typename map_class<rank_t, cl::sycl::device>::value_type& val) {
-                                    return val.second.is_host();
-                                });
+        if (dev.is_host()) {
+            it = std::find_if_not(
+                local_rank_device_map.begin(),
+                local_rank_device_map.end(),
+                [](const typename map_class<int, cl::sycl::device>::value_type& val) {
+                    return val.second.is_host();
+                });
         }
-        else if(dev.is_cpu())
-        {
-            it = std::find_if_not(local_rank_device_map.begin(), local_rank_device_map.end(),
-                                  [](const typename map_class<rank_t, cl::sycl::device>::value_type& val) {
-                                    return val.second.is_cpu();
-                                });
+        else if (dev.is_cpu()) {
+            it = std::find_if_not(
+                local_rank_device_map.begin(),
+                local_rank_device_map.end(),
+                [](const typename map_class<int, cl::sycl::device>::value_type& val) {
+                    return val.second.is_cpu();
+                });
         }
-        else if(dev.is_gpu())
-        {
-            it = std::find_if_not(local_rank_device_map.begin(), local_rank_device_map.end(),
-                                  [](const typename map_class<rank_t, cl::sycl::device>::value_type& val) {
-                                    return val.second.is_gpu();
-                                });
+        else if (dev.is_gpu()) {
+            it = std::find_if_not(
+                local_rank_device_map.begin(),
+                local_rank_device_map.end(),
+                [](const typename map_class<int, cl::sycl::device>::value_type& val) {
+                    return val.second.is_gpu();
+                });
         }
-        else if(dev.is_accelerator())
-        {
-            it = std::find_if_not(local_rank_device_map.begin(), local_rank_device_map.end(),
-                                  [](const typename map_class<rank_t, cl::sycl::device>::value_type& val) {
-                                    return val.second.is_accelerator();
-                                });
+        else if (dev.is_accelerator()) {
+            it = std::find_if_not(
+                local_rank_device_map.begin(),
+                local_rank_device_map.end(),
+                [](const typename map_class<int, cl::sycl::device>::value_type& val) {
+                    return val.second.is_accelerator();
+                });
         }
-        else
-        {
-            throw ccl::invalid_argument("API", "create_communicators", "invalid `cl::sycl::device` type");
+        else {
+            throw ccl::invalid_argument(
+                "API", "create_communicators", "invalid `cl::sycl::device` type");
         }
 
-        if (it != local_rank_device_map.end())
-        {
-            throw ccl::invalid_argument("API", "create_communicators", "mixed collection of `cl::sycl::device` are not supported");
+        if (it != local_rank_device_map.end()) {
+            throw ccl::invalid_argument("API",
+                                        "create_communicators",
+                                        "mixed collection of `cl::sycl::device` are not supported");
         }
 
-
-        if (local_rank_device_map.size() != 1)
-        {
-            throw ccl::unsupported("API", "create_communicators", "`local_rank_device_map` size: " +
-                                   std::to_string(local_rank_device_map.size()) + " but must be: 1 for your configuration: " +
-                                   backend_traits::name());
+        if (local_rank_device_map.size() != 1) {
+            throw ccl::unsupported(
+                "API",
+                "create_communicators",
+                "`local_rank_device_map` size: " + std::to_string(local_rank_device_map.size()) +
+                    " but must be: 1 for your configuration: " + backend_traits::name());
         }
 
         // if (dev.is_host() || dev.is_cpu())
@@ -253,19 +279,23 @@ struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl> :
 
         /* reset iterator after std::find_if_not */
         it = local_rank_device_map.begin();
-        size_t rank = it->first;
+        int rank = it->first;
         auto& device = it->second;
 
-        LOG_DEBUG("Create single device communicator from SYCL device (sycl and !mgpu), after find_if rank ", rank);
+        LOG_DEBUG(
+            "Create single device communicator from SYCL device (sycl and !mgpu), after find_if rank ",
+            rank);
 
         std::shared_ptr<ikvs_wrapper> kvs_wrapper(new users_kvs(kvs));
-        std::shared_ptr<atl_wrapper> atl =
-        std::shared_ptr<atl_wrapper>(new atl_wrapper(cluster_devices_size, { rank }, kvs_wrapper));
+        std::shared_ptr<atl_wrapper> atl = std::shared_ptr<atl_wrapper>(
+            new atl_wrapper(cluster_devices_size, { rank }, kvs_wrapper));
 
-        ccl::comm_split_attr attr = create_comm_split_attr(
-        ccl::attr_val<ccl::comm_split_attr_id::group>(ccl::group_split_type::undetermined));
+        ccl::comm_split_attr attr =
+            preview::create_comm_split_attr(ccl::attr_val<ccl::comm_split_attr_id::group>(
+                ccl::split_group::cluster /*ccl::group_split_type::undetermined*/));
         ccl::communicator_interface_ptr impl =
-        ccl::communicator_interface::create_communicator_impl(device, context, rank, cluster_devices_size, attr, atl);
+            ccl::communicator_interface::create_communicator_impl(
+                device, context, rank, cluster_devices_size, attr, atl);
 
         //TODO use gpu_comm_attr to automatically visit()
         //auto single_dev_comm = std::dynamic_pointer_cast<single_device_communicator>(impl);
@@ -274,15 +304,15 @@ struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl> :
         ret.push_back(ccl::communicator(std::move(impl)));
         return ret;
 
-    /*
+        /*
     //collect ranks
-    vector_class<rank_t> local_thread_ranks;
+    vector_class<int> local_thread_ranks;
     local_thread_ranks.reserve(local_rank_device_map.size());
     std::transform(
         local_rank_device_map.begin(),
         local_rank_device_map.end(),
         std::back_inserter(local_thread_ranks),
-        [](const typename vector_class<pair_class<rank_t, DeviceType>>::value_type& val) {
+        [](const typename vector_class<pair_class<int, DeviceType>>::value_type& val) {
             return val.first;
         });
 
@@ -292,7 +322,7 @@ struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl> :
         local_rank_device_map.begin(),
         local_rank_device_map.end(),
         std::back_inserter(local_thread_devices),
-        [](const typename vector_class<pair_class<rank_t, DeviceType>>::value_type& val) {
+        [](const typename vector_class<pair_class<int, DeviceType>>::value_type& val) {
             return val.second;
         });
 
@@ -306,202 +336,209 @@ struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl> :
 #endif //defined(CCL_ENABLE_SYCL) and !defined(MULTI_GPU_SUPPORT)
 
 #if defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
-template<>
-struct comm_impl_dispatch_selector<cl_backend_type::l0> :
- public comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::l0>>
-{
+template <>
+struct comm_impl_dispatch_selector<cl_backend_type::l0>
+        : public comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::l0>> {
     using base_t = comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::l0>>;
 
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devics count*/
-                const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs)
-    {
-        return base_t::template create_communicators_selector<DeviceType>(cluster_devices_size, local_rank_device_map, context_extractor<ContextType>::extract(context), kvs);
+        const size_t cluster_devices_size, /*global devices count*/
+        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        return base_t::template create_communicators_selector<DeviceType>(
+            cluster_devices_size,
+            local_rank_device_map,
+            context_extractor<ContextType>::extract(context),
+            kvs);
     }
 
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devices count*/
-                const map_class<rank_t, ccl::device>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
-        map_class<rank_t, ccl::device_index_type> converted_device_map;
-        std::transform(local_rank_device_map.begin(), local_rank_device_map.end(),
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, ccl::device>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        map_class<int, ccl::device_index_type> converted_device_map;
+        std::transform(local_rank_device_map.begin(),
+                       local_rank_device_map.end(),
                        std::inserter(converted_device_map, converted_device_map.end()),
-                       [](const typename map_class<rank_t,ccl::device>::value_type& val)
-                       {
-                           return std::make_pair(val.first, val.second.get_native()->get_device_path());
+                       [](const typename map_class<int, ccl::device>::value_type& val) {
+                           return std::make_pair(val.first,
+                                                 val.second.get_native()->get_device_path());
                        });
-        return create_communicators_selector(cluster_devices_size, converted_device_map, context_extractor<ContextType>::extract(context), kvs);
+        return create_communicators_selector(cluster_devices_size,
+                                             converted_device_map,
+                                             context_extractor<ContextType>::extract(context),
+                                             kvs);
     }
 
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devices count*/
-                const map_class<rank_t, ccl::device_index_type>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, typename unified_device_type::ccl_native_t>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        map_class<int, ccl::device_index_type> converted_device_map;
+        std::transform(
+            local_rank_device_map.begin(),
+            local_rank_device_map.end(),
+            std::inserter(converted_device_map, converted_device_map.end()),
+            [](const typename map_class<int, typename unified_device_type::ccl_native_t>::
+                   value_type& val) {
+                return std::make_pair(val.first, val.second->get_device_path());
+            });
+        return create_communicators_selector(cluster_devices_size,
+                                             converted_device_map,
+                                             context_extractor<ContextType>::extract(context),
+                                             kvs);
+    }
 
+    template <class ContextType>
+    static vector_class<communicator> create_communicators_selector(
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, ccl::device_index_type>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
         base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
 
         //collect ranks
-        vector_class<rank_t> local_thread_ranks;
+        vector_class<int> local_thread_ranks;
         local_thread_ranks.reserve(local_rank_device_map.size());
         std::transform(local_rank_device_map.begin(),
-                   local_rank_device_map.end(),
-                   std::back_inserter(local_thread_ranks),
-                   [](const typename map_class<rank_t, ccl::device_index_type>::value_type& val) {
-                       return val.first;
-                   });
+                       local_rank_device_map.end(),
+                       std::back_inserter(local_thread_ranks),
+                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
+                           return val.first;
+                       });
         group_context::comm_group_t thread_group =
             group_context::instance().group_by_kvs(local_thread_ranks, cluster_devices_size, kvs);
 
         vector_class<ccl::device_index_type> local_thread_devices;
         local_thread_devices.reserve(local_rank_device_map.size());
         std::transform(local_rank_device_map.begin(),
-                   local_rank_device_map.end(),
-                   std::back_inserter(local_thread_devices),
-                   [](const typename map_class<rank_t, ccl::device_index_type>::value_type& val) {
-                       return val.second;
-                   });
+                       local_rank_device_map.end(),
+                       std::back_inserter(local_thread_devices),
+                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
+                           return val.second;
+                       });
 
-        auto ret = thread_group->create_communicators_group(local_thread_devices, context_extractor<ContextType>::extract(context));
+        auto ret = thread_group->create_communicators_group(
+            local_thread_devices, context_extractor<ContextType>::extract(context));
         return ret;
     }
 };
 #endif //defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
 
-
 #if defined(MULTI_GPU_SUPPORT) and defined(CCL_ENABLE_SYCL)
-template<>
-struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0> :
- public comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0>>
-{
-    using base_t = comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0>>;
+template <>
+struct comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0>
+        : public comm_impl_base_dispatch<
+              comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0>> {
+    using base_t =
+        comm_impl_base_dispatch<comm_impl_dispatch_selector<cl_backend_type::dpcpp_sycl_l0>>;
 
     template <class DeviceType, class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devics count*/
-                const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs)
-    {
-        return base_t::template create_communicators_selector<DeviceType>(cluster_devices_size, local_rank_device_map, context_extractor<ContextType>::extract(context), kvs);
+        const size_t cluster_devices_size, /*global devices count*/
+        const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        return base_t::template create_communicators_selector<DeviceType>(
+            cluster_devices_size,
+            local_rank_device_map,
+            context_extractor<ContextType>::extract(context),
+            kvs);
     }
 
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devices count*/
-                const map_class<rank_t, ccl::device>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
-        map_class<rank_t, cl::sycl::device> converted_device_map;
-        std::transform(local_rank_device_map.begin(), local_rank_device_map.end(),
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, ccl::device>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
+        map_class<int, cl::sycl::device> converted_device_map;
+        std::transform(local_rank_device_map.begin(),
+                       local_rank_device_map.end(),
                        std::inserter(converted_device_map, converted_device_map.end()),
-                       [](const typename map_class<rank_t, ccl::device>::value_type& val)
-                       {
+                       [](const typename map_class<int, ccl::device>::value_type& val) {
                            return std::make_pair(val.first, val.second.get_native());
                        });
-        return create_communicators_selector(cluster_devices_size, converted_device_map, context_extractor<ContextType>::extract(context), kvs);
+        return create_communicators_selector(cluster_devices_size,
+                                             converted_device_map,
+                                             context_extractor<ContextType>::extract(context),
+                                             kvs);
     }
 
+    // TODO: try to combine these 2 overloads below
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devices count*/
-                const map_class<rank_t, cl::sycl::device>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, cl::sycl::device>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
         base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
 
-        if (local_rank_device_map.size() == 1)
-        {
-            auto it = local_rank_device_map.begin();
-
-            LOG_DEBUG("create single device communicator from SYCL device (sycl and mgpu)");
-
-            size_t rank = it->first;
-            auto& device = it->second;
-            std::shared_ptr<ikvs_wrapper> kvs_wrapper(new users_kvs(kvs));
-            std::shared_ptr<atl_wrapper> atl =
-            std::shared_ptr<atl_wrapper>(new atl_wrapper(cluster_devices_size, { rank }, kvs_wrapper));
-
-            ccl::comm_split_attr attr = create_comm_split_attr(
-            ccl::attr_val<ccl::comm_split_attr_id::group>(ccl::group_split_type::undetermined));
-            ccl::communicator_interface_ptr impl =
-            ccl::communicator_interface::create_communicator_impl(device, context, rank, cluster_devices_size, attr, atl);
-
-            //TODO use gpu_comm_attr to automatically visit()
-            //auto single_dev_comm = std::dynamic_pointer_cast<single_device_communicator>(impl);
-            //single_dev_comm->set_context(context);
-            ccl::vector_class<ccl::communicator> ret;
-            ret.push_back(ccl::communicator(std::move(impl)));
-            return ret;
-        }
-
         //collect ranks
-        vector_class<rank_t> local_thread_ranks;
+        vector_class<int> local_thread_ranks;
         local_thread_ranks.reserve(local_rank_device_map.size());
         std::transform(local_rank_device_map.begin(),
-                   local_rank_device_map.end(),
-                   std::back_inserter(local_thread_ranks),
-                   [](const typename map_class<rank_t, cl::sycl::device>::value_type& val) {
-                       return val.first;
-                   });
+                       local_rank_device_map.end(),
+                       std::back_inserter(local_thread_ranks),
+                       [](const typename map_class<int, cl::sycl::device>::value_type& val) {
+                           return val.first;
+                       });
         group_context::comm_group_t thread_group =
             group_context::instance().group_by_kvs(local_thread_ranks, cluster_devices_size, kvs);
 
         vector_class<cl::sycl::device> local_thread_devices;
         local_thread_devices.reserve(local_rank_device_map.size());
         std::transform(local_rank_device_map.begin(),
-                   local_rank_device_map.end(),
-                   std::back_inserter(local_thread_devices),
-                   [](const typename map_class<rank_t, cl::sycl::device>::value_type& val) {
-                       return val.second;
-                   });
+                       local_rank_device_map.end(),
+                       std::back_inserter(local_thread_devices),
+                       [](const typename map_class<int, cl::sycl::device>::value_type& val) {
+                           return val.second;
+                       });
 
-        auto ret = thread_group->create_communicators_group(local_thread_devices, context_extractor<ContextType>::extract(context));
+        auto ret = thread_group->create_communicators_group(
+            local_thread_devices, context_extractor<ContextType>::extract(context));
         return ret;
     }
 
     template <class ContextType>
     static vector_class<communicator> create_communicators_selector(
-                const size_t cluster_devices_size, /*global devices count*/
-                const map_class<rank_t, ccl::device_index_type>& local_rank_device_map,
-                ContextType& context,
-                shared_ptr_class<kvs_interface> kvs) {
-
+        const size_t cluster_devices_size, /*global devices count*/
+        const map_class<int, ccl::device_index_type>& local_rank_device_map,
+        const ContextType& context,
+        shared_ptr_class<kvs_interface> kvs) {
         base_t::validate_contract(cluster_devices_size, local_rank_device_map.size());
 
         //collect ranks
-        vector_class<rank_t> local_thread_ranks;
+        vector_class<int> local_thread_ranks;
         local_thread_ranks.reserve(local_rank_device_map.size());
         std::transform(local_rank_device_map.begin(),
-                   local_rank_device_map.end(),
-                   std::back_inserter(local_thread_ranks),
-                   [](const typename map_class<rank_t, ccl::device_index_type>::value_type& val) {
-                       return val.first;
-                   });
+                       local_rank_device_map.end(),
+                       std::back_inserter(local_thread_ranks),
+                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
+                           return val.first;
+                       });
         group_context::comm_group_t thread_group =
             group_context::instance().group_by_kvs(local_thread_ranks, cluster_devices_size, kvs);
 
         vector_class<ccl::device_index_type> local_thread_devices;
         local_thread_devices.reserve(local_rank_device_map.size());
         std::transform(local_rank_device_map.begin(),
-                   local_rank_device_map.end(),
-                   std::back_inserter(local_thread_devices),
-                   [](const typename map_class<rank_t, ccl::device_index_type>::value_type& val) {
-                       return val.second;
-                   });
+                       local_rank_device_map.end(),
+                       std::back_inserter(local_thread_devices),
+                       [](const typename map_class<int, ccl::device_index_type>::value_type& val) {
+                           return val.second;
+                       });
 
-        auto ret = thread_group->create_communicators_group(local_thread_devices, context_extractor<ContextType>::extract(context));
+        auto ret = thread_group->create_communicators_group(
+            local_thread_devices, context_extractor<ContextType>::extract(context));
         return ret;
     }
 };
 #endif //defined(MULTI_GPU_SUPPORT) and defined(CCL_ENABLE_SYCL)
-}
+} // namespace ccl
diff --git a/src/comp/bf16/bf16.cpp b/src/comp/bf16/bf16.cpp
index 77251e166..64ed2d8e1 100644
--- a/src/comp/bf16/bf16.cpp
+++ b/src/comp/bf16/bf16.cpp
@@ -13,7 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/global/global.hpp"
 #include "common/log/log.hpp"
 #include "comp/bf16/bf16.hpp"
@@ -21,15 +21,15 @@
 #include "common/utils/enums.hpp"
 
 #define CCL_FLOATS_IN_M512 16
-#define CCL_BF16_SHIFT    16
+#define CCL_BF16_SHIFT     16
 
 #ifdef CCL_BF16_COMPILER
 
 void ccl_bf16_reduce(const void* in_buf,
-                      size_t in_cnt,
-                      void* inout_buf,
-                      size_t* out_cnt,
-                      ccl::reduction reduction_op) {
+                     size_t in_cnt,
+                     void* inout_buf,
+                     size_t* out_cnt,
+                     ccl::reduction reduction_op) {
     LOG_DEBUG("BF16 reduction for %zu elements\n", in_cnt);
 
     if (out_cnt != nullptr) {
@@ -113,10 +113,10 @@ void ccl_convert_bf16_to_fp32_arrays(void* recv_buf_bf16, float* recv_buf, size_
 #else /* CCL_BF16_COMPILER */
 
 void ccl_bf16_reduce(const void* in_buf,
-                      size_t in_cnt,
-                      void* inout_buf,
-                      size_t* out_cnt,
-                      ccl::reduction reduction_op) {
+                     size_t in_cnt,
+                     void* inout_buf,
+                     size_t* out_cnt,
+                     ccl::reduction reduction_op) {
     CCL_FATAL("BF16 reduction is requested but CCL was compiled w/o BF16 support");
 }
 
diff --git a/src/comp/bf16/bf16.hpp b/src/comp/bf16/bf16.hpp
index c5f2d1f6e..229841d28 100644
--- a/src/comp/bf16/bf16.hpp
+++ b/src/comp/bf16/bf16.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 
 #ifdef CCL_BF16_TARGET_ATTRIBUTES
 #ifdef CCL_BF16_AVX512BF_COMPILER
@@ -28,10 +28,10 @@ void ccl_bf16_reduce(const void* in_buf, size_t in_cnt,
                      ccl::reduction reduction_op);
 #else
 void ccl_bf16_reduce(const void* in_buf,
-                      size_t in_cnt,
-                      void* inout_buf,
-                      size_t* out_cnt,
-                      ccl::reduction reduction_op);
+                     size_t in_cnt,
+                     void* inout_buf,
+                     size_t* out_cnt,
+                     ccl::reduction reduction_op);
 #endif
 
 void ccl_convert_fp32_to_bf16_arrays(void*, void*, size_t);
diff --git a/src/comp/bf16/bf16_intrisics.hpp b/src/comp/bf16/bf16_intrisics.hpp
index ecab89cd0..7e7851729 100644
--- a/src/comp/bf16/bf16_intrisics.hpp
+++ b/src/comp/bf16/bf16_intrisics.hpp
@@ -119,10 +119,10 @@ CCL_BF16_REDUCE_FUNC_DEFINITIONS(avx512bf);
 #endif
 
 INLINE_TARGET_ATTRIBUTE_ALL void ccl_bf16_reduce_impl(const void* in_buf,
-                                                       void* inout_buf,
-                                                       size_t in_cnt,
-                                                       ccl_bf16_reduction_func_ptr op,
-                                                       ccl_bf16_impl_type impl_type) {
+                                                      void* inout_buf,
+                                                      size_t in_cnt,
+                                                      ccl_bf16_reduction_func_ptr op,
+                                                      ccl_bf16_impl_type impl_type) {
     if (impl_type == ccl_bf16_avx512f)
         ccl_bf16_reduce_impl_avx512f(in_buf, inout_buf, in_cnt, op);
 #ifdef CCL_BF16_AVX512BF_COMPILER
diff --git a/src/comp/comp.cpp b/src/comp/comp.cpp
index ba8ccdb2f..6a1c136c5 100644
--- a/src/comp/comp.cpp
+++ b/src/comp/comp.cpp
@@ -13,7 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "comp/bf16/bf16.hpp"
 #include "comp/comp.hpp"
 #include "common/log/log.hpp"
@@ -49,60 +49,65 @@
         } \
     } while (0)
 
-ccl_status_t ccl_comp_copy(const void* in_buf,
-                           void* out_buf,
-                           size_t count,
-                           const ccl_datatype& dtype) {
+ccl::status ccl_comp_copy(const void* in_buf,
+                          void* out_buf,
+                          size_t count,
+                          const ccl_datatype& dtype) {
     CCL_ASSERT(in_buf, "in_buf is null");
     CCL_ASSERT(out_buf, "out_buf is null");
     CCL_MEMCPY(out_buf, in_buf, count * dtype.size());
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_comp_reduce(const void* in_buf,
-                             size_t in_count,
-                             void* inout_buf,
-                             size_t* out_count,
-                             const ccl_datatype& dtype,
-                             ccl::reduction reduction,
-                             ccl::reduction_fn reduction_fn,
-                             const ccl::fn_context* context) {
+ccl::status ccl_comp_reduce(const void* in_buf,
+                            size_t in_count,
+                            void* inout_buf,
+                            size_t* out_count,
+                            const ccl_datatype& dtype,
+                            ccl::reduction reduction,
+                            ccl::reduction_fn reduction_fn,
+                            const ccl::fn_context* context) {
     if (reduction == ccl::reduction::custom) {
         CCL_THROW_IF_NOT(reduction_fn, "custom reduction requires user callback");
         reduction_fn(in_buf, in_count, inout_buf, out_count, dtype.idx(), context);
-        return ccl_status_success;
+        return ccl::status::success;
     }
 
     size_t i;
     switch (dtype.idx()) {
-        case ccl::datatype::int8: CCL_REDUCE(char); break;
-        case ccl::datatype::int32: CCL_REDUCE(int); break;
+        case ccl::datatype::int8: CCL_REDUCE(int8_t); break;
+        case ccl::datatype::uint8: CCL_REDUCE(uint8_t); break;
+        case ccl::datatype::int16: CCL_REDUCE(int16_t); break;
+        case ccl::datatype::uint16: CCL_REDUCE(uint16_t); break;
+        case ccl::datatype::int32: CCL_REDUCE(int32_t); break;
+        case ccl::datatype::uint32: CCL_REDUCE(uint32_t); break;
+        case ccl::datatype::int64: CCL_REDUCE(int64_t); break;
+        case ccl::datatype::uint64: CCL_REDUCE(uint64_t); break;
+        case ccl::datatype::float16: CCL_FATAL("FP16 is unsupported yet"); break;
+        case ccl::datatype::float32: CCL_REDUCE(float); break;
+        case ccl::datatype::float64: CCL_REDUCE(double); break;
         case ccl::datatype::bfloat16:
             if (ccl::global_data::get().bf16_impl_type == ccl_bf16_none)
                 CCL_FATAL("CCL doesn't support reductions in BF16 on this CPU");
             ccl_bf16_reduce(in_buf, in_count, inout_buf, out_count, reduction);
             break;
-        case ccl::datatype::float32: CCL_REDUCE(float); break;
-        case ccl::datatype::float64: CCL_REDUCE(double); break;
-        case ccl::datatype::int64: CCL_REDUCE(int64_t); break;
-        case ccl::datatype::uint64: CCL_REDUCE(uint64_t); break;
         default: CCL_FATAL("unexpected value ", dtype.idx()); break;
     }
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_comp_batch_reduce(const void* in_buf,
-                                   const std::vector<size_t>& offsets,
-                                   size_t in_count,
-                                   void* inout_buf,
-                                   size_t* out_count,
-                                   const ccl_datatype& dtype,
-                                   ccl::reduction reduction,
-                                   ccl::reduction_fn reduction_fn,
-                                   const ccl::fn_context* context,
-                                   int bf16_keep_precision_mode,
-                                   float* tmp,
-                                   float* acc) {
+ccl::status ccl_comp_batch_reduce(const void* in_buf,
+                                  const std::vector<size_t>& offsets,
+                                  size_t in_count,
+                                  void* inout_buf,
+                                  size_t* out_count,
+                                  const ccl_datatype& dtype,
+                                  ccl::reduction reduction,
+                                  ccl::reduction_fn reduction_fn,
+                                  const ccl::fn_context* context,
+                                  int bf16_keep_precision_mode,
+                                  float* tmp,
+                                  float* acc) {
     if (bf16_keep_precision_mode) {
         //->acc, tmp fusion_buffer_cache???
 
@@ -137,7 +142,7 @@ ccl_status_t ccl_comp_batch_reduce(const void* in_buf,
         }
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 const char* ccl_reduction_to_str(ccl::reduction type) {
diff --git a/src/comp/comp.hpp b/src/comp/comp.hpp
index cca4fc13d..10170717d 100644
--- a/src/comp/comp.hpp
+++ b/src/comp/comp.hpp
@@ -16,30 +16,31 @@
 #pragma once
 
 #include "common/datatype/datatype.hpp"
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "internal_types.hpp"
 
-ccl_status_t ccl_comp_copy(const void* in_buf,
-                           void* out_buf,
-                           size_t count,
-                           const ccl_datatype& dtype);
-ccl_status_t ccl_comp_reduce(const void* in_buf,
-                             size_t in_count,
-                             void* inout_buf,
-                             size_t* out_count,
-                             const ccl_datatype& dtype,
-                             ccl::reduction reduction,
-                             ccl::reduction_fn reduction_fn,
-                             const ccl::fn_context* context = nullptr);
-ccl_status_t ccl_comp_batch_reduce(const void* in_buf,
-                                   const std::vector<size_t>& offsets,
-                                   size_t in_count,
-                                   void* inout_buf,
-                                   size_t* out_count,
-                                   const ccl_datatype& dtype,
-                                   ccl::reduction reduction,
-                                   ccl::reduction_fn reduction_fn,
-                                   const ccl::fn_context* context,
-                                   int bf16_keep_precision_mode,
-                                   float* tmp,
-                                   float* acc);
+ccl::status ccl_comp_copy(const void* in_buf,
+                          void* out_buf,
+                          size_t count,
+                          const ccl_datatype& dtype);
+ccl::status ccl_comp_reduce(const void* in_buf,
+                            size_t in_count,
+                            void* inout_buf,
+                            size_t* out_count,
+                            const ccl_datatype& dtype,
+                            ccl::reduction reduction,
+                            ccl::reduction_fn reduction_fn,
+                            const ccl::fn_context* context = nullptr);
+ccl::status ccl_comp_batch_reduce(const void* in_buf,
+                                  const std::vector<size_t>& offsets,
+                                  size_t in_count,
+                                  void* inout_buf,
+                                  size_t* out_count,
+                                  const ccl_datatype& dtype,
+                                  ccl::reduction reduction,
+                                  ccl::reduction_fn reduction_fn,
+                                  const ccl::fn_context* context,
+                                  int bf16_keep_precision_mode,
+                                  float* tmp,
+                                  float* acc);
 const char* ccl_reduction_to_str(ccl::reduction type);
diff --git a/src/context_impl.hpp b/src/context_impl.hpp
index 62bb2eb29..a1eb6c6a9 100644
--- a/src/context_impl.hpp
+++ b/src/context_impl.hpp
@@ -14,29 +14,27 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
 
-#include "oneapi/ccl/ccl_context_attr_ids.hpp"
-#include "oneapi/ccl/ccl_context_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_context.hpp"
+#include "oneapi/ccl/context_attr_ids.hpp"
+#include "oneapi/ccl/context_attr_ids_traits.hpp"
+#include "oneapi/ccl/context.hpp"
 
 #include "common/context/context.hpp"
+#include "common/utils/version.hpp"
+
 namespace ccl {
 
-template <class device_context_type, class... attr_value_pair_t>
-CCL_API context context::create_context_from_attr(device_context_type& native_device_context_handle,
-                                       attr_value_pair_t&&... avps) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    context str{ context::impl_value_t(new context::impl_t(native_device_context_handle, ret)) };
+namespace v1 {
+
+template <class context_type, class... attr_value_pair_t>
+CCL_API context context::create_context_from_attr(context_type& native_context_handle,
+                                                  attr_value_pair_t&&... avps) {
+    auto version = utils::get_library_version();
+
+    context str{ context::impl_value_t(new context::impl_t(native_context_handle, version)) };
     int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
     (void)expander;
     str.build_from_params();
@@ -44,52 +42,50 @@ CCL_API context context::create_context_from_attr(device_context_type& native_de
     return str;
 }
 
-template <class device_context_type, typename T>
-CCL_API context context::create_context(device_context_type&& native_device_context) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    return { context::impl_value_t(new context::impl_t(std::forward<device_context_type>(native_device_context), ret)) };
+template <class context_type, typename T>
+CCL_API context context::create_context(context_type&& native_context) {
+    auto version = utils::get_library_version();
+
+    return { context::impl_value_t(
+        new context::impl_t(std::forward<context_type>(native_context), version)) };
 }
 
 template <context_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<context_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<context_attr_id, attrId>::return_type&
 context::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<context_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<context_attr_id, attrId>{});
 }
 
 template<context_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename ccl::details::ccl_api_type_attr_traits<ccl::context_attr_id, attrId>::return_type context::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<context_attr_id, attrId>::return_type context::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<context_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<context_attr_id, attrId>{});
 }
 
+} // namespace v1
+
 } // namespace ccl
 
 /***************************TypeGenerations*********************************************************/
-#define API_DEVICE_CONTEXT_CREATION_FORCE_INSTANTIATION(native_device_context_type) \
-    template CCL_API ccl::context ccl::context::create_context(native_device_context_type&& ctx);   \
-    template CCL_API ccl::context ccl::context::create_context(native_device_context_type& ctx);    \
+#define API_CONTEXT_CREATION_FORCE_INSTANTIATION(native_context_type) \
+    template CCL_API ccl::context ccl::context::create_context(native_context_type&& ctx); \
+    template CCL_API ccl::context ccl::context::create_context(native_context_type& ctx); \
+    template CCL_API ccl::context ccl::context::create_context(const native_context_type& ctx);
 
-#define API_DEVICE_CONTEXT_FORCE_INSTANTIATION_SET(IN_attrId, IN_Value) \
-    template CCL_API typename ccl::details::ccl_api_type_attr_traits<ccl::context_attr_id, \
-                                                                     IN_attrId>::return_type \
+#define API_CONTEXT_FORCE_INSTANTIATION_SET(IN_attrId, IN_Value) \
+    template CCL_API typename ccl::detail::ccl_api_type_attr_traits<ccl::context_attr_id, \
+                                                                    IN_attrId>::return_type \
     ccl::context::set<IN_attrId, IN_Value>(const IN_Value& v);
 
-#define API_DEVICE_CONTEXT_FORCE_INSTANTIATION_GET(IN_attrId) \
-    template CCL_API const typename ccl::details:: \
-        ccl_api_type_attr_traits<ccl::context_attr_id, IN_attrId>::return_type& \
-        ccl::context::get<IN_attrId>() const;
+#define API_CONTEXT_FORCE_INSTANTIATION_GET(IN_attrId) \
+    template CCL_API const typename ccl::detail::ccl_api_type_attr_traits<ccl::context_attr_id, \
+                                                                          IN_attrId>::return_type& \
+    ccl::context::get<IN_attrId>() const;
 
-#define API_DEVICE_CONTEXT_FORCE_INSTANTIATION(IN_attrId, IN_Value) \
-    API_DEVICE_CONTEXT_FORCE_INSTANTIATION_SET(IN_attrId, IN_Value) \
-    API_DEVICE_CONTEXT_FORCE_INSTANTIATION_GET(IN_attrId)
+#define API_CONTEXT_FORCE_INSTANTIATION(IN_attrId, IN_Value) \
+    API_CONTEXT_FORCE_INSTANTIATION_SET(IN_attrId, IN_Value) \
+    API_CONTEXT_FORCE_INSTANTIATION_GET(IN_attrId)
diff --git a/src/datatype_attr_impl.hpp b/src/datatype_attr_impl.hpp
index 07e88bf8e..6d52e9980 100644
--- a/src/datatype_attr_impl.hpp
+++ b/src/datatype_attr_impl.hpp
@@ -13,26 +13,30 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_datatype_attr.hpp"
-
-namespace ccl {
-
-/**
- * datatype_attr attributes definition
- */
-template <datatype_attr_id attrId, class Value>
-CCL_API Value datatype_attr::set(const Value& v) {
-    return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<datatype_attr_id, attrId>{});
-}
-
-template <datatype_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<datatype_attr_id, attrId>::return_type&
-datatype_attr::get() const {
-    return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<datatype_attr_id, attrId>{});
-}
-
-} // namespace ccl
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/datatype_attr.hpp"
+
+namespace ccl {
+
+namespace v1 {
+
+/**
+ * datatype_attr attributes definition
+ */
+template <datatype_attr_id attrId, class Value>
+CCL_API Value datatype_attr::set(const Value& v) {
+    return get_impl()->set_attribute_value(
+        v, detail::ccl_api_type_attr_traits<datatype_attr_id, attrId>{});
+}
+
+template <datatype_attr_id attrId>
+CCL_API const typename detail::ccl_api_type_attr_traits<datatype_attr_id, attrId>::return_type&
+datatype_attr::get() const {
+    return get_impl()->get_attribute_value(
+        detail::ccl_api_type_attr_traits<datatype_attr_id, attrId>{});
+}
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/device_impl.hpp b/src/device_impl.hpp
index bd83fa1fb..745c774b5 100644
--- a/src/device_impl.hpp
+++ b/src/device_impl.hpp
@@ -14,30 +14,27 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
 
-#include "oneapi/ccl/ccl_device_attr_ids.hpp"
-#include "oneapi/ccl/ccl_device_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_device.hpp"
+#include "oneapi/ccl/device_attr_ids.hpp"
+#include "oneapi/ccl/device_attr_ids_traits.hpp"
+#include "oneapi/ccl/device.hpp"
 
 #include "common/device/device.hpp"
+#include "common/utils/version.hpp"
 
 namespace ccl {
 
+namespace v1 {
+
 template <class device_type, class... attr_value_pair_t>
 CCL_API device device::create_device_from_attr(device_type& native_device_handle,
-                                       attr_value_pair_t&&... avps) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    device str{ device::impl_value_t(new device::impl_t(native_device_handle, ret)) };
+                                               attr_value_pair_t&&... avps) {
+    auto version = utils::get_library_version();
+
+    device str{ device::impl_value_t(new device::impl_t(native_device_handle, version)) };
     int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
     (void)expander;
     str.build_from_params();
@@ -47,49 +44,47 @@ CCL_API device device::create_device_from_attr(device_type& native_device_handle
 
 template <class device_type, typename T>
 CCL_API device device::create_device(device_type&& native_device) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    return { device::impl_value_t(new device::impl_t(std::forward<device_type>(native_device), ret)) };
+    auto version = utils::get_library_version();
+
+    return { device::impl_value_t(
+        new device::impl_t(std::forward<device_type>(native_device), version)) };
 }
 
 template <device_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<device_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<device_attr_id, attrId>::return_type&
 device::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<device_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<device_attr_id, attrId>{});
 }
 
 template<device_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename ccl::details::ccl_api_type_attr_traits<ccl::device_attr_id, attrId>::return_type device::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<device_attr_id, attrId>::return_type device::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<device_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<device_attr_id, attrId>{});
 }
 
+} // namespace v1
+
 } // namespace ccl
 
 /***************************TypeGenerations*********************************************************/
 #define API_DEVICE_CREATION_FORCE_INSTANTIATION(native_device_type) \
     template CCL_API ccl::device ccl::device::create_device(native_device_type&& dev); \
-    template CCL_API ccl::device ccl::device::create_device(native_device_type& dev);
+    template CCL_API ccl::device ccl::device::create_device(native_device_type& dev); \
+    template CCL_API ccl::device ccl::device::create_device(const native_device_type& dev);
 
 #define API_DEVICE_FORCE_INSTANTIATION_SET(IN_attrId, IN_Value) \
-    template CCL_API typename ccl::details::ccl_api_type_attr_traits<ccl::device_attr_id, \
-                                                                     IN_attrId>::return_type \
+    template CCL_API typename ccl::detail::ccl_api_type_attr_traits<ccl::device_attr_id, \
+                                                                    IN_attrId>::return_type \
     ccl::device::set<IN_attrId, IN_Value>(const IN_Value& v);
 
 #define API_DEVICE_FORCE_INSTANTIATION_GET(IN_attrId) \
-    template CCL_API const typename ccl::details:: \
-        ccl_api_type_attr_traits<ccl::device_attr_id, IN_attrId>::return_type& \
-        ccl::device::get<IN_attrId>() const;
+    template CCL_API const typename ccl::detail::ccl_api_type_attr_traits<ccl::device_attr_id, \
+                                                                          IN_attrId>::return_type& \
+    ccl::device::get<IN_attrId>() const;
 
 #define API_DEVICE_FORCE_INSTANTIATION(IN_attrId, IN_Value) \
     API_DEVICE_FORCE_INSTANTIATION_SET(IN_attrId, IN_Value) \
diff --git a/src/environment_impl.hpp b/src/environment_impl.hpp
index 78b701204..000178da2 100644
--- a/src/environment_impl.hpp
+++ b/src/environment_impl.hpp
@@ -14,10 +14,15 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_environment.hpp"
+
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/environment.hpp"
 
 #include "coll/coll_attributes.hpp"
 
+#include "common/comm/comm_common_attr.hpp"
+#include "comm_attr_impl.hpp"
+
 #include "common/comm/comm_split_common_attr.hpp"
 #include "comm_split_attr_impl.hpp"
 
@@ -28,9 +33,12 @@
 #include "common/global/global.hpp"
 #include "common/comm/comm.hpp"
 
-#include "oneapi/ccl/ccl_communicator.hpp"
+#include "oneapi/ccl/communicator.hpp"
 
 #include "oneapi/ccl/native_device_api/export_api.hpp"
+#include "common/utils/version.hpp"
+
+#include "internal_types.hpp"
 
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl.hpp>
@@ -38,26 +46,31 @@
 
 #define CCL_CHECK_AND_THROW(result, diagnostic) \
     do { \
-        if (result != ccl_status_success) { \
+        if ((result) != ccl::status::success) { /* arg may be an expression */ \
             throw ccl::exception(diagnostic); \
         } \
     } while (0);
 
 namespace ccl {
 
-//Device
+namespace detail {
+
+/******************** DEVICE ********************/
+
 template <class native_device_type, typename T>
 device CCL_API environment::create_device(native_device_type&& native_device) const {
     return device::create_device(std::forward<native_device_type>(native_device));
 }
 
-//Device context
+/******************** CONTEXT ********************/
+
 template <class native_device_contex_type, typename T>
-context CCL_API environment::create_context(native_device_contex_type&& native_device_context) const {
-    return context::create_context(std::forward<native_device_contex_type>(native_device_context));
+context CCL_API environment::create_context(native_device_contex_type&& native_context) const {
+    return context::create_context(std::forward<native_device_contex_type>(native_context));
 }
 
-//Stream
+/******************** STREAM ********************/
+
 template <class native_stream_type, typename T>
 stream CCL_API environment::create_stream(native_stream_type& native_stream) {
     return stream::create_stream(native_stream);
@@ -69,41 +82,27 @@ stream CCL_API environment::create_stream(native_stream_type& native_stream,
     return stream::create_stream(native_stream, native_ctx);
 }
 
-// //Event
-// template <class event_type, typename T>
-// event CCL_API environment::create_event(event_type& native_event) {
-//     return event::create_event(native_event);
-// }
-
-// template <class event_handle_type, typename T>
-// event CCL_API
-// environment::create_event(event_handle_type native_event_handle,
-//                           typename unified_device_context_type::ccl_native_t context) {
-//     return event::create_event(native_event_handle, context);
-// }
-
+/******************** COMMUNICATOR ********************/
 
-//Device communicator
 template <class DeviceType, class ContextType>
 vector_class<communicator> CCL_API
-environment::create_communicators(const size_t devices_size,
-                                         const vector_class<DeviceType>& local_devices,
-                                         ContextType& context,
-                                         shared_ptr_class<kvs_interface> kvs) const {
-    return communicator::create_communicators(
-        devices_size, local_devices, context, kvs);
+environment::create_communicators(const int comm_size,
+                                  const vector_class<DeviceType>& local_devices,
+                                  const ContextType& context,
+                                  shared_ptr_class<kvs_interface> kvs,
+                                  const comm_attr& attr) const {
+    return communicator::create_communicators(comm_size, local_devices, context, kvs);
 }
 
 template <class DeviceType, class ContextType>
 vector_class<communicator> CCL_API environment::create_communicators(
-    const size_t comm_size,
-    const vector_class<pair_class<rank_t, DeviceType>>& local_rank_device_map,
-    ContextType& context,
-    shared_ptr_class<kvs_interface> kvs) const {
-
-    return communicator::create_communicators(
-        comm_size, local_rank_device_map, context, kvs);
-/*
+    const int comm_size,
+    const vector_class<pair_class<int, DeviceType>>& local_rank_device_map,
+    const ContextType& context,
+    shared_ptr_class<kvs_interface> kvs,
+    const comm_attr& attr) const {
+    return communicator::create_communicators(comm_size, local_rank_device_map, context, kvs);
+    /*
     (void)context;
     vector_class<communicator> ret;
     ret.push_back(create_single_device_communicator(comm_size,
@@ -117,13 +116,13 @@ vector_class<communicator> CCL_API environment::create_communicators(
 
 template <class DeviceType, class ContextType>
 vector_class<communicator> CCL_API
-environment::create_communicators(const size_t comm_size,
-                                         const map_class<rank_t, DeviceType>& local_rank_device_map,
-                                         ContextType& context,
-                                         shared_ptr_class<kvs_interface> kvs) const {
-    return communicator::create_communicators(
-        comm_size, local_rank_device_map, context, kvs);
-/*
+environment::create_communicators(const int comm_size,
+                                  const map_class<int, DeviceType>& local_rank_device_map,
+                                  const ContextType& context,
+                                  shared_ptr_class<kvs_interface> kvs,
+                                  const comm_attr& attr) const {
+    return communicator::create_communicators(comm_size, local_rank_device_map, context, kvs);
+    /*
     (void)context;
     vector_class<communicator> ret;
     ret.push_back(create_single_device_communicator(comm_size,
@@ -135,68 +134,57 @@ environment::create_communicators(const size_t comm_size,
 */
 }
 
-template <class ccl_api_type, class... args_type>
-ccl_api_type CCL_API environment::create_postponed_api_type(args_type... args) const {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-    // TODO: ccl_api_type is private constructor, so `static_cast`  fails always. Fix it
-    //static_assert(std::is_constructible<ccl_api_type, args_type..., ccl::library_version>::value, "Cannot construct `ccl_api_type` from given `args_type...`");
-    return ccl_api_type(std::forward<args_type>(args)..., ret);
-}
+} // namespace detail
+
 } // namespace ccl
 
-/***************************TypeGenerations*********************************************************/
-#define CREATE_OP_ATTR_INSTANTIATION(Attr) \
-    template Attr CCL_API ccl::environment::create_postponed_api_type<Attr>() const;
+/******************** TypeGenerations ********************/
 
 #define CREATE_DEV_COMM_INSTANTIATION(DeviceType, ContextType) \
     template ccl::vector_class<ccl::communicator> CCL_API \
-    ccl::environment::create_communicators<DeviceType, ContextType>( \
-        const size_t devices_size, \
+    ccl::detail::environment::create_communicators<DeviceType, ContextType>( \
+        const int comm_size, \
         const ccl::vector_class<DeviceType>& local_devices, \
-        ContextType& context, \
-        ccl::shared_ptr_class<ccl::kvs_interface> kvs) const; \
+        const ContextType& context, \
+        ccl::shared_ptr_class<ccl::kvs_interface> kvs, \
+        const ccl::comm_attr& attr) const; \
 \
     template ccl::vector_class<ccl::communicator> CCL_API \
-    ccl::environment::create_communicators<DeviceType, ContextType>( \
-        const size_t cluster_devices_size, \
-        const ccl::vector_class<ccl::pair_class<ccl::rank_t, DeviceType>>& local_devices, \
-        ContextType& context, \
-        ccl::shared_ptr_class<ccl::kvs_interface> kvs) const; \
+    ccl::detail::environment::create_communicators<DeviceType, ContextType>( \
+        const int comm_size, \
+        const ccl::vector_class<ccl::pair_class<int, DeviceType>>& local_devices, \
+        const ContextType& context, \
+        ccl::shared_ptr_class<ccl::kvs_interface> kvs, \
+        const ccl::comm_attr& attr) const; \
 \
     template ccl::vector_class<ccl::communicator> CCL_API \
-    ccl::environment::create_communicators<DeviceType, ContextType>( \
-        const size_t cluster_devices_size, \
-        const ccl::map_class<ccl::rank_t, DeviceType>& local_devices, \
-        ContextType& context, \
-        ccl::shared_ptr_class<ccl::kvs_interface> kvs) const;
+    ccl::detail::environment::create_communicators<DeviceType, ContextType>( \
+        const int comm_size, \
+        const ccl::map_class<int, DeviceType>& local_devices, \
+        const ContextType& context, \
+        ccl::shared_ptr_class<ccl::kvs_interface> kvs, \
+        const ccl::comm_attr& attr) const;
 
 #define CREATE_STREAM_INSTANTIATION(native_stream_type) \
-    template ccl::stream CCL_API ccl::environment::create_stream(native_stream_type& native_stream);
+    template ccl::stream CCL_API ccl::detail::environment::create_stream( \
+        native_stream_type& native_stream);
 
 #define CREATE_STREAM_EXT_INSTANTIATION(device_type, native_context_type) \
-    template ccl::stream CCL_API ccl::environment::create_stream(device_type& device, \
-                                                                 native_context_type& native_ctx);
+    template ccl::stream CCL_API ccl::detail::environment::create_stream( \
+        device_type& device, native_context_type& native_ctx);
 
 #define CREATE_CONTEXT_INSTANTIATION(native_context_type) \
-    template ccl::context CCL_API ccl::environment::create_context(native_context_type&& native_ctx) const; \
-    template ccl::context CCL_API ccl::environment::create_context(native_context_type& native_ctx) const;
+    template ccl::context CCL_API ccl::detail::environment::create_context( \
+        native_context_type&& native_ctx) const; \
+    template ccl::context CCL_API ccl::detail::environment::create_context( \
+        native_context_type& native_ctx) const; \
+    template ccl::context CCL_API ccl::detail::environment::create_context( \
+        const native_context_type& native_ctx) const;
 
 #define CREATE_DEVICE_INSTANTIATION(native_device_type) \
-    template ccl::device CCL_API ccl::environment::create_device(native_device_type&& native_device) const; \
-    template ccl::device CCL_API ccl::environment::create_device(native_device_type& native_device) const;
-
-/*
-#define CREATE_EVENT_INSTANTIATION(native_event_type) \
-    template ccl::event CCL_API ccl::environment::create_event(native_event_type& native_event);
-
-#define CREATE_EVENT_EXT_INSTANTIATION(event_handle_type) \
-    template ccl::event CCL_API ccl::environment::create_event( \
-        event_handle_type native_event_handle, \
-        typename unified_device_context_type::ccl_native_t context);
-*/
+    template ccl::device CCL_API ccl::detail::environment::create_device( \
+        native_device_type&& native_device) const; \
+    template ccl::device CCL_API ccl::detail::environment::create_device( \
+        native_device_type& native_device) const; \
+    template ccl::device CCL_API ccl::detail::environment::create_device( \
+        const native_device_type& native_device) const;
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index 468ccc80c..54c39f8cd 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -41,7 +41,6 @@ size_t ccl_executor::calculate_atl_ep_count(size_t worker_count) {
 }
 
 atl_attr_t ccl_executor::generate_atl_attr(const ccl::env_data& env) {
-
     atl_attr_t attr;
 
     attr.ep_count = calculate_atl_ep_count(env.worker_count);
@@ -63,18 +62,16 @@ std::unique_ptr<ccl_sched_queue> ccl_executor::create_sched_queue(size_t idx,
                                                                   size_t ep_per_worker) {
     std::vector<size_t> ep_vec(ep_per_worker);
     std::iota(std::begin(ep_vec), std::end(ep_vec), idx * ep_per_worker);
-    std::unique_ptr<ccl_sched_queue> sched_queue{ new ccl_sched_queue(ep_vec) };
+    std::unique_ptr<ccl_sched_queue> sched_queue{ new ccl_sched_queue(idx, ep_vec) };
     return sched_queue;
 }
 
 ccl_executor::ccl_executor(const char* main_addr) {
-
     auto& env = ccl::global_data::env();
 
-    get_worker_idx_fn =
-        (env.enable_fusion || env.enable_unordered_coll)
-            ? &ccl_executor::get_worker_idx_by_sched_id
-            : &ccl_executor::get_worker_idx_round_robin;
+    get_worker_idx_fn = (env.enable_fusion || env.enable_unordered_coll)
+                            ? &ccl_executor::get_worker_idx_by_sched_id
+                            : &ccl_executor::get_worker_idx_round_robin;
 
     /* generate ATL attr for all future communicators */
     atl_wrapper::attr = generate_atl_attr(env);
@@ -85,19 +82,17 @@ ccl_executor::ccl_executor(const char* main_addr) {
 }
 
 void ccl_executor::start_workers() {
-
     auto& env = ccl::global_data::env();
 
     auto worker_count = env.worker_count;
     auto ep_count = calculate_atl_ep_count(worker_count);
 
     if (env.worker_offload) {
-        CCL_THROW_IF_NOT(
-            env.worker_affinity.size() >= get_local_proc_count() * worker_count,
-            "unexpected worker affinity length ",
-            env.worker_affinity.size(),
-            ", should be ",
-            get_local_proc_count() * worker_count);
+        CCL_THROW_IF_NOT(env.worker_affinity.size() >= get_local_proc_count() * worker_count,
+                         "unexpected worker affinity length ",
+                         env.worker_affinity.size(),
+                         ", should be ",
+                         get_local_proc_count() * worker_count);
     }
 
     size_t ep_per_worker = ep_count / worker_count;
@@ -113,10 +108,9 @@ void ccl_executor::start_workers() {
         }
 
         if (env.worker_offload) {
-            size_t affinity =
-                env.worker_affinity[get_local_proc_idx() * worker_count + idx];
+            size_t affinity = env.worker_affinity[get_local_proc_idx() * worker_count + idx];
 
-            CCL_THROW_IF_NOT(workers.back()->start(affinity) == ccl_status_success,
+            CCL_THROW_IF_NOT(workers.back()->start(affinity) == ccl::status::success,
                              "failed to start worker # ",
                              idx);
 
@@ -143,7 +137,7 @@ ccl_executor::~ccl_executor() {
 
     for (size_t idx = 0; idx < workers.size(); idx++) {
         if (ccl::global_data::env().worker_offload) {
-            if (workers[idx]->stop() != ccl_status_success) {
+            if (workers[idx]->stop() != ccl::status::success) {
                 LOG_ERROR("failed to stop worker # ", idx);
             }
             else
@@ -196,10 +190,10 @@ void ccl_executor::update_workers() {
 }
 
 // TODO: Rework to support listener
-//ccl_status_t ccl_executor::create_listener(ccl_resize_fn_t resize_func) {
+//ccl::status ccl_executor::create_listener(ccl_resize_fn_t resize_func) {
 //    if (listener) {
 //        LOG_ERROR("attempt to create listener twice");
-//        return ccl_status_runtime_error;
+//        return ccl::status::runtime_error;
 //    }
 //
 //    if (resize_func != NULL)
@@ -216,7 +210,7 @@ void ccl_executor::update_workers() {
 //
 //    LOG_DEBUG("started listener");
 //
-//    return ccl_status_success;
+//    return ccl::status::success;
 //}
 
 void ccl_executor::start(ccl_extra_sched* extra_sched) {
@@ -274,30 +268,28 @@ size_t ccl_executor::get_worker_count() const {
     return workers.size();
 }
 void ccl_executor::set_local_coord() {
-
-    // TODO: works only for hydra
-    const char* mpi_local_ranks_env = "MPI_LOCALNRANKS";
-    const char* mpi_local_id_env = "MPI_LOCALRANKID";
-
-    char* local_count = getenv(mpi_local_ranks_env);
-    if (local_count) {
-        char* local_id = getenv(mpi_local_id_env);
-        if (local_id) {
-            local_proc_count = std::atoi(local_count);
-            local_proc_idx = std::atoi(local_id);
-            return;
-        }
+    /* hydra specific env variables */
+    const char idx_env_name[] = "MPI_LOCALRANKID";
+    const char count_env_name[] = "MPI_LOCALNRANKS";
+
+    char* idx_env = getenv(idx_env_name);
+    char* count_env = getenv(count_env_name);
+
+    if (!(idx_env && count_env)) {
+        local_proc_idx = 0;
+        local_proc_count = 1;
+
+        LOG_INFO("WARNING: ",
+                 idx_env_name,
+                 " or ",
+                 count_env_name,
+                 " not found. Use default: local_proc_idx ",
+                 local_proc_idx,
+                 " , local_proc_count ",
+                 local_proc_count);
+    }
+    else {
+        local_proc_idx = std::atoi(idx_env);
+        local_proc_count = std::atoi(count_env);
     }
-
-    local_proc_count = 1;
-    local_proc_idx = 0;
-
-    LOG_INFO("WARNING: ",
-             mpi_local_ranks_env,
-             " or ",
-             mpi_local_id_env,
-             " not found. Use default: ",
-             local_proc_count,
-             " , ",
-             local_proc_idx);
 }
diff --git a/src/exec/exec.hpp b/src/exec/exec.hpp
index ec455654c..8b2e6e6b8 100644
--- a/src/exec/exec.hpp
+++ b/src/exec/exec.hpp
@@ -21,6 +21,7 @@
 #include "common/request/request.hpp"
 #include "exec/thread/listener.hpp"
 #include "sched/extra_sched.hpp"
+#include "internal_types.hpp"
 
 #include <memory>
 #include <vector>
@@ -72,7 +73,7 @@ class alignas(CACHELINE_SIZE) ccl_executor {
     size_t get_worker_count() const;
 
     // TODO: Rework to support listener
-    //    ccl_status_t create_listener(ccl_resize_fn_t resize_func);
+    //    ccl::status create_listener(ccl_resize_fn_t resize_func);
     void update_workers();
     void lock_workers();
     void unlock_workers();
@@ -90,7 +91,6 @@ class alignas(CACHELINE_SIZE) ccl_executor {
     static atl_attr_t generate_atl_attr(const ccl::env_data& env);
 
 private:
-
     size_t get_worker_idx_round_robin(ccl_sched* sched);
     size_t get_worker_idx_by_sched_id(ccl_sched* sched);
 
diff --git a/src/exec/thread/base_thread.cpp b/src/exec/thread/base_thread.cpp
index 57dfe51bd..1a2544799 100644
--- a/src/exec/thread/base_thread.cpp
+++ b/src/exec/thread/base_thread.cpp
@@ -17,7 +17,7 @@
 #include "common/utils/yield.hpp"
 #include "exec/thread/base_thread.hpp"
 
-ccl_status_t ccl_base_thread::start(int affinity) {
+ccl::status ccl_base_thread::start(int affinity) {
     LOG_DEBUG(name(), " ", idx);
 
     start_affinity = affinity;
@@ -34,17 +34,17 @@ ccl_status_t ccl_base_thread::start(int affinity) {
     if (err) {
         LOG_ERROR(
             "error while creating ", name(), " thread #", idx, " pthread_create returns ", err);
-        return ccl_status_runtime_error;
+        return ccl::status::runtime_error;
     }
 
     while (!started.load(std::memory_order_relaxed)) {
         ccl_yield(ccl::global_data::env().yield_type);
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_base_thread::stop() {
+ccl::status ccl_base_thread::stop() {
     LOG_DEBUG(name(), " # ", idx);
 
     void* exit_code;
@@ -69,10 +69,10 @@ ccl_status_t ccl_base_thread::stop() {
                   ")");
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_base_thread::set_affinity(int affinity) {
+ccl::status ccl_base_thread::set_affinity(int affinity) {
     LOG_DEBUG(name(), " # ", idx, ", affinity ", affinity);
 
     int pthread_err;
@@ -83,15 +83,15 @@ ccl_status_t ccl_base_thread::set_affinity(int affinity) {
 
     if ((pthread_err = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset)) != 0) {
         LOG_ERROR("pthread_setaffinity_np failed, err ", pthread_err);
-        return ccl_status_runtime_error;
+        return ccl::status::runtime_error;
     }
 
     if (get_affinity() != affinity) {
         LOG_ERROR(name(), " ", idx, " is not pinned ", affinity);
-        return ccl_status_runtime_error;
+        return ccl::status::runtime_error;
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 int ccl_base_thread::get_affinity() {
diff --git a/src/exec/thread/base_thread.hpp b/src/exec/thread/base_thread.hpp
index a8c2de0fd..f3f6bb124 100644
--- a/src/exec/thread/base_thread.hpp
+++ b/src/exec/thread/base_thread.hpp
@@ -19,6 +19,7 @@
 #include <pthread.h>
 
 #include "common/log/log.hpp"
+#include "internal_types.hpp"
 
 #define CCL_UNDEFINED_CPU_ID (-1)
 
@@ -38,8 +39,8 @@ class ccl_base_thread {
     ccl_base_thread& operator=(const ccl_base_thread&) = delete;
     ccl_base_thread& operator=(ccl_base_thread&&) = delete;
 
-    ccl_status_t start(int affinity);
-    ccl_status_t stop();
+    ccl::status start(int affinity);
+    ccl::status stop();
 
     size_t get_idx() {
         return idx;
@@ -62,7 +63,7 @@ class ccl_base_thread {
     std::atomic<bool> started{ false };
 
 private:
-    ccl_status_t set_affinity(int affinity);
+    ccl::status set_affinity(int affinity);
 
     const size_t idx;
 
diff --git a/src/exec/thread/service_worker.cpp b/src/exec/thread/service_worker.cpp
index 40a2f7937..5e35c9bd3 100644
--- a/src/exec/thread/service_worker.cpp
+++ b/src/exec/thread/service_worker.cpp
@@ -21,7 +21,7 @@ ccl_service_worker::ccl_service_worker(size_t idx,
         : ccl_worker(idx, std::move(data_queue)),
           fusion_manager(fusion_manager) {}
 
-ccl_status_t ccl_service_worker::do_work(size_t& processed_count) {
+ccl::status ccl_service_worker::do_work(size_t& processed_count) {
     fusion_manager.execute();
     return ccl_worker::do_work(processed_count);
 }
diff --git a/src/exec/thread/service_worker.hpp b/src/exec/thread/service_worker.hpp
index afcc0a243..3006e470c 100644
--- a/src/exec/thread/service_worker.hpp
+++ b/src/exec/thread/service_worker.hpp
@@ -17,6 +17,7 @@
 
 #include "exec/thread/worker.hpp"
 #include "fusion/fusion.hpp"
+#include "internal_types.hpp"
 
 class ccl_service_worker : public ccl_worker {
 public:
@@ -25,7 +26,7 @@ class ccl_service_worker : public ccl_worker {
                        ccl_fusion_manager& fusion_manager);
     ~ccl_service_worker() = default;
 
-    ccl_status_t do_work(size_t& processed_count);
+    ccl::status do_work(size_t& processed_count);
 
 private:
     ccl_fusion_manager& fusion_manager;
diff --git a/src/exec/thread/worker.cpp b/src/exec/thread/worker.cpp
index 6ceb2b6d9..722a9a730 100644
--- a/src/exec/thread/worker.cpp
+++ b/src/exec/thread/worker.cpp
@@ -38,7 +38,7 @@ void ccl_worker::add(ccl_sched* sched) {
     CCL_ASSERT(!sched->bin);
     CCL_ASSERT(sched->get_in_bin_status() != ccl_sched_in_bin_added);
 
-    if (sched->strict_start_order) {
+    if (sched->strict_order) {
         /* to keep valid non-completed req until safe releasing */
         sched->req->increase_counter(1);
         strict_sched_queue->add(sched);
@@ -48,16 +48,16 @@ void ccl_worker::add(ccl_sched* sched) {
     }
 }
 
-ccl_status_t ccl_worker::do_work(size_t& processed_count) {
+ccl::status ccl_worker::do_work(size_t& processed_count) {
     do_work_counter++;
 
     auto ret = process_strict_sched_queue();
-    if (ret != ccl_status_success)
+    if (ret != ccl::status::success)
         return ret;
 
     ret = process_sched_queue(processed_count,
                               (do_work_counter % CCL_WORKER_PROCESS_ALL_ITERS) ? false : true);
-    if (ret != ccl_status_success)
+    if (ret != ccl::status::success)
         return ret;
 
 #ifdef ENABLE_DEBUG
@@ -66,13 +66,13 @@ ccl_status_t ccl_worker::do_work(size_t& processed_count) {
     }
 #endif
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_worker::process_strict_sched_queue() {
+ccl::status ccl_worker::process_strict_sched_queue() {
     auto& queue = strict_sched_queue->peek();
     if (queue.empty())
-        return ccl_status_success;
+        return ccl::status::success;
 
     size_t erased_scheds = 0;
 
@@ -114,7 +114,7 @@ ccl_status_t ccl_worker::process_strict_sched_queue() {
                 and return to strict starting for current operation on the next call
             */
             std::vector<ccl_sched*>(sched_it, queue.end()).swap(queue);
-            return ccl_status_success;
+            return ccl::status::success;
         }
         else {
             /* now it is safe to release this sched */
@@ -124,16 +124,16 @@ ccl_status_t ccl_worker::process_strict_sched_queue() {
 
     queue.clear();
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t ccl_worker::process_sched_queue(size_t& completed_sched_count, bool process_all) {
+ccl::status ccl_worker::process_sched_queue(size_t& completed_sched_count, bool process_all) {
     completed_sched_count = 0;
     if (process_all) {
         auto bins = sched_queue->peek_all();
 
         if (bins.empty())
-            return ccl_status_success;
+            return ccl::status::success;
 
         size_t completed_sched_count_local = 0;
         for (auto& bin : bins) {
@@ -144,17 +144,17 @@ ccl_status_t ccl_worker::process_sched_queue(size_t& completed_sched_count, bool
         if (completed_sched_count)
             LOG_DEBUG("process_all, completed_sched_count ", completed_sched_count);
 
-        return ccl_status_success;
+        return ccl::status::success;
     }
     else {
         ccl_sched_bin* bin = sched_queue->peek();
         if (!bin)
-            return ccl_status_success;
+            return ccl::status::success;
         return process_sched_bin(bin, completed_sched_count);
     }
 }
 
-ccl_status_t ccl_worker::process_sched_bin(ccl_sched_bin* bin, size_t& completed_sched_count) {
+ccl::status ccl_worker::process_sched_bin(ccl_sched_bin* bin, size_t& completed_sched_count) {
     CCL_ASSERT(bin);
     completed_sched_count = 0;
 
@@ -174,7 +174,7 @@ ccl_status_t ccl_worker::process_sched_bin(ccl_sched_bin* bin, size_t& completed
 
     //    if (ccl::global_data::get().is_ft_enabled) {
     //        if (atl_status != ATL_STATUS_SUCCESS)
-    //            return ccl_status_blocked_due_to_resize;
+    //            return ccl::status::blocked_due_to_resize;
     //    }
     //    else {
     //        CCL_THROW_IF_NOT(atl_status == ATL_STATUS_SUCCESS, "bad status ", atl_status);
@@ -213,7 +213,7 @@ ccl_status_t ccl_worker::process_sched_bin(ccl_sched_bin* bin, size_t& completed
         }
     }
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 void ccl_worker::clear_queue() {
@@ -232,7 +232,7 @@ static inline bool ccl_worker_check_conditions(ccl_worker* worker,
     }
 
     if (ccl::global_data::get().is_ft_enabled &&
-        unlikely(do_work_status == ccl_status_blocked_due_to_resize ||
+        unlikely(do_work_status == ccl::status::blocked_due_to_resize ||
                  iter_count % CCL_WORKER_CHECK_UPDATE_ITERS == 0)) {
         if (worker->should_lock.load(std::memory_order_acquire)) {
             worker->clear_queue();
@@ -262,15 +262,19 @@ static inline bool ccl_worker_check_conditions(ccl_worker* worker,
 
 static void* ccl_worker_func(void* args) {
     auto worker = static_cast<ccl_worker*>(args);
-    LOG_INFO("worker_idx ", worker->get_idx());
+
+    auto worker_idx = worker->get_idx();
+
+    LOG_INFO("worker_idx ", worker_idx);
 
     size_t iter_count = 0;
     size_t processed_count = 0;
     size_t max_spin_count = ccl::global_data::env().spin_count;
     size_t spin_count = max_spin_count;
-    ccl_status_t ret;
+    ccl::status ret;
 
     ccl::global_data::get().is_worker_thread = true;
+
     worker->started = true;
 
     do {
@@ -281,13 +285,13 @@ static void* ccl_worker_func(void* args) {
                 break;
         }
         catch (ccl::exception& ccl_e) {
-            CCL_FATAL("worker ", worker->get_idx(), " caught internal exception: ", ccl_e.what());
+            CCL_FATAL("worker ", worker_idx, " caught internal exception: ", ccl_e.what());
         }
         catch (std::exception& e) {
-            CCL_FATAL("worker ", worker->get_idx(), " caught exception: ", e.what());
+            CCL_FATAL("worker ", worker_idx, " caught exception: ", e.what());
         }
         catch (...) {
-            CCL_FATAL("worker ", worker->get_idx(), " caught general exception");
+            CCL_FATAL("worker ", worker_idx, " caught general exception");
         }
 
         iter_count++;
diff --git a/src/exec/thread/worker.hpp b/src/exec/thread/worker.hpp
index 8e9dddbee..59ab3cf07 100644
--- a/src/exec/thread/worker.hpp
+++ b/src/exec/thread/worker.hpp
@@ -13,55 +13,56 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#include "exec/thread/base_thread.hpp"
-#include "sched/queue/strict_queue.hpp"
-#include "sched/queue/queue.hpp"
-
-#include <memory>
-#include <list>
-#include <pthread.h>
-
-class ccl_executor;
-
-class ccl_worker : public ccl_base_thread {
-public:
-    ccl_worker() = delete;
-    ccl_worker(const ccl_worker& other) = delete;
-    ccl_worker& operator=(const ccl_worker& other) = delete;
-    ccl_worker(size_t idx, std::unique_ptr<ccl_sched_queue> queue);
-    virtual ~ccl_worker() = default;
-    virtual void* get_this() override {
-        return static_cast<void*>(this);
-    };
-
-    virtual const std::string& name() const override {
-        static const std::string name("worker");
-        return name;
-    };
-
-    void add(ccl_sched* sched);
-
-    virtual ccl_status_t do_work(size_t& processed_count);
-
-    void clear_queue();
-
-    void reset_queue(std::unique_ptr<ccl_sched_queue>&& queue) {
-        clear_queue();
-        sched_queue = std::move(queue);
-    }
-
-    std::atomic<bool> should_lock;
-    std::atomic<bool> is_locked;
-
-private:
-    ccl_status_t process_strict_sched_queue();
-    ccl_status_t process_sched_queue(size_t& processed_count, bool process_all);
-    ccl_status_t process_sched_bin(ccl_sched_bin* bin, size_t& processed_count);
-
-    size_t do_work_counter = 0;
-
-    std::unique_ptr<ccl_strict_sched_queue> strict_sched_queue;
-    std::unique_ptr<ccl_sched_queue> sched_queue;
-};
+#pragma once
+
+#include "exec/thread/base_thread.hpp"
+#include "sched/queue/strict_queue.hpp"
+#include "sched/queue/queue.hpp"
+#include "internal_types.hpp"
+
+#include <memory>
+#include <list>
+#include <pthread.h>
+
+class ccl_executor;
+
+class ccl_worker : public ccl_base_thread {
+public:
+    ccl_worker() = delete;
+    ccl_worker(const ccl_worker& other) = delete;
+    ccl_worker& operator=(const ccl_worker& other) = delete;
+    ccl_worker(size_t idx, std::unique_ptr<ccl_sched_queue> queue);
+    virtual ~ccl_worker() = default;
+    virtual void* get_this() override {
+        return static_cast<void*>(this);
+    };
+
+    virtual const std::string& name() const override {
+        static const std::string name("worker");
+        return name;
+    };
+
+    void add(ccl_sched* sched);
+
+    virtual ccl::status do_work(size_t& processed_count);
+
+    void clear_queue();
+
+    void reset_queue(std::unique_ptr<ccl_sched_queue>&& queue) {
+        clear_queue();
+        sched_queue = std::move(queue);
+    }
+
+    std::atomic<bool> should_lock;
+    std::atomic<bool> is_locked;
+
+private:
+    ccl::status process_strict_sched_queue();
+    ccl::status process_sched_queue(size_t& processed_count, bool process_all);
+    ccl::status process_sched_bin(ccl_sched_bin* bin, size_t& processed_count);
+
+    size_t do_work_counter = 0;
+
+    std::unique_ptr<ccl_strict_sched_queue> strict_sched_queue;
+    std::unique_ptr<ccl_sched_queue> sched_queue;
+};
diff --git a/src/fusion/fusion.cpp b/src/fusion/fusion.cpp
index fc13fb754..2e910ea83 100644
--- a/src/fusion/fusion.cpp
+++ b/src/fusion/fusion.cpp
@@ -20,23 +20,23 @@
 
 #define CCL_FUSION_CHECK_SCHEDS_ITERS (1024)
 
-ccl_status_t complete_user_request(const void* ctx) {
+ccl::status complete_user_request(const void* ctx) {
     ccl_master_sched* sched = (ccl_master_sched*)ctx;
     LOG_DEBUG("complete fusion request: ", static_cast<ccl_request*>(sched));
     sched->complete();
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t release_fusion_buf(const void* ctx) {
+ccl::status release_fusion_buf(const void* ctx) {
     void* buf = (void*)ctx;
 
     if (ccl::global_data::get().fusion_manager)
         ccl::global_data::get().fusion_manager->release_buffer(buf);
 
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
-ccl_status_t release_fusion_buf_for_cached_sched(ccl_sched* sched, const void* ctx) {
+ccl::status release_fusion_buf_for_cached_sched(ccl_sched* sched, const void* ctx) {
     return release_fusion_buf(ctx);
 }
 
@@ -312,7 +312,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
 
 #ifdef CCL_ENABLE_SYCL
             if (stream && stream->is_sycl_device_stream())
-                entry_factory::make_entry<sycl_copy_device_to_host_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
                     part_scheds[idx].get(),
                     ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.sycl_send_buf),
                                exec_queue[global_copy_idx]->coll_param.count * dtype_size,
@@ -350,7 +350,7 @@ ccl_master_sched* ccl_fusion_manager::build_sched() {
 
 #ifdef CCL_ENABLE_SYCL
             if (stream && stream->is_sycl_device_stream())
-                entry_factory::make_entry<sycl_copy_host_to_device_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
                     part_scheds[idx].get(),
                     ccl_buffer(fusion_buf, buf_cache.get_buf_size(), offset),
                     ccl_buffer(&(exec_queue[global_copy_idx]->coll_param.sycl_recv_buf),
diff --git a/src/init_attr_impl.hpp b/src/init_attr_impl.hpp
new file mode 100644
index 000000000..b4a62c4fc
--- /dev/null
+++ b/src/init_attr_impl.hpp
@@ -0,0 +1,69 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/init_attr_ids.hpp"
+#include "oneapi/ccl/init_attr_ids_traits.hpp"
+#include "oneapi/ccl/init_attr.hpp"
+
+namespace ccl {
+
+class init_attr_impl {
+public:
+    /**
+     * `version` operations: the value is set in the constructor, get returns it, set throws
+     */
+    using version_traits_t = detail::ccl_api_type_attr_traits<init_attr_id, init_attr_id::version>;
+
+    const typename version_traits_t::return_type& get_attribute_value(
+        const version_traits_t& /*id*/) const { // the traits object only selects the overload
+        return version;
+    }
+
+    typename version_traits_t::return_type set_attribute_value(typename version_traits_t::type val,
+                                                               const version_traits_t& t) {
+        (void)val;
+        (void)t;
+        throw ccl::exception("Set value for 'ccl::init_attr_id::version' is not allowed");
+    }
+
+    init_attr_impl(const typename version_traits_t::return_type& version) : version(version) {}
+
+protected:
+    typename version_traits_t::return_type version;
+};
+
+namespace v1 {
+
+/**
+ * init_attr attributes definition
+ */
+template <init_attr_id attrId, class Value>
+Value init_attr::set(const Value& v) {
+    return get_impl()->set_attribute_value(
+        v, detail::ccl_api_type_attr_traits<init_attr_id, attrId>{});
+}
+
+template <init_attr_id attrId>
+const typename detail::ccl_api_type_attr_traits<init_attr_id, attrId>::return_type& init_attr::get()
+    const {
+    return get_impl()->get_attribute_value(
+        detail::ccl_api_type_attr_traits<init_attr_id, attrId>{});
+}
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/internal_types.hpp b/src/internal_types.hpp
new file mode 100644
index 000000000..a53abdfe1
--- /dev/null
+++ b/src/internal_types.hpp
@@ -0,0 +1,74 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+
+#include "oneapi/ccl/types.hpp"
+
+namespace ccl {
+
+enum class group_split_type : int { // TODO fill in this enum with the actual values
+    undetermined = -1,
+    //device,
+    thread,
+    process,
+    //socket,
+    //node,
+    cluster,
+
+    last_value
+};
+
+/**
+ * Supported device topology type (unscoped enum: enumerators are also visible as ccl::ring etc.)
+ */
+enum device_topology_type : int {
+    undetermined = -1,
+    ring, // == 0
+    a2a, // == 1
+
+    last_class_value // == 2, one past the last topology
+};
+
+// TODO: refactor core code and remove this enum?
+enum status : int { // unscoped: ccl::status::success and ccl::success name the same enumerator
+    success = 0,
+    out_of_resource,
+    invalid_arguments,
+    runtime_error,
+    blocked_due_to_resize,
+
+    last_value // == 5, one past blocked_due_to_resize
+};
+
+/* in_buf, in_count, in_dtype, out_buf, out_count, out_dtype, context */
+typedef void (*prologue_fn)(const void*,
+                            size_t,
+                            ccl::datatype,
+                            void**,
+                            size_t*,
+                            ccl::datatype*,
+                            const ccl::fn_context*);
+
+/* in_buf, in_count, in_dtype, out_buf, out_count, out_dtype, context */
+typedef void (*epilogue_fn)(const void*,
+                            size_t,
+                            ccl::datatype,
+                            void*,
+                            size_t*,
+                            ccl::datatype,
+                            const ccl::fn_context*);
+
+} // namespace ccl
diff --git a/src/kvs_attr_impl.hpp b/src/kvs_attr_impl.hpp
new file mode 100644
index 000000000..6a38cae35
--- /dev/null
+++ b/src/kvs_attr_impl.hpp
@@ -0,0 +1,46 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#pragma once
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/kvs_attr.hpp"
+
+namespace ccl {
+
+namespace v1 {
+
+/**
+ * kvs_attr attributes definition
+ */
+template <kvs_attr_id attrId, class Value>
+CCL_API Value kvs_attr::set(const Value& v) {
+    return get_impl()->set_attribute_value(v,
+                                           detail::ccl_api_type_attr_traits<kvs_attr_id, attrId>{});
+}
+
+template <kvs_attr_id attrId>
+CCL_API const typename detail::ccl_api_type_attr_traits<kvs_attr_id, attrId>::type& kvs_attr::get()
+    const {
+    return get_impl()->get_attribute_value(detail::ccl_api_type_attr_traits<kvs_attr_id, attrId>{});
+}
+
+template <kvs_attr_id attrId>
+CCL_API bool kvs_attr::is_valid() const noexcept {
+    return get_impl()->is_valid<attrId>();
+}
+
+} // namespace v1
+
+} // namespace ccl
diff --git a/src/kvs_impl.hpp b/src/kvs_impl.hpp
index db809dec4..085e794cd 100644
--- a/src/kvs_impl.hpp
+++ b/src/kvs_impl.hpp
@@ -14,10 +14,13 @@
  limitations under the License.
 */
 #pragma once
+
 #include <cstring>
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_kvs.hpp"
+
 #include "atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h"
+#include "common/log/log.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/kvs.hpp"
 
 namespace ccl {
 
@@ -28,15 +31,17 @@ class kvs_impl {
         inter_kvs->kvs_main_server_address_reserve(addr.data());
         inter_kvs->kvs_init(addr.data());
     }
+
     kvs_impl(const kvs::address_type& addr) : addr(addr) {
         inter_kvs = std::shared_ptr<internal_kvs>(new internal_kvs());
         inter_kvs->kvs_init(addr.data());
     }
+
     kvs::address_type get_addr() {
         return addr;
     }
 
-    vector_class<char> get(const string_class& key) const {
+    vector_class<char> get(const string_class& key) {
         char ret[128];
         inter_kvs->kvs_get_value_by_name_key(prefix.c_str(), key.c_str(), ret);
         size_t ret_len = strlen(ret);
@@ -50,8 +55,11 @@ class kvs_impl {
         return ret_vec;
     }
 
-    void set(const string_class& key, const vector_class<char>& data) const {
-        inter_kvs->kvs_set_value(prefix.c_str(), key.c_str(), data.data() ? data.data() : "");
+    void set(const string_class& key, const vector_class<char>& data) {
+        CCL_THROW_IF_NOT(!data.empty(), "data should have at least one element");
+        CCL_THROW_IF_NOT(data.back() == '\0', "data should have terminating symbol");
+        CCL_THROW_IF_NOT(data.data(), "data pointer should be non-null");
+        inter_kvs->kvs_set_value(prefix.c_str(), key.c_str(), data.data());
     }
 
     std::shared_ptr<internal_kvs> get() {
@@ -64,19 +72,21 @@ class kvs_impl {
     kvs::address_type addr;
 };
 
+namespace v1 {
+
 kvs::address_type CCL_API kvs::get_address() const {
     return pimpl->get_addr();
 }
 
-vector_class<char> CCL_API kvs::get(const string_class& key) const {
+vector_class<char> CCL_API kvs::get(const string_class& key) {
     return pimpl->get(key);
 }
 
-void CCL_API kvs::set(const string_class& key, const vector_class<char>& data) const {
+void CCL_API kvs::set(const string_class& key, const vector_class<char>& data) {
     pimpl->set(key, data);
 }
 
-CCL_API kvs::kvs(const kvs::address_type& addr) {
+CCL_API kvs::kvs(const kvs::address_type& addr, const kvs_attr& attr) {
     pimpl = std::unique_ptr<kvs_impl>(new kvs_impl(addr));
 }
 
@@ -84,10 +94,12 @@ CCL_API const kvs_impl& kvs::get_impl() {
     return *pimpl;
 }
 
-CCL_API kvs::kvs() {
+CCL_API kvs::kvs(const kvs_attr& attr) {
     pimpl = std::unique_ptr<kvs_impl>(new kvs_impl());
 }
 
 CCL_API kvs::~kvs() {}
 
+} // namespace v1
+
 } // namespace ccl
diff --git a/src/native_device_api/api_explicit_instantiation.hpp b/src/native_device_api/api_explicit_instantiation.hpp
index 2e598a084..3bc19b89f 100644
--- a/src/native_device_api/api_explicit_instantiation.hpp
+++ b/src/native_device_api/api_explicit_instantiation.hpp
@@ -14,8 +14,12 @@
  limitations under the License.
 */
 namespace native {
-template struct memory<char, ccl_device, ccl_context>;
-template struct memory<int, ccl_device, ccl_context>;
+template struct memory<int8_t, ccl_device, ccl_context>;
+template struct memory<uint8_t, ccl_device, ccl_context>;
+template struct memory<int16_t, ccl_device, ccl_context>;
+template struct memory<uint16_t, ccl_device, ccl_context>;
+template struct memory<int32_t, ccl_device, ccl_context>;
+template struct memory<uint32_t, ccl_device, ccl_context>;
 template struct memory<int64_t, ccl_device, ccl_context>;
 template struct memory<uint64_t, ccl_device, ccl_context>;
 template struct memory<float, ccl_device, ccl_context>;
diff --git a/src/native_device_api/compiler_ccl_wrappers_dispatcher.hpp b/src/native_device_api/compiler_ccl_wrappers_dispatcher.hpp
index 36dad408b..10eddc80b 100644
--- a/src/native_device_api/compiler_ccl_wrappers_dispatcher.hpp
+++ b/src/native_device_api/compiler_ccl_wrappers_dispatcher.hpp
@@ -18,7 +18,7 @@
 #if defined(MULTI_GPU_SUPPORT)
 #include "oneapi/ccl/native_device_api/export_api.hpp"
 #include "oneapi/ccl/native_device_api/l0/declarations.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 
 #ifdef CCL_ENABLE_SYCL
 #include <CL/sycl/backend/level_zero.hpp>
@@ -28,42 +28,64 @@
 #include "oneapi/ccl/native_device_api/l0/utils.hpp"
 
 namespace native {
-namespace details {
+namespace detail {
 static ccl_device_driver::device_ptr get_runtime_device_impl(const ccl::device_index_type& path) {
     return get_platform().get_device(path);
 }
-} // namespace details
+
+} // namespace detail
 
 template <class DeviceType,
           typename std::enable_if<not std::is_same<typename std::remove_cv<DeviceType>::type,
                                                    ccl::device_index_type>::value,
                                   int>::type = 0>
-CCL_API ccl_device_driver::device_ptr get_runtime_device(const DeviceType& device) {
+/*CCL_API*/ ccl_device_driver::device_ptr get_runtime_device(const DeviceType& device) {
     static_assert(std::is_same<typename ccl::unified_device_type::ccl_native_t, DeviceType>::value,
                   "Unsupported 'DeviceType'");
     size_t driver_idx = 0; // limitation for OPENCL/SYCL
     size_t device_id = 0;
 #ifdef CCL_ENABLE_SYCL
-    device_id = native::details::get_sycl_device_id(device);
+    device_id = native::detail::get_sycl_device_id(device);
 #endif
     ccl::device_index_type path(driver_idx, device_id, ccl::unused_index_value);
 
-    return details::get_runtime_device_impl(path);
+    return detail::get_runtime_device_impl(path);
 }
 
 template <class DeviceType,
           typename std::enable_if<std::is_same<typename std::remove_cv<DeviceType>::type,
                                                ccl::device_index_type>::value,
                                   int>::type = 0>
-CCL_API ccl_device_driver::device_ptr get_runtime_device(const DeviceType& device) {
-    return details::get_runtime_device_impl(device);
+/*CCL_API*/ ccl_device_driver::device_ptr get_runtime_device(const DeviceType& device) {
+    return detail::get_runtime_device_impl(device);
 }
 
+template <class ContextType>
+/*CCL_API*/ ccl_driver_context_ptr get_runtime_context(const ContextType& ctx) {
+#ifdef CCL_ENABLE_SYCL
+    static_assert(
+        std::is_same<typename std::remove_cv<ContextType>::type, cl::sycl::context>::value,
+        "Invalid ContextType");
+    auto l0_handle_ptr = ctx.template get_native<cl::sycl::backend::level_zero>();
+    if (!l0_handle_ptr) {
+        throw std::runtime_error(std::string(__FUNCTION__) +
+                                 " - failed for sycl context: handle is nullptr!");
+    }
+    auto& drivers = get_platform().get_drivers();
+    assert(drivers.size() == 1 && "Only one driver supported for L0 at now");
+    return drivers.begin()->second->create_context_from_handle(l0_handle_ptr);
+#else
+    return ctx;
+#endif
+}
 } // namespace native
 
 template native::ccl_device_driver::device_ptr native::get_runtime_device(
     const ccl::device_index_type& path);
 
+template native::ccl_driver_context_ptr native::get_runtime_context(
+    const ccl::unified_context_type::ccl_native_t& ctx);
+
 #ifdef CCL_ENABLE_SYCL
 template native::ccl_device_driver::device_ptr native::get_runtime_device(
     const cl::sycl::device& device);
diff --git a/src/native_device_api/empty/export.cpp b/src/native_device_api/empty/export.cpp
index 59789a7b4..a2fab125e 100644
--- a/src/native_device_api/empty/export.cpp
+++ b/src/native_device_api/empty/export.cpp
@@ -13,27 +13,27 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_config.h"
+#include "oneapi/ccl/config.h"
 #if !defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
 
 #include "oneapi/ccl/native_device_api/empty/export.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/log/log.hpp"
 #include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
 
 namespace ccl {
 
-generic_device_context_type<cl_backend_type::empty_backend>::ccl_native_t
-generic_device_context_type<cl_backend_type::empty_backend>::get() noexcept {
-    return /*const_cast<generic_device_context_type<cl_backend_type::l0>::ccl_native_t>*/ (
-        static_cast<const generic_device_context_type<cl_backend_type::empty_backend>*>(this)->get());
+generic_context_type<cl_backend_type::empty_backend>::ccl_native_t
+generic_context_type<cl_backend_type::empty_backend>::get() noexcept {
+    return /*const_cast<generic_context_type<cl_backend_type::l0>::ccl_native_t>*/ (
+        static_cast<const generic_context_type<cl_backend_type::empty_backend>*>(this)->get());
 }
 
-const generic_device_context_type<cl_backend_type::empty_backend>::ccl_native_t&
-generic_device_context_type<cl_backend_type::empty_backend>::get() const noexcept {
+const generic_context_type<cl_backend_type::empty_backend>::ccl_native_t&
+generic_context_type<cl_backend_type::empty_backend>::get() const noexcept {
     //TODO
     return context; //native::get_platform();
 }
-}
+} // namespace ccl
 
 #endif //#if !defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
diff --git a/src/native_device_api/interop_utils.cpp b/src/native_device_api/interop_utils.cpp
index cb15e9328..54bfbfbe6 100644
--- a/src/native_device_api/interop_utils.cpp
+++ b/src/native_device_api/interop_utils.cpp
@@ -14,7 +14,9 @@
  limitations under the License.
 */
 #include "oneapi/ccl/native_device_api/interop_utils.hpp"
+#include "common/log/log.hpp"
 #include "common/utils/enums.hpp"
+
 #if defined(MULTI_GPU_SUPPORT)
 #include "oneapi/ccl/native_device_api/l0/primitives.hpp"
 #endif
@@ -25,7 +27,7 @@
 #endif
 
 namespace native {
-namespace details {
+namespace detail {
 #if defined(MULTI_GPU_SUPPORT) && defined(CCL_ENABLE_SYCL)
 
 size_t get_sycl_device_id(const cl::sycl::device& device) {
@@ -82,8 +84,9 @@ std::string to_string(usm_support_mode val) {
         .choose(val, "UNKNOWN");
 }
 
-int get_platform_type_index(const ccl::unified_device_type::ccl_native_t& device) {
-    int index = 2; //`gpu` for default L0 backend
+size_t get_platform_type_index(const ccl::unified_device_type::ccl_native_t& device) {
+    size_t index = 2; //`gpu` for default L0 backend
+
 #ifdef CCL_ENABLE_SYCL
     if (device.is_host()) {
         index = 0;
@@ -101,41 +104,59 @@ int get_platform_type_index(const ccl::unified_device_type::ccl_native_t& device
         throw ccl::exception(std::string(__PRETTY_FUNCTION__) + " - Invalid device type");
     }
 #endif
+
     return index;
 }
 
 #if defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
-assoc_retult check_assoc_device_memory(const void* mem,
+assoc_result check_assoc_device_memory(const void* mem,
                                        const ccl::unified_device_type::ccl_native_t& device,
-                                       const ccl::unified_device_context_type::ccl_native_t& ctx) {
-    assoc_retult ret{ usm_support_mode::direct, mem, "" };
+                                       const ccl::unified_context_type::ccl_native_t& ctx) {
+    assoc_result ret{ usm_support_mode::direct, mem, "" };
+
 #ifdef CCL_ENABLE_SYCL
+
     cl::sycl::usm::alloc pointer_type = cl::sycl::get_pointer_type(mem, ctx);
 
-    using usm_thruth_table =
-        std::array<usm_support_mode, utils::enum_to_underlying(cl::sycl::usm::alloc::unknown)>;
-    constexpr int platform_types_configurations_count = 4; /*host, cpu, gpu, accel*/
-    constexpr std::array<usm_thruth_table, platform_types_configurations_count> usm_target_table{ {
+    using usm_truth_table =
+        std::array<usm_support_mode, utils::enum_to_underlying(cl::sycl::usm::alloc::unknown) + 1>;
+
+    constexpr int platform_config_count = 4; /*host, cpu, gpu, accel*/
+    constexpr std::array<usm_truth_table, platform_config_count> usm_target_table{ {
         { { usm_support_mode::direct,
             usm_support_mode::prohibited,
-            usm_support_mode::shared } }, //host conf:  host, device, shared
+            usm_support_mode::shared,
+            usm_support_mode::direct } }, //host conf:  host, device, shared, unknown
         { { usm_support_mode::direct,
             usm_support_mode::prohibited,
-            usm_support_mode::shared } }, //cpu conf:  host, device, shared
+            usm_support_mode::shared,
+            usm_support_mode::direct } }, //cpu conf:  host, device, shared, unknown
         { { usm_support_mode::prohibited,
             usm_support_mode::need_conversion,
-            usm_support_mode::shared } }, //gpu conf:  host, device, shared
+            usm_support_mode::shared,
+            usm_support_mode::prohibited } }, //gpu conf:  host, device, shared, unknown
         { { usm_support_mode::prohibited,
             usm_support_mode::prohibited,
-            usm_support_mode::shared } } //accel conf:  host, device, shared
+            usm_support_mode::shared,
+            usm_support_mode::prohibited } } //accel conf:  host, device, shared, unknown
     } };
-    int platform_configuration_type_idx = get_platform_type_index(device);
+
+    auto platform_type_index = get_platform_type_index(device);
+
+    auto pointer_type_idx = utils::enum_to_underlying(pointer_type);
+    CCL_THROW_IF_NOT(pointer_type_idx < usm_target_table[platform_type_index].size(),
+                     "usm_type index ",
+                     pointer_type_idx,
+                     " is larger that array size ",
+                     usm_target_table[platform_type_index].size());
+
     std::get<assoc_result_index::SUPPORT_MODE>(ret) =
-        usm_target_table[platform_configuration_type_idx][utils::enum_to_underlying(pointer_type)];
+        usm_target_table[platform_type_index][pointer_type_idx];
+
     if (std::get<assoc_result_index::SUPPORT_MODE>(ret) == usm_support_mode::prohibited) {
         std::stringstream ss;
         ss << "Incompatible USM type requested: " << usm_to_string(pointer_type)
-           << ", for ccl_device: " << std::to_string(platform_configuration_type_idx);
+           << ", for ccl_device: " << std::to_string(platform_type_index);
         std::get<assoc_result_index::ERROR_CAUSE>(ret) = ss.str();
     }
 #else
@@ -144,9 +165,33 @@ assoc_retult check_assoc_device_memory(const void* mem,
     return ret;
 }
 
+usm_support_mode check_assoc_device_memory(const std::vector<void*>& mems,
+                                           const ccl::unified_device_type::ccl_native_t& device,
+                                           const ccl::unified_context_type::ccl_native_t& ctx) {
+    usm_support_mode ret = usm_support_mode::direct;
+    std::string err_msg;
+    // Classify each buffer with the single-pointer overload; `direct` is returned for empty `mems`.
+    for (size_t idx = 0; idx < mems.size(); idx++) {
+        usm_support_mode mode;
+        std::tie(mode, std::ignore, err_msg) = check_assoc_device_memory(mems[idx], device, ctx);
+        // Every buffer after the first must report the same mode as its predecessor.
+        if (idx > 0)
+            CCL_THROW_IF_NOT(mode == ret, "different USM modes between buffers: ", err_msg);
+
+        ret = mode;
+        // In the SYCL path err_msg is non-empty only when the mode is `prohibited`.
+        CCL_THROW_IF_NOT((mode == usm_support_mode::direct) || (mode == usm_support_mode::shared) ||
+                             (mode == usm_support_mode::need_conversion),
+                         "unsupported USM configuration: ",
+                         err_msg);
+    }
+
+    return ret;
+}
+
 #endif //defined(MULTI_GPU_SUPPORT) || defined(CCL_ENABLE_SYCL)
 
-std::string to_string(const assoc_retult& res) {
+std::string to_string(const assoc_result& res) {
     std::stringstream ss;
     ss << "Mem: " << std::get<assoc_result_index::POINTER_VALUE>(res)
        << ", is: " << to_string(std::get<assoc_result_index::SUPPORT_MODE>(res));
@@ -156,5 +201,5 @@ std::string to_string(const assoc_retult& res) {
     }
     return ss.str();
 }
-} // namespace details
+} // namespace detail
 } // namespace native
diff --git a/src/native_device_api/l0/base.cpp b/src/native_device_api/l0/base.cpp
index e1837ac59..88a664b0c 100644
--- a/src/native_device_api/l0/base.cpp
+++ b/src/native_device_api/l0/base.cpp
@@ -57,23 +57,6 @@ std::string CCL_API to_string(ze_device_type_t type) {
     return "";
 }
 
-std::string to_string(ze_memory_access_cap_flags_t cap) {
-    std::string ret;
-    if (cap & ZE_MEMORY_ACCESS_CAP_FLAG_RW) {
-        ret += "ZE_MEMORY_ACCESS_CAP_FLAG_RW";
-    }
-    if (cap & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC ) {
-        ret += ret.empty() ? "ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC" : "|ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC";
-    }
-    if (cap & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT) {
-        ret += ret.empty() ? "ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT" : "|ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT";
-    }
-    if (cap & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC) {
-        ret += ret.empty() ? "ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC" : "|ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC";
-    }
-    return ret;
-}
-
 std::string CCL_API to_string(ze_memory_type_t type) {
     switch (type) {
         case ZE_MEMORY_TYPE_UNKNOWN: return "ZE_MEMORY_TYPE_UNKNOWN";
@@ -88,6 +71,26 @@ std::string CCL_API to_string(ze_memory_type_t type) {
     return "";
 }
 
+std::string to_string(ze_memory_access_cap_flags_t cap) {
+    // Names of all capability bits set in `cap`, joined with '|' in the order checked below.
+    std::string names;
+    const auto add_if_set = [&](ze_memory_access_cap_flags_t bit, const char* text) {
+        if (!(cap & bit)) {
+            return;
+        }
+        if (!names.empty()) {
+            names += "|";
+        }
+        names += text;
+    };
+    add_if_set(ZE_MEMORY_ACCESS_CAP_FLAG_RW, "ZE_MEMORY_ACCESS_CAP_FLAG_RW");
+    add_if_set(ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC, "ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC");
+    add_if_set(ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT, "ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT");
+    add_if_set(ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC,
+               "ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC");
+    return names;
+}
+
 std::string CCL_API to_string(const ze_device_properties_t& device_properties,
                               const std::string& prefix) {
     std::stringstream ss;
@@ -103,7 +106,8 @@ std::string CCL_API to_string(const ze_device_properties_t& device_properties,
 
     // TODO L0: need to_string() for supported flags printing
     ss << "Supported flags: " << (bool)device_properties.flags << prefix
-       << "coreClockRate: " << device_properties.coreClockRate << prefix
+       << "coreClockRate: " << device_properties.coreClockRate
+       << prefix
        // << "maxCommandQueues: " << device_properties.maxCommandQueues << prefix
        << "maxCommandQueuePriority: " << device_properties.maxCommandQueuePriority << prefix
        << "numThreadsPerEU: " << device_properties.numThreadsPerEU << prefix
@@ -165,6 +169,29 @@ std::string CCL_API to_string(const ze_memory_allocation_properties_t& prop) {
     return ss.str();
 }
 
+std::string CCL_API to_string(const ze_device_mem_alloc_desc_t& mem_descr) {
+    // Renders an allocation descriptor as text. Only the BIAS_* bits are printed:
+    // ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32 (0x7fffffff) is the enum-width sentinel, not a flag.
+    // Throws std::runtime_error if a bit outside the known set is present; flags == 0 is valid.
+    if (mem_descr.flags &
+        ~(ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED | ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED)) {
+        throw std::runtime_error(std::string("Unknown ze_device_mem_alloc_desc_t flag: ") +
+                                 std::to_string(static_cast<int>(mem_descr.flags)));
+    }
+    std::string flag; // stays empty when no flag bit is set
+    if (mem_descr.flags & ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED) {
+        flag = "ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED";
+    }
+    if (mem_descr.flags & ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED) {
+        flag += flag.empty() ? "" : " | "; // separator only between two names
+        flag += "ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED";
+    }
+    std::stringstream ss;
+    ss << "stype: " << mem_descr.stype << ", pNext: " << (void*)mem_descr.pNext
+       << ", flags: " << flag << ", ordinal: " << mem_descr.ordinal;
+    return ss.str();
+}
+
 // TODO L0: need to_string() for supported flags printing
 std::string CCL_API to_string(const ze_device_p2p_properties_t& properties) {
     std::stringstream ss;
diff --git a/src/native_device_api/l0/context.cpp b/src/native_device_api/l0/context.cpp
index 6b6ff1553..1e2f13ab2 100644
--- a/src/native_device_api/l0/context.cpp
+++ b/src/native_device_api/l0/context.cpp
@@ -20,10 +20,34 @@
 #include "oneapi/ccl/native_device_api/l0/driver.hpp"
 #include "oneapi/ccl/native_device_api/l0/platform.hpp"
 
-#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
-
 namespace native {
 
-  ccl_context::ccl_context(handle_t h, owner_ptr_t&& platform): 
-            base(h, std::move(platform), std::weak_ptr<ccl_context> { }) {  }
-}// namespace native
+ccl_context::ccl_context(handle_t h, owner_ptr_t&& platform)
+        : base(h, std::move(platform), std::weak_ptr<ccl_context>{}) {}
+
+// Thread safe array
+context_array_t::context_array_accessor context_array_t::access() {
+    return context_array_accessor(m, contexts);
+}
+
+// Thread safe context storage holder
+ze_context_handle_t ccl_context_holder::get() {
+    return nullptr; // no stored context is consulted here; the result is always nullptr
+}
+
+std::shared_ptr<ccl_context> ccl_context_holder::emplace(ccl_device_driver* key,
+                                                         std::shared_ptr<ccl_context>&& ctx) {
+    std::unique_lock<std::mutex> lock(m); //TODO use shared lock
+    // Append `ctx` to the context array kept for driver `key` and return the stored pointer.
+    context_array_t& cont = drivers_context[key];
+    auto acc = cont.access(); // accessor is built from the array's own `m` and `contexts`
+    acc.get().push_back(std::move(ctx));
+    return acc.get().back();
+}
+
+context_array_t& ccl_context_holder::get_context_storage(ccl_device_driver* driver) {
+    std::unique_lock<std::mutex> lock(m); //TODO use shared lock
+    context_array_t& cont = drivers_context[driver]; // looked up while `m` is held
+    return cont; // NOTE(review): the reference is used by the caller after `lock` is released
+}
+} // namespace native
diff --git a/src/native_device_api/l0/device.cpp b/src/native_device_api/l0/device.cpp
index 87686dc2b..61dd12969 100644
--- a/src/native_device_api/l0/device.cpp
+++ b/src/native_device_api/l0/device.cpp
@@ -37,25 +37,32 @@ uint32_t get_device_properties_from_handle(ccl_device::handle_t handle) {
     return device_properties.deviceId;
 }
 
-details::cross_device_rating property_p2p_rating_calculator(const native::ccl_device& lhs,
-                                                            const native::ccl_device& rhs,
-                                                            size_t weight) {
+detail::cross_device_rating property_p2p_rating_calculator(const native::ccl_device& lhs,
+                                                           const native::ccl_device& rhs,
+                                                           size_t weight) {
     ze_device_p2p_properties_t p2p = lhs.get_p2p_properties(rhs);
     if (p2p.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS)
         return weight;
-    else
-        return 0;
+    else {
+        ze_bool_t access;
+        ze_result_t ret = zeDeviceCanAccessPeer(lhs.handle, rhs.handle, &access);
+        if (ret != ZE_RESULT_SUCCESS) {
+            throw std::runtime_error(std::string("Cannot execute zeDeviceCanAccessPeer, error: ") +
+                                     native::to_string(ret));
+        }
+        return access ? weight : 0;
+    }
 }
 
 CCL_API
 std::shared_ptr<ccl_device> ccl_device::create(
     handle_t handle,
     owner_ptr_t&& driver,
-    const ccl::device_indices_t& indexes /* = ccl::device_indices_t()*/) {
+    const ccl::device_indices_type& indexes /* = ccl::device_indices_type()*/) {
     // TODO - dirty code
     owner_ptr_t shared_driver(std::move(driver));
-    std::shared_ptr<ccl_device> device =
-        std::make_shared<ccl_device>(handle, shared_driver.lock()->get_ptr(), shared_driver.lock()->get_driver_contexts());
+    std::shared_ptr<ccl_device> device = std::make_shared<ccl_device>(
+        handle, shared_driver.lock()->get_ptr(), shared_driver.lock()->get_driver_contexts());
 
     auto collected_subdevices_list = ccl_subdevice::get_handles(*device);
 
@@ -98,7 +105,7 @@ std::shared_ptr<ccl_device> ccl_device::create(
 CCL_API
 ccl_device::indexed_handles ccl_device::get_handles(
     const ccl_device_driver& driver,
-    const ccl::device_indices_t& requested_device_indexes /* = indices()*/) {
+    const ccl::device_indices_type& requested_device_indexes /* = indices()*/) {
     uint32_t devices_count = 0;
     ze_result_t err = zeDeviceGet(driver.handle, &devices_count, nullptr);
     if (err != ZE_RESULT_SUCCESS) {
@@ -118,7 +125,7 @@ ccl_device::indexed_handles ccl_device::get_handles(
 
     //filter indices by driver id
     auto parent_id = driver.get_driver_id();
-    ccl::device_indices_t filtered_ids;
+    ccl::device_indices_type filtered_ids;
     if (!requested_device_indexes.empty()) {
         for (const auto& index : requested_device_indexes) {
             if (std::get<ccl::device_index_enum::driver_index_id>(index) == parent_id) {
@@ -191,10 +198,14 @@ void ccl_device::initialize_device_data() {
     }
 }
 
-ccl_device::ccl_device(handle_t h, owner_ptr_t&& parent, std::weak_ptr<ccl_context_holder>&& ctx, std::false_type)
+ccl_device::ccl_device(handle_t h,
+                       owner_ptr_t&& parent,
+                       std::weak_ptr<ccl_context_holder>&& ctx,
+                       std::false_type)
         : base(h, std::move(parent), std::move(ctx)) {}
 
-ccl_device::ccl_device(handle_t h, owner_ptr_t&& parent, std::weak_ptr<ccl_context_holder>&& ctx) : base(h, std::move(parent), std::move(ctx)) {
+ccl_device::ccl_device(handle_t h, owner_ptr_t&& parent, std::weak_ptr<ccl_context_holder>&& ctx)
+        : base(h, std::move(parent), std::move(ctx)) {
     initialize_device_data();
 }
 
@@ -287,7 +298,8 @@ CCL_API ccl::device_index_type ccl_device::get_device_path() const {
 
 CCL_API ze_device_p2p_properties_t
 ccl_device::get_p2p_properties(const ccl_device& remote_device) const {
-    ze_device_p2p_properties_t pP2PProperties;
+    ze_device_p2p_properties_t pP2PProperties = { .stype = ZE_STRUCTURE_TYPE_DEVICE_P2P_PROPERTIES,
+                                                  .pNext = nullptr };
     ze_result_t ret = zeDeviceGetP2PProperties(handle, remote_device.handle, &pP2PProperties);
     if (ret != ZE_RESULT_SUCCESS) {
         throw std::runtime_error(std::string("Cannot execute zeDeviceGetP2PProperties, error: ") +
@@ -331,18 +343,18 @@ CCL_API const ze_device_mem_alloc_desc_t& ccl_device::get_default_mem_alloc_desc
 
 CCL_API const ze_host_mem_alloc_desc_t& ccl_device::get_default_host_alloc_desc() {
     static const ze_host_mem_alloc_desc_t common{
-        .stype      = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
-        .pNext      = NULL,
-        .flags      = 0,
+        .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC,
+        .pNext = NULL,
+        .flags = 0,
     };
     return common;
 }
 
-CCL_API ccl_device::device_queue ccl_device::create_cmd_queue(std::shared_ptr<ccl_context> ctx,
+CCL_API ccl_device::device_queue ccl_device::create_cmd_queue(
+    std::shared_ptr<ccl_context> ctx,
     const ze_command_queue_desc_t& properties /* = get_default_queue_desc()*/) {
-
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
     ze_command_queue_handle_t hCommandQueue;
@@ -354,8 +366,8 @@ CCL_API ccl_device::device_queue ccl_device::create_cmd_queue(std::shared_ptr<cc
     return device_queue(hCommandQueue, get_ptr(), ctx);
 }
 
-CCL_API ze_fence_handle_t ccl_device::create_or_get_fence(const device_queue& queue, 
-                                                          std::shared_ptr<ccl_context> ctx) {
+CCL_API ccl_device::device_queue_fence& ccl_device::get_fence(const device_queue& queue,
+                                                              std::shared_ptr<ccl_context> ctx) {
     //TODO not optimal
     std::unique_lock<std::mutex> lock(queue_mutex);
     auto fence_it = queue_fences.find(queue.handle);
@@ -375,7 +387,7 @@ CCL_API ze_fence_handle_t ccl_device::create_or_get_fence(const device_queue& qu
         device_queue_fence f(h, get_ptr(), ctx);
         fence_it = queue_fences.emplace(queue.handle, std::move(f)).first;
     }
-    return fence_it->second.handle;
+    return fence_it->second;
 }
 
 CCL_API void* ccl_device::device_alloc_memory(size_t bytes_count,
@@ -384,15 +396,15 @@ CCL_API void* ccl_device::device_alloc_memory(size_t bytes_count,
                                               const ze_host_mem_alloc_desc_t& host_descr,
                                               std::shared_ptr<ccl_context> ctx) {
     void* out_ptr = nullptr;
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
     ze_result_t
         ret = //zeDriverAllocSharedMem(get_owner()->handle, handle, flags, ordinal, ZE_HOST_MEM_ALLOC_FLAG_DEFAULT, bytes_count, alignment, &out_ptr);
         //zeDriverAllocHostMem(get_owner()->handle, ZE_HOST_MEM_ALLOC_FLAG_DEFAULT, bytes_count, alignment, &out_ptr);
-        zeMemAllocShared(
-            ctx->get(), &mem_descr, &host_descr, bytes_count, alignment, handle, &out_ptr);
+        zeMemAllocDevice(
+            ctx->get(), &mem_descr, /*&host_descr, */ bytes_count, alignment, handle, &out_ptr);
     if (ret != ZE_RESULT_SUCCESS) {
         throw std::runtime_error(std::string("cannot allocate memory, error: ") +
                                  std::to_string(ret));
@@ -407,8 +419,8 @@ CCL_API void* ccl_device::device_alloc_shared_memory(size_t bytes_count,
                                                      const ze_device_mem_alloc_desc_t& mem_descr,
                                                      std::shared_ptr<ccl_context> ctx) {
     void* out_ptr = nullptr;
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
     ze_result_t ret = zeMemAllocShared(
@@ -431,8 +443,7 @@ CCL_API ccl_device::handle_t ccl_device::get_assoc_device_handle(const void* ptr
     // TODO: empty
     ze_context_handle_t ctx_tmp = nullptr;
 
-    ze_result_t result =
-        zeMemGetAllocProperties(ctx_tmp, ptr, &mem_prop, &alloc_device_handle);
+    ze_result_t result = zeMemGetAllocProperties(ctx_tmp, ptr, &mem_prop, &alloc_device_handle);
     if (result != ZE_RESULT_SUCCESS) {
         throw std::runtime_error(std::string("Cannot zeMemGetAllocProperties: ") +
                                  native::to_string(result));
@@ -444,8 +455,8 @@ CCL_API void ccl_device::device_free_memory(void* mem_handle, std::shared_ptr<cc
     if (!mem_handle) {
         return;
     }
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
     if (zeMemFree(ctx->get(), mem_handle) != ZE_RESULT_SUCCESS) {
@@ -454,15 +465,15 @@ CCL_API void ccl_device::device_free_memory(void* mem_handle, std::shared_ptr<cc
 }
 
 CCL_API ccl_device::device_ipc_memory_handle ccl_device::create_ipc_memory_handle(
-    void* device_mem_ptr, std::shared_ptr<ccl_context> ctx) {
+    void* device_mem_ptr,
+    std::shared_ptr<ccl_context> ctx) {
     ze_ipc_mem_handle_t ipc_handle;
 
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
-    ze_result_t ret = zeMemGetIpcHandle(
-        ctx->get(), device_mem_ptr, &ipc_handle);
+    ze_result_t ret = zeMemGetIpcHandle(ctx->get(), device_mem_ptr, &ipc_handle);
     if (ret != ZE_RESULT_SUCCESS) {
         throw std::runtime_error(std::string("cannot get ipc mem handle, error: ") +
                                  native::to_string(ret));
@@ -471,7 +482,8 @@ CCL_API ccl_device::device_ipc_memory_handle ccl_device::create_ipc_memory_handl
 }
 
 CCL_API std::shared_ptr<ccl_device::device_ipc_memory_handle>
-ccl_device::create_shared_ipc_memory_handle(void* device_mem_ptr, std::shared_ptr<ccl_context> ctx) {
+ccl_device::create_shared_ipc_memory_handle(void* device_mem_ptr,
+                                            std::shared_ptr<ccl_context> ctx) {
     ze_ipc_mem_handle_t ipc_handle;
     //TODO thread-safety
     auto it = ipc_storage.find(device_mem_ptr);
@@ -479,12 +491,11 @@ ccl_device::create_shared_ipc_memory_handle(void* device_mem_ptr, std::shared_pt
         return it->second;
     }
 
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
-    ze_result_t ret = zeMemGetIpcHandle(
-        ctx->get(), device_mem_ptr, &ipc_handle);
+    ze_result_t ret = zeMemGetIpcHandle(ctx->get(), device_mem_ptr, &ipc_handle);
     if (ret != ZE_RESULT_SUCCESS) {
         throw std::runtime_error(std::string("cannot get ipc mem handle, error: ") +
                                  native::to_string(ret));
@@ -517,7 +528,8 @@ void CCL_API ccl_device::on_delete(ze_ipc_mem_handle_t& ipc_mem_handle, ze_conte
 }
 
 CCL_API ccl_device::device_ipc_memory ccl_device::get_ipc_memory(
-    std::shared_ptr<device_ipc_memory_handle>&& ipc_handle, std::shared_ptr<ccl_context> ctx) {
+    std::shared_ptr<device_ipc_memory_handle>&& ipc_handle,
+    std::shared_ptr<ccl_context> ctx) {
     assert(ipc_handle->get_owner().lock().get() == this && "IPC handle doesn't belong to device: ");
     //, this,
     // ", expected device: ", ipc_handle.get_owner());
@@ -525,12 +537,12 @@ CCL_API ccl_device::device_ipc_memory ccl_device::get_ipc_memory(
     ze_ipc_memory_flag_t flag = ZE_IPC_MEMORY_FLAG_TBD;
     ip_memory_elem_t ipc_memory{};
 
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
-    ze_result_t ret = zeMemOpenIpcHandle(
-        ctx->get(), handle, ipc_handle->handle, flag, &(ipc_memory.pointer));
+    ze_result_t ret =
+        zeMemOpenIpcHandle(ctx->get(), handle, ipc_handle->handle, flag, &(ipc_memory.pointer));
     if (ret != ZE_RESULT_SUCCESS) {
         throw std::runtime_error(std::string("cannot get open ipc mem handle from: ") +
                                  native::to_string(ipc_handle->handle) +
@@ -547,17 +559,18 @@ CCL_API ccl_device::device_ipc_memory ccl_device::get_ipc_memory(
 }
 
 CCL_API std::shared_ptr<ccl_device::device_ipc_memory> ccl_device::restore_shared_ipc_memory(
-    std::shared_ptr<device_ipc_memory_handle>&& ipc_handle, std::shared_ptr<ccl_context> ctx) {
+    std::shared_ptr<device_ipc_memory_handle>&& ipc_handle,
+    std::shared_ptr<ccl_context> ctx) {
     assert(ipc_handle->get_owner().lock().get() == this && "IPC handle doesn't belong to device: ");
     ze_ipc_memory_flag_t flag = ZE_IPC_MEMORY_FLAG_TBD;
     ip_memory_elem_t ipc_memory{};
 
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
-    ze_result_t ret = zeMemOpenIpcHandle(
-        ctx->get(), handle, ipc_handle->handle, flag, &(ipc_memory.pointer));
+    ze_result_t ret =
+        zeMemOpenIpcHandle(ctx->get(), handle, ipc_handle->handle, flag, &(ipc_memory.pointer));
     if (ret != ZE_RESULT_SUCCESS) {
         throw std::runtime_error(std::string("cannot get open ipc mem handle from: ") +
                                  native::to_string(ipc_handle->handle) +
@@ -588,7 +601,8 @@ void CCL_API ccl_device::on_delete(ip_memory_elem_t& ipc_mem, ze_context_handle_
 }
 
 CCL_API ccl_device::device_queue& ccl_device::get_cmd_queue(
-    const ze_command_queue_desc_t& properties, std::shared_ptr<ccl_context> ctx) {
+    const ze_command_queue_desc_t& properties,
+    std::shared_ptr<ccl_context> ctx) {
     std::unique_lock<std::mutex> lock(queue_mutex);
     auto it = cmd_queus.find(properties);
     if (it == cmd_queus.end()) {
@@ -598,21 +612,21 @@ CCL_API ccl_device::device_queue& ccl_device::get_cmd_queue(
 }
 
 CCL_API
-ccl_device::context_storage_type ccl_device::get_device_contexts() {
+ccl_device::context_storage_type ccl_device::get_contexts() {
     return get_ctx().lock();
 }
 
 CCL_API
-std::shared_ptr<ccl_context> ccl_device::get_default_device_context() {
-    auto ctx_holder = get_device_contexts();
-    if (ctx_holder->map_context.empty())
+std::shared_ptr<ccl_context> ccl_device::get_default_context() {
+    auto ctx_holder = get_contexts();
+    auto driver = get_owner().lock();
+
+    auto& contexts_storage = ctx_holder->get_context_storage(driver.get());
+    auto acc = contexts_storage.access();
+    if (acc.get().empty())
         throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
                                  " - no default driver in context map");
-    auto &default_driver_ptr = *ctx_holder->map_context.begin();
-    if (default_driver_ptr.second.empty())
-        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
-                                 " - no default context for default driver");
-    auto ctx = *default_driver_ptr.second.begin();
+    auto ctx = *acc.get().begin();
 
     return ctx;
 }
@@ -621,8 +635,8 @@ ccl_device::device_cmd_list CCL_API
 ccl_device::create_cmd_list(std::shared_ptr<ccl_context> ctx,
                             const ze_command_list_desc_t& properties) {
     // Create a command queue
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
     ze_command_list_handle_t hCommandList;
@@ -634,11 +648,13 @@ ccl_device::create_cmd_list(std::shared_ptr<ccl_context> ctx,
     return device_cmd_list(hCommandList, get_ptr(), ctx);
 }
 
-CCL_API ccl_device::device_cmd_list& ccl_device::get_cmd_list(std::shared_ptr<ccl_context> ctx,
+CCL_API ccl_device::device_cmd_list& ccl_device::get_cmd_list(
+    std::shared_ptr<ccl_context> ctx,
     const ze_command_list_desc_t& properties /* = get_default_list_desc()*/) {
+    std::unique_lock<std::mutex> lock(list_mutex);
     auto it = cmd_lists.find(properties);
     if (it == cmd_lists.end()) {
-        it = cmd_lists.emplace(properties, create_cmd_list( ctx, properties)).first;
+        it = cmd_lists.emplace(properties, create_cmd_list(ctx, properties)).first;
     }
     return it->second;
 }
@@ -655,8 +671,8 @@ CCL_API ccl_device::device_module_ptr ccl_device::create_module(const ze_module_
     ze_module_handle_t module = nullptr;
     ze_module_build_log_handle_t build_log = nullptr;
 
-    if(!ctx) {
-        ctx = get_default_device_context();
+    if (!ctx) {
+        ctx = get_default_context();
     }
 
     ze_result_t result = zeModuleCreate(ctx->get(), handle, &descr, &module, &build_log);
@@ -699,7 +715,8 @@ void CCL_API ccl_device::on_delete(ze_command_list_handle_t& handle, ze_context_
     //TODO remove from map: cmd_lists;
 }
 
-void CCL_API ccl_device::on_delete(ze_device_handle_t& sub_device_handle, ze_context_handle_t& ctx) {
+void CCL_API ccl_device::on_delete(ze_device_handle_t& sub_device_handle,
+                                   ze_context_handle_t& ctx) {
     auto& subdevices = get_subdevices();
     auto it = std::find_if(
         subdevices.begin(),
diff --git a/src/native_device_api/l0/driver.cpp b/src/native_device_api/l0/driver.cpp
index 3726ec2d3..0a499c667 100644
--- a/src/native_device_api/l0/driver.cpp
+++ b/src/native_device_api/l0/driver.cpp
@@ -22,11 +22,12 @@
 
 #include "oneapi/ccl/native_device_api/l0/base_impl.hpp"
 #include "oneapi/ccl/native_device_api/l0/device.hpp"
+#include "oneapi/ccl/native_device_api/l0/subdevice.hpp"
 #include "oneapi/ccl/native_device_api/l0/primitives_impl.hpp"
 #include "oneapi/ccl/native_device_api/l0/driver.hpp"
 #include "oneapi/ccl/native_device_api/l0/platform.hpp"
 
-#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
+//#include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
 
 namespace native {
 uint32_t get_driver_properties(ccl_device_driver::handle_t handle) {
@@ -45,7 +46,7 @@ ccl_device_driver::context_storage_type ccl_device_driver::get_driver_contexts()
 }
 
 ccl_device_driver::indexed_driver_handles ccl_device_driver::get_handles(
-    const ccl::device_indices_t& requested_driver_indexes /* = indices()*/) {
+    const ccl::device_indices_type& requested_driver_indexes /* = indices()*/) {
     uint32_t driver_count = 0;
     ze_result_t err = zeDriverGet(&driver_count, nullptr);
     if (err != ZE_RESULT_SUCCESS) {
@@ -84,13 +85,14 @@ CCL_API std::shared_ptr<ccl_device_driver> ccl_device_driver::create(
     auto ctx = platform.lock()->get_platform_contexts();
     std::shared_ptr<ccl_device_driver> driver =
         std::make_shared<ccl_device_driver>(h, id, std::move(platform), std::move(ctx));
-    driver->create_context();
+    if (!driver->create_context()) {
+        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + " - create context is invalid");
+    }
 
     auto collected_devices_list =
         ccl_device::get_handles(*driver, get_device_indices(rank_device_affinity));
     for (const auto& val : collected_devices_list) {
         driver->devices.emplace(val.first, ccl_device::create(val.second, driver->get_ptr()));
-
     }
 
     return driver;
@@ -100,11 +102,13 @@ CCL_API std::shared_ptr<ccl_device_driver> ccl_device_driver::create(
     handle_t h,
     uint32_t id,
     owner_ptr_t&& platform,
-    const ccl::device_indices_t& rank_device_affinity /* = ccl::device_indices_t()*/) {
+    const ccl::device_indices_type& rank_device_affinity /* = ccl::device_indices_type()*/) {
     auto ctx = platform.lock()->get_platform_contexts();
     std::shared_ptr<ccl_device_driver> driver =
         std::make_shared<ccl_device_driver>(h, id, std::move(platform), ctx);
-    driver->create_context();
+    if (!driver->create_context()) {
+        throw std::runtime_error(std::string(__PRETTY_FUNCTION__) + " - create context is invalid");
+    }
 
     auto collected_devices_list = ccl_device::get_handles(*driver, rank_device_affinity);
     try {
@@ -115,7 +119,7 @@ CCL_API std::shared_ptr<ccl_device_driver> ccl_device_driver::create(
             }
             else {
                 //collect device_index only for drvier specific index
-                ccl::device_indices_t per_driver_index;
+                ccl::device_indices_type per_driver_index;
                 for (const auto& affitinity : rank_device_affinity) {
                     if (std::get<ccl::device_index_enum::device_index_id>(affitinity) ==
                         val.first) {
@@ -188,7 +192,7 @@ CCL_API ccl_device_driver::const_device_ptr ccl_device_driver::get_device(
                                  ". Total devices count: " + std::to_string(devices.size()));
     }
 
-    const device_ptr found_device_ptr = device_it->second;
+    const_device_ptr found_device_ptr = device_it->second;
     ccl::index_type subdevice_index = std::get<ccl::device_index_enum::subdevice_index_id>(path);
     if (ccl::unused_index_value == subdevice_index) {
         return found_device_ptr;
@@ -214,9 +218,9 @@ CCL_API uint32_t ccl_device_driver::get_driver_id() const noexcept {
     return driver_id;
 }
 
-CCL_API ccl::device_indices_t ccl_device_driver::get_device_indices(
+CCL_API ccl::device_indices_type ccl_device_driver::get_device_indices(
     const ccl::device_mask_t& mask) {
-    ccl::device_indices_t ret;
+    ccl::device_indices_type ret;
     std::cerr << __PRETTY_FUNCTION__ << " NOT IMPLEMENTED" << std::endl;
     abort();
     /*
@@ -232,7 +236,7 @@ CCL_API ccl::device_indices_t ccl_device_driver::get_device_indices(
 }
 
 CCL_API ccl::device_mask_t ccl_device_driver::get_device_mask(
-    const ccl::device_indices_t& device_idx) {
+    const ccl::device_indices_type& device_idx) {
     ccl::device_mask_t ret;
     std::cerr << __PRETTY_FUNCTION__ << " NOT IMPLEMENTED" << std::endl;
     abort();
@@ -245,29 +249,32 @@ CCL_API ccl::device_mask_t ccl_device_driver::get_device_mask(
     return ret;
 }
 
-std::shared_ptr<ccl_context> ccl_device_driver::create_context() {
+CCL_API std::shared_ptr<ccl_context> ccl_device_driver::create_context() {
     ze_result_t status = ZE_RESULT_SUCCESS;
     ze_context_handle_t context;
-    ze_context_desc_t context_desc = {
-      ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+    ze_context_desc_t context_desc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0 };
 
-    auto platform = get_owner();
     status = zeContextCreate(handle, &context_desc, &context);
-    assert(status == ZE_RESULT_SUCCESS);
+    if (status != ZE_RESULT_SUCCESS) {
+        throw std::runtime_error(std::string("zeContextCreate, error: ") +
+                                 native::to_string(status));
+    }
 
-    std::vector<std::weak_ptr<ccl_context>> vec;
+    return create_context_from_handle(context);
+}
+
+CCL_API std::shared_ptr<ccl_context> ccl_device_driver::create_context_from_handle(
+    ccl_context::handle_t h) {
+    auto platform = get_owner();
     auto ctx_holder = get_ctx().lock();
-    auto &table = ctx_holder->map_context;
-    auto ret = std::make_shared<ccl_context>(context, std::move(platform));
-    table[this].push_back(ret);
 
-    return ret;
+    return ctx_holder->emplace(this, std::make_shared<ccl_context>(h, std::move(platform)));
 }
 
 void CCL_API ccl_device_driver::on_delete(ze_device_handle_t& sub_device_handle,
                                           ze_context_handle_t& context) {
-     // status = zeContextDestroy(context);
-     // assert(status == ZE_RESULT_SUCCESS);
+    // status = zeContextDestroy(context);
+    // assert(status == ZE_RESULT_SUCCESS);
 }
 
 std::string CCL_API ccl_device_driver::to_string(const std::string& prefix) const {
diff --git a/src/native_device_api/l0/export.cpp b/src/native_device_api/l0/export.cpp
index b5b6af2f5..e78820dd4 100644
--- a/src/native_device_api/l0/export.cpp
+++ b/src/native_device_api/l0/export.cpp
@@ -13,7 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_config.h"
+#include "oneapi/ccl/config.h"
 #if defined(MULTI_GPU_SUPPORT) and !defined(CCL_ENABLE_SYCL)
 
 #include "oneapi/ccl/native_device_api/l0/export.hpp"
@@ -25,53 +25,52 @@ namespace ccl {
 /**
  * Context
  */
-generic_device_context_type<cl_backend_type::l0>::generic_device_context_type() {}
-generic_device_context_type<cl_backend_type::l0>::generic_device_context_type(handle_t ctx)
-        : context() {
-    //TODO context
-    (void)ctx;
+generic_context_type<cl_backend_type::l0>::generic_context_type() {}
+generic_context_type<cl_backend_type::l0>::generic_context_type(ccl_native_t ctx) : context(ctx) {}
 
-    throw;
+generic_context_type<cl_backend_type::l0>::ccl_native_t&
+generic_context_type<cl_backend_type::l0>::get() noexcept {
+    return const_cast<generic_context_type<cl_backend_type::l0>::ccl_native_t&>(
+        static_cast<const generic_context_type<cl_backend_type::l0>*>(this)->get());
 }
 
-generic_device_context_type<cl_backend_type::l0>::ccl_native_t
-generic_device_context_type<cl_backend_type::l0>::get() noexcept {
-    return /*const_cast<generic_device_context_type<cl_backend_type::l0>::ccl_native_t>*/ (
-        static_cast<const generic_device_context_type<cl_backend_type::l0>*>(this)->get());
-}
-
-const generic_device_context_type<cl_backend_type::l0>::ccl_native_t&
-generic_device_context_type<cl_backend_type::l0>::get() const noexcept {
+const generic_context_type<cl_backend_type::l0>::ccl_native_t&
+generic_context_type<cl_backend_type::l0>::get() const noexcept {
     //TODO
     return context; //native::get_platform();
 }
 
-
 /**
  * Device
  */
-generic_device_type<cl_backend_type::l0>::generic_device_type(device_index_type id)
-        : device(id) {}
+generic_device_type<cl_backend_type::l0>::generic_device_type(device_index_type id) : device(id) {}
+
+generic_device_type<cl_backend_type::l0>::generic_device_type(ccl_native_t dev)
+        : device(dev->get_device_path()) {}
 
 device_index_type generic_device_type<cl_backend_type::l0>::get_id() const noexcept {
     return device;
 }
 
-typename generic_device_type<cl_backend_type::l0>::ccl_native_t
+typename generic_device_type<cl_backend_type::l0>::ccl_native_t&
 generic_device_type<cl_backend_type::l0>::get() noexcept {
     return native::get_runtime_device(device);
 }
 
+const typename generic_device_type<cl_backend_type::l0>::ccl_native_t&
+generic_device_type<cl_backend_type::l0>::get() const noexcept {
+    return native::get_runtime_device(device);
+}
 
 /**
  * Event
  */
 generic_event_type<cl_backend_type::l0>::generic_event_type(handle_t e)
-        : event(/*TODO use ccl_device_context to create event*/) {}
+        : event(/*TODO use ccl_context to create event*/) {}
 
-generic_event_type<cl_backend_type::l0>::ccl_native_t
+generic_event_type<cl_backend_type::l0>::ccl_native_t&
 generic_event_type<cl_backend_type::l0>::get() noexcept {
-    return const_cast<generic_event_type<cl_backend_type::l0>::ccl_native_t>(
+    return const_cast<generic_event_type<cl_backend_type::l0>::ccl_native_t&>(
         static_cast<const generic_event_type<cl_backend_type::l0>*>(this)->get());
 }
 
@@ -80,16 +79,15 @@ generic_event_type<cl_backend_type::l0>::get() const noexcept {
     return event;
 }
 
-
 /**
  * Stream
  */
 generic_stream_type<cl_backend_type::l0>::generic_stream_type(handle_t q)
         : queue(/*TODO use ccl_device to create event*/) {}
 
-generic_stream_type<cl_backend_type::l0>::ccl_native_t
+generic_stream_type<cl_backend_type::l0>::ccl_native_t&
 generic_stream_type<cl_backend_type::l0>::get() noexcept {
-    return const_cast<generic_stream_type<cl_backend_type::l0>::ccl_native_t>(
+    return const_cast<generic_stream_type<cl_backend_type::l0>::ccl_native_t&>(
         static_cast<const generic_stream_type<cl_backend_type::l0>*>(this)->get());
 }
 
@@ -98,13 +96,12 @@ generic_stream_type<cl_backend_type::l0>::get() const noexcept {
     return queue;
 }
 
-
 /**
  * Platform
  */
-generic_platform_type<cl_backend_type::l0>::ccl_native_t
+generic_platform_type<cl_backend_type::l0>::ccl_native_t&
 generic_platform_type<cl_backend_type::l0>::get() noexcept {
-    return const_cast<generic_platform_type<cl_backend_type::l0>::ccl_native_t>(
+    return const_cast<generic_platform_type<cl_backend_type::l0>::ccl_native_t&>(
         static_cast<const generic_platform_type<cl_backend_type::l0>*>(this)->get());
 }
 
@@ -112,5 +109,5 @@ const generic_platform_type<cl_backend_type::l0>::ccl_native_t&
 generic_platform_type<cl_backend_type::l0>::get() const noexcept {
     return native::get_platform();
 }
-}
+} // namespace ccl
 #endif //MULTI_GPU_SUPPORT
diff --git a/src/native_device_api/l0/platform.cpp b/src/native_device_api/l0/platform.cpp
index 0c8216482..b93b2277c 100644
--- a/src/native_device_api/l0/platform.cpp
+++ b/src/native_device_api/l0/platform.cpp
@@ -38,7 +38,7 @@ CCL_API ccl_device_platform& get_platform() {
 }
 
 CCL_API std::shared_ptr<ccl_device_platform> ccl_device_platform::create(
-    const ccl::device_indices_t& indices /* = device_indices_per_driver()*/) {
+    const ccl::device_indices_type& indices /* = device_indices_per_driver()*/) {
     std::shared_ptr<ccl_device_platform> platform(new ccl_device_platform);
     platform->init_drivers(indices);
     return platform;
@@ -88,7 +88,7 @@ CCL_API void ccl_device_platform::init_drivers(const device_affinity_per_driver&
 }
 */
 CCL_API void ccl_device_platform::init_drivers(
-    const ccl::device_indices_t& driver_device_affinities /* = device_indices_per_driver()*/) {
+    const ccl::device_indices_type& driver_device_affinities /* = device_indices_per_driver()*/) {
     /* TODO - do we need that?
 
 #ifdef CCL_ENABLE_SYCL
@@ -127,7 +127,7 @@ CCL_API void ccl_device_platform::init_drivers(
             }
             else {
                 //collect device_index only for drvier specific index
-                ccl::device_indices_t per_driver_index;
+                ccl::device_indices_type per_driver_index;
                 for (const auto& affitinity : driver_device_affinities) {
                     if (std::get<ccl::device_index_enum::driver_index_id>(affitinity) ==
                         val.first) {
@@ -153,12 +153,13 @@ CCL_API void ccl_device_platform::init_drivers(
     }
 }
 
-CCL_API 
+CCL_API
 ccl_device_platform::context_storage_type ccl_device_platform::get_platform_contexts() {
     return context;
 }
 
-std::shared_ptr<ccl_context> ccl_device_platform::create_context(std::shared_ptr<ccl_device_driver> driver) {
+std::shared_ptr<ccl_context> ccl_device_platform::create_context(
+    std::shared_ptr<ccl_device_driver> driver) {
     return driver->create_context();
 }
 
@@ -169,9 +170,7 @@ void CCL_API ccl_device_platform::on_delete(ze_driver_handle_t& sub_device_handl
 }
 
 void CCL_API ccl_device_platform::on_delete(ze_context_handle_t& handle,
-                                            ze_context_handle_t& context){
-    
-}
+                                            ze_context_handle_t& context) {}
 
 CCL_API ccl_device_platform::const_driver_ptr ccl_device_platform::get_driver(
     ccl::index_type index) const {
@@ -192,7 +191,8 @@ CCL_API ccl_device_platform::driver_ptr ccl_device_platform::get_driver(ccl::ind
     return it->second;
 }
 
-const ccl_device_platform::driver_storage_type& ccl_device_platform::get_drivers() const noexcept {
+CCL_API const ccl_device_platform::driver_storage_type& ccl_device_platform::get_drivers()
+    const noexcept {
     return drivers;
 }
 
@@ -226,23 +226,23 @@ std::string CCL_API ccl_device_platform::to_string() const {
     return out.str();
 }
 
-details::adjacency_matrix ccl_device_platform::calculate_device_access_metric(
-    const ccl::device_indices_t& indices,
-    details::p2p_rating_function func) const {
-    details::adjacency_matrix result;
+detail::adjacency_matrix ccl_device_platform::calculate_device_access_metric(
+    const ccl::device_indices_type& indices,
+    detail::p2p_rating_function func) const {
+    detail::adjacency_matrix result;
 
     try {
         // diagonal matrix, assume symmetric cross device access
-        for (typename ccl::device_indices_t::const_iterator lhs_it = indices.begin();
+        for (typename ccl::device_indices_type::const_iterator lhs_it = indices.begin();
              lhs_it != indices.end();
              ++lhs_it) {
-            for (typename ccl::device_indices_t::const_iterator rhs_it = lhs_it;
+            for (typename ccl::device_indices_type::const_iterator rhs_it = lhs_it;
                  rhs_it != indices.end();
                  ++rhs_it) {
                 ccl_device_driver::const_device_ptr lhs_dev = get_device(*lhs_it);
                 ccl_device_driver::const_device_ptr rhs_dev = get_device(*rhs_it);
 
-                details::cross_device_rating rating = func(*lhs_dev, *rhs_dev);
+                detail::cross_device_rating rating = func(*lhs_dev, *rhs_dev);
                 result[*lhs_it][*rhs_it] = rating;
                 result[*rhs_it][*lhs_it] = rating;
             }
diff --git a/src/native_device_api/l0/subdevice.cpp b/src/native_device_api/l0/subdevice.cpp
index 1427b8c58..30b0f55eb 100644
--- a/src/native_device_api/l0/subdevice.cpp
+++ b/src/native_device_api/l0/subdevice.cpp
@@ -35,7 +35,7 @@ uint32_t get_subdevice_properties_from_handle(ccl_device::handle_t handle) {
         throw std::runtime_error(std::string(__PRETTY_FUNCTION__) +
                                  "- invalid device type, got device, but subdevice requested");
     }
-    return device_properties.deviceId;
+    return device_properties.subdeviceId;
 }
 
 CCL_API
@@ -43,15 +43,15 @@ std::shared_ptr<ccl_subdevice> ccl_subdevice::create(handle_t handle,
                                                      owner_ptr_t&& device,
                                                      base::owner_ptr_t&& driver) {
     auto ctx = driver.lock()->get_driver_contexts();
-    std::shared_ptr<ccl_subdevice> subdevice =
-        std::make_shared<ccl_subdevice>(handle, std::move(device), std::move(driver), std::move(ctx));
+    std::shared_ptr<ccl_subdevice> subdevice = std::make_shared<ccl_subdevice>(
+        handle, std::move(device), std::move(driver), std::move(ctx));
     return subdevice;
 }
 
 CCL_API
 ccl_subdevice::indexed_handles ccl_subdevice::get_handles(
     const ccl_device& device,
-    const ccl::device_indices_t& requested_indices) {
+    const ccl::device_indices_type& requested_indices) {
     uint32_t subdevices_count = 0;
     ze_result_t err = zeDeviceGetSubDevices(device.get(), &subdevices_count, nullptr);
     if (err != ZE_RESULT_SUCCESS) {
@@ -71,7 +71,7 @@ ccl_subdevice::indexed_handles ccl_subdevice::get_handles(
 
     //filter indices
     ccl::device_index_type owner_path = device.get_device_path();
-    ccl::device_indices_t filtered_ids;
+    ccl::device_indices_type filtered_ids;
     if (!requested_indices.empty()) {
         for (const auto& index : requested_indices) {
             if ((std::get<ccl::device_index_enum::driver_index_id>(index) ==
@@ -119,7 +119,10 @@ ccl_subdevice::ccl_subdevice(handle_t h,
           parent_device(std::move(device)) {}
 
 CCL_API
-ccl_subdevice::ccl_subdevice(handle_t h, owner_ptr_t&& device, base::owner_ptr_t&& driver, base::context_ptr_t&& ctx)
+ccl_subdevice::ccl_subdevice(handle_t h,
+                             owner_ptr_t&& device,
+                             base::owner_ptr_t&& driver,
+                             base::context_ptr_t&& ctx)
         : //  my_enable_shared_from_this<ccl_subdevice>(),
           base(h, std::move(driver), std::move(ctx)),
           parent_device(std::move(device)) {
diff --git a/src/native_device_api/l0/utils.cpp b/src/native_device_api/l0/utils.cpp
index e83d15dc0..9ad55be9d 100644
--- a/src/native_device_api/l0/utils.cpp
+++ b/src/native_device_api/l0/utils.cpp
@@ -26,7 +26,7 @@
 #endif
 
 namespace native {
-namespace details {
+namespace detail {
 
 adjacency_matrix::adjacency_matrix(std::initializer_list<typename base::value_type> init)
         : base(init) {}
@@ -36,6 +36,6 @@ cross_device_rating binary_p2p_rating_calculator(const native::ccl_device& lhs,
                                                  size_t weight) {
     return property_p2p_rating_calculator(lhs, rhs, 1);
 }
-} // namespace details
+} // namespace detail
 } // namespace native
 #endif //#if defined(MULTI_GPU_SUPPORT)
diff --git a/src/native_device_api/sycl/export.cpp b/src/native_device_api/sycl/export.cpp
index 5cb5cf66e..12b77b4f7 100644
--- a/src/native_device_api/sycl/export.cpp
+++ b/src/native_device_api/sycl/export.cpp
@@ -13,12 +13,12 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_config.h"
+#include "oneapi/ccl/config.h"
 
 #if defined(CCL_ENABLE_SYCL) and !defined(MULTI_GPU_SUPPORT)
 
 #include "oneapi/ccl/native_device_api/sycl/export.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/log/log.hpp"
 #include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
 
@@ -27,18 +27,18 @@ namespace ccl {
 /**
  * Context
  */
-generic_device_context_type<cl_backend_type::dpcpp_sycl>::generic_device_context_type() {}
-generic_device_context_type<cl_backend_type::dpcpp_sycl>::generic_device_context_type(ccl_native_t ctx)
+generic_context_type<cl_backend_type::dpcpp_sycl>::generic_context_type() {}
+generic_context_type<cl_backend_type::dpcpp_sycl>::generic_context_type(ccl_native_t ctx)
         : context(ctx) {}
 
-generic_device_context_type<cl_backend_type::dpcpp_sycl>::ccl_native_t&
-generic_device_context_type<cl_backend_type::dpcpp_sycl>::get() noexcept {
-    return const_cast<generic_device_context_type<cl_backend_type::dpcpp_sycl>::ccl_native_t&>(
-        static_cast<const generic_device_context_type<cl_backend_type::dpcpp_sycl>*>(this)->get());
+generic_context_type<cl_backend_type::dpcpp_sycl>::ccl_native_t&
+generic_context_type<cl_backend_type::dpcpp_sycl>::get() noexcept { // reuses the const overload
+    return const_cast<generic_context_type<cl_backend_type::dpcpp_sycl>::ccl_native_t&>(
+        static_cast<const generic_context_type<cl_backend_type::dpcpp_sycl>*>(this)->get());
 }
 
-const generic_device_context_type<cl_backend_type::dpcpp_sycl>::ccl_native_t&
-generic_device_context_type<cl_backend_type::dpcpp_sycl>::get() const noexcept {
+const generic_context_type<cl_backend_type::dpcpp_sycl>::ccl_native_t&
+generic_context_type<cl_backend_type::dpcpp_sycl>::get() const noexcept {
     return context;
 }
 
@@ -109,7 +109,8 @@ CCL_API generic_device_type<cl_backend_type::dpcpp_sycl>::generic_device_type(
     device = *it;
 }
 
-generic_device_type<cl_backend_type::dpcpp_sycl>::generic_device_type(const cl::sycl::device& in_device)
+generic_device_type<cl_backend_type::dpcpp_sycl>::generic_device_type(
+    const cl::sycl::device& in_device)
         : device(in_device) {}
 
 device_index_type generic_device_type<cl_backend_type::dpcpp_sycl>::get_id() const {
@@ -122,6 +123,11 @@ generic_device_type<cl_backend_type::dpcpp_sycl>::get() noexcept {
     return device;
 }
 
+const typename generic_device_type<cl_backend_type::dpcpp_sycl>::ccl_native_t&
+generic_device_type<cl_backend_type::dpcpp_sycl>::get() const noexcept {
+    return device; // const overload: returns the stored `device` member by const reference
+}
+
 /**
  * Event
  */
@@ -138,7 +144,6 @@ generic_event_type<cl_backend_type::dpcpp_sycl>::get() const noexcept {
     return event;
 }
 
-
 /**
  * Stream
  */
@@ -155,7 +160,6 @@ generic_stream_type<cl_backend_type::dpcpp_sycl>::get() const noexcept {
     return queue;
 }
 
-
 /**
  * Platform
  */
@@ -172,5 +176,5 @@ const generic_platform_type<cl_backend_type::dpcpp_sycl>::ccl_native_t&
 generic_platform_type<cl_backend_type::dpcpp_sycl>::get() const noexcept {
     return platform;
 }
-}
+} // namespace ccl
 #endif //CCL_ENABLE_SYCL and !defined(MULTI_GPU_SUPPORT)
diff --git a/src/native_device_api/sycl_l0/export.cpp b/src/native_device_api/sycl_l0/export.cpp
index f36402901..856946fc7 100644
--- a/src/native_device_api/sycl_l0/export.cpp
+++ b/src/native_device_api/sycl_l0/export.cpp
@@ -13,11 +13,11 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#include "oneapi/ccl/ccl_config.h"
-#if defined(CCL_ENABLE_SYCL) && defined (MULTI_GPU_SUPPORT)
+#include "oneapi/ccl/config.h"
+#if defined(CCL_ENABLE_SYCL) && defined(MULTI_GPU_SUPPORT)
 
 #include "oneapi/ccl/native_device_api/sycl_l0/export.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/log/log.hpp"
 #include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
 
@@ -26,20 +26,19 @@ namespace ccl {
 /**
  * Context
  */
-generic_device_context_type<cl_backend_type::dpcpp_sycl_l0>::generic_device_context_type() {
-}
+generic_context_type<cl_backend_type::dpcpp_sycl_l0>::generic_context_type() {}
 
-generic_device_context_type<cl_backend_type::dpcpp_sycl_l0>::generic_device_context_type(ccl_native_t ctx)
+generic_context_type<cl_backend_type::dpcpp_sycl_l0>::generic_context_type(ccl_native_t ctx)
         : context(ctx) {}
 
-generic_device_context_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&
-generic_device_context_type<cl_backend_type::dpcpp_sycl_l0>::get() noexcept {
-    return const_cast<generic_device_context_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&>(
-        static_cast<const generic_device_context_type<cl_backend_type::dpcpp_sycl_l0>*>(this)->get());
+generic_context_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&
+generic_context_type<cl_backend_type::dpcpp_sycl_l0>::get() noexcept { // reuses the const overload
+    return const_cast<generic_context_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&>(
+        static_cast<const generic_context_type<cl_backend_type::dpcpp_sycl_l0>*>(this)->get());
 }
 
-const generic_device_context_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&
-generic_device_context_type<cl_backend_type::dpcpp_sycl_l0>::get() const noexcept {
+const generic_context_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&
+generic_context_type<cl_backend_type::dpcpp_sycl_l0>::get() const noexcept {
     return context;
 }
 
@@ -113,7 +112,8 @@ CCL_API generic_device_type<cl_backend_type::dpcpp_sycl_l0>::generic_device_type
     device = *it;
 }
 
-generic_device_type<cl_backend_type::dpcpp_sycl_l0>::generic_device_type(const cl::sycl::device& in_device)
+generic_device_type<cl_backend_type::dpcpp_sycl_l0>::generic_device_type(
+    const cl::sycl::device& in_device)
         : device(in_device) {}
 
 device_index_type generic_device_type<cl_backend_type::dpcpp_sycl_l0>::get_id() const {
@@ -125,11 +125,16 @@ generic_device_type<cl_backend_type::dpcpp_sycl_l0>::get() noexcept {
     return device;
 }
 
+const typename generic_device_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&
+generic_device_type<cl_backend_type::dpcpp_sycl_l0>::get() const noexcept {
+    return device; // const overload: returns the stored `device` member by const reference
+}
 
 /**
  * Event
  */
-generic_event_type<cl_backend_type::dpcpp_sycl_l0>::generic_event_type(ccl_native_t ev) : event(ev) {}
+generic_event_type<cl_backend_type::dpcpp_sycl_l0>::generic_event_type(ccl_native_t ev)
+        : event(ev) {}
 
 generic_event_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&
 generic_event_type<cl_backend_type::dpcpp_sycl_l0>::get() noexcept {
@@ -142,13 +147,13 @@ generic_event_type<cl_backend_type::dpcpp_sycl_l0>::get() const noexcept {
     return event;
 }
 
-
 /**
  * Stream
  */
-generic_stream_type<cl_backend_type::dpcpp_sycl_l0>::generic_stream_type(ccl_native_t q) : queue(q) {}
+generic_stream_type<cl_backend_type::dpcpp_sycl_l0>::generic_stream_type(ccl_native_t q)
+        : queue(q) {}
 
-generic_stream_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t &
+generic_stream_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&
 generic_stream_type<cl_backend_type::dpcpp_sycl_l0>::get() noexcept {
     return const_cast<generic_stream_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&>(
         static_cast<const generic_stream_type<cl_backend_type::dpcpp_sycl_l0>*>(this)->get());
@@ -165,7 +170,7 @@ generic_stream_type<cl_backend_type::dpcpp_sycl_l0>::get() const noexcept {
 generic_platform_type<cl_backend_type::dpcpp_sycl_l0>::generic_platform_type(ccl_native_t& pl)
         : platform(pl) {}
 
-generic_platform_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t &
+generic_platform_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&
 generic_platform_type<cl_backend_type::dpcpp_sycl_l0>::get() noexcept {
     return const_cast<generic_platform_type<cl_backend_type::dpcpp_sycl_l0>::ccl_native_t&>(
         static_cast<const generic_platform_type<cl_backend_type::dpcpp_sycl_l0>*>(this)->get());
diff --git a/src/parallelizer/parallelizer.cpp b/src/parallelizer/parallelizer.cpp
index 346f47e88..7361df9a6 100644
--- a/src/parallelizer/parallelizer.cpp
+++ b/src/parallelizer/parallelizer.cpp
@@ -21,7 +21,7 @@
 #include "sched/entry/coll/coll_entry_helper.hpp"
 #include "sched/entry/factory/entry_factory.hpp"
 
-#define CCL_BCAST_LARGE_MSG_SIZE (1024 * 1024 * 1024)
+#define CCL_ATL_LARGE_MSG_SIZE (1024 * 1024 * 1024)
 
 typedef struct {
     /* keep these 3 fields on the top of structure */
@@ -40,55 +40,55 @@ typedef struct {
     size_t dtype_size;
 } ccl_parallelizer_sparse_callback_ctx;
 
-ccl_status_t ccl_parallelizer_sparse_callback_get_buf(const void* ctx, void* field_ptr) {
+ccl::status ccl_parallelizer_sparse_callback_get_buf(const void* ctx, void* field_ptr) {
     ccl_parallelizer_sparse_callback_ctx* cctx = (ccl_parallelizer_sparse_callback_ctx*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     buf_ptr->set(cctx->buf, cctx->count * cctx->dtype_size, 0);
-    return ccl_status_success;
+    return ccl::status::success; // only exit: buffer at field_ptr was set from cctx->buf
 }
 
-ccl_status_t ccl_parallelizer_sparse_callback_get_count(const void* ctx, void* field_ptr) {
+ccl::status ccl_parallelizer_sparse_callback_get_count(const void* ctx, void* field_ptr) {
     ccl_parallelizer_sparse_callback_ctx* cctx = (ccl_parallelizer_sparse_callback_ctx*)ctx;
     size_t* count_ptr = (size_t*)field_ptr;
     *count_ptr = cctx->count;
-    return ccl_status_success;
+    return ccl::status::success; // only exit: *field_ptr now holds cctx->count
 }
 
-ccl_status_t ccl_parallelizer_prologue_get_buf(const void* ctx, void* field_ptr) {
+ccl::status ccl_parallelizer_prologue_get_buf(const void* ctx, void* field_ptr) {
     ccl_parallelizer_prologue_ctx* pctx = (ccl_parallelizer_prologue_ctx*)ctx;
     ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
     size_t dtype_size = ccl::global_data::get().dtypes->get(pctx->dt_idx).size();
     buf_ptr->set(pctx->buf,
                  pctx->count * dtype_size,
                  pctx->part_idx * (pctx->count / pctx->part_count) * dtype_size);
-    return ccl_status_success;
+    return ccl::status::success; // only exit: buffer set with a part_idx-based byte offset
 }
 
-ccl_status_t ccl_parallelizer_prologue_get_count(const void* ctx, void* field_ptr) {
+ccl::status ccl_parallelizer_prologue_get_count(const void* ctx, void* field_ptr) {
     ccl_parallelizer_prologue_ctx* pctx = (ccl_parallelizer_prologue_ctx*)ctx;
     size_t count = pctx->count / pctx->part_count;
     if (pctx->part_idx == (pctx->part_count - 1))
         count += pctx->count % pctx->part_count;
     size_t* count_ptr = (size_t*)field_ptr;
     *count_ptr = count;
-    return ccl_status_success;
+    return ccl::status::success; // only exit: last part also got count % part_count
 }
 
-ccl_status_t ccl_parallelizer_prologue_get_dtype(const void* ctx, void* field_ptr) {
+ccl::status ccl_parallelizer_prologue_get_dtype(const void* ctx, void* field_ptr) {
     ccl_parallelizer_prologue_ctx* pctx = (ccl_parallelizer_prologue_ctx*)ctx;
     ccl_datatype* dtype_ptr = (ccl_datatype*)field_ptr;
     *dtype_ptr = ccl::global_data::get().dtypes->get(pctx->dt_idx);
-    return ccl_status_success;
+    return ccl::status::success; // only exit: *field_ptr now holds the dtype for dt_idx
 }
 
-ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
+ccl::status ccl_parallelizer::process(ccl_master_sched* sched) {
     /* TODO: split on per-collective classes */
 
     CCL_ASSERT(sched);
 
     ccl::global_data& data = ccl::global_data::get();
 
-    ccl_status_t status = ccl_status_success;
+    ccl::status status = ccl::status::success;
     size_t part_count = 1, idx, base_count, dtype_size, comm_size, my_rank;
 
     ccl_coll_param& coll_param = sched->coll_param;
@@ -134,21 +134,19 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
         case ccl_coll_bcast:
             if (ccl::global_data::env().bcast_part_count != CCL_ENV_SIZET_NOT_SPECIFIED) {
                 part_count = ccl::global_data::env().bcast_part_count;
+                break;
             }
-            else {
-                /* to workaround lack of large msg protocol on ATL level */
-                part_count = (coll_param.count * dtype_size) / CCL_BCAST_LARGE_MSG_SIZE;
-                if (!part_count)
-                    part_count = max_data_partition_count;
-            }
-            break;
         case ccl_coll_reduce:
         case ccl_coll_allreduce:
-            if (coll_param.count * dtype_size <= ccl::global_data::env().max_short_size) {
+            if ((coll_param.count * dtype_size <= ccl::global_data::env().max_short_size) ||
+                (coll_param.count < max_data_partition_count)) {
                 part_count = 1;
             }
             else {
-                part_count = max_data_partition_count;
+                /* to workaround lack of large msg protocol on ATL level */
+                part_count = (coll_param.count * dtype_size) / CCL_ATL_LARGE_MSG_SIZE;
+                if (part_count < max_data_partition_count)
+                    part_count = max_data_partition_count;
             }
             break;
         case ccl_coll_alltoall:
@@ -317,7 +315,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
             for (idx = 0; idx < part_count; idx++) {
                 ccl_coll_entry_param param{};
                 param.ctype = ccl_coll_barrier;
-                param.dtype = ccl_datatype_char;
+                param.dtype = ccl_datatype_int8;
                 param.comm = comm;
                 coll_entry_helper::add_coll_entry<ccl_coll_barrier>(part_scheds[idx].get(), param);
             }
@@ -328,7 +326,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
             /* convert sycl buffer */
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
                 if (comm->rank() == coll_param.root) {
-                    entry_factory::make_entry<sycl_copy_device_to_host_entry>(
+                    entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
                         part_scheds[0].get(),
                         ccl_buffer(&(coll_param.sycl_buf),
                                    coll_param.count * dtype_size,
@@ -358,7 +356,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
             /* convert sycl buffer */
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
                 sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_host_to_device_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
                     part_scheds[0].get(),
                     ccl_buffer(coll_param.buf, coll_param.count * dtype_size),
                     ccl_buffer(&(coll_param.sycl_buf),
@@ -376,7 +374,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
 #ifdef CCL_ENABLE_SYCL
                 /* convert sycl buffer */
                 if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                    entry_factory::make_entry<sycl_copy_device_to_host_entry>(
+                    entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
                         part_scheds[0].get(),
                         ccl_buffer(&(coll_param.sycl_send_buf),
                                    coll_param.count * dtype_size,
@@ -411,7 +409,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
                 sched->sync_partial_scheds();
                 if (comm->rank() == coll_param.root) {
-                    entry_factory::make_entry<sycl_copy_host_to_device_entry>(
+                    entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
                         part_scheds[0].get(),
                         ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size),
                         ccl_buffer(&(coll_param.sycl_recv_buf),
@@ -430,12 +428,13 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
 #ifdef CCL_ENABLE_SYCL
                 /* convert sycl buffer */
                 if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                    entry_factory::make_entry<sycl_copy_device_to_host_entry>(
+                    entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
                         part_scheds[0].get(),
                         ccl_buffer(&(coll_param.sycl_send_buf),
                                    coll_param.count * comm_size * dtype_size,
                                    ccl_buffer_type::INDIRECT),
-                        ccl_buffer((void*)coll_param.send_buf, coll_param.count * comm_size * dtype_size),
+                        ccl_buffer((void*)coll_param.send_buf,
+                                   coll_param.count * comm_size * dtype_size),
                         coll_param.count * comm_size,
                         dtype,
                         coll_param.stream);
@@ -455,26 +454,25 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
                                             coll_param.count * comm_size * dtype_size,
                                             offsets[idx],
                                             ccl_buffer_type::INDIRECT);
-                param.recv_buf = ccl_buffer(&(coll_param.recv_buf),
-                                            recv_buf_size,
-                                            offsets[idx],
-                                            ccl_buffer_type::INDIRECT);
+                param.recv_buf = ccl_buffer(
+                    &(coll_param.recv_buf), recv_buf_size, offsets[idx], ccl_buffer_type::INDIRECT);
                 param.count = counts[idx];
                 param.dtype = dtype;
                 param.reduction = coll_param.reduction;
                 param.comm = comm;
-                coll_entry_helper::add_coll_entry<ccl_coll_reduce_scatter>(part_scheds[idx].get(), param);
+                coll_entry_helper::add_coll_entry<ccl_coll_reduce_scatter>(part_scheds[idx].get(),
+                                                                           param);
             }
 #ifdef CCL_ENABLE_SYCL
             /* convert sycl buffer */
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
                 sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_host_to_device_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
                     part_scheds[0].get(),
                     ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size),
                     ccl_buffer(&(coll_param.sycl_recv_buf),
-                                coll_param.count * dtype_size,
-                                ccl_buffer_type::INDIRECT),
+                               coll_param.count * dtype_size,
+                               ccl_buffer_type::INDIRECT),
                     coll_param.count,
                     dtype,
                     coll_param.stream);
@@ -488,7 +486,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
 #ifdef CCL_ENABLE_SYCL
             /* convert sycl buffer */
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                entry_factory::make_entry<sycl_copy_device_to_host_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
                     part_scheds[0].get(),
                     ccl_buffer(&(coll_param.sycl_send_buf),
                                coll_param.count * dtype_size,
@@ -537,7 +535,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
                         ccl_buffer(main_ctx, sizeof(ccl_parallelizer_prologue_ctx)),
                         ccl_buffer(part_ctx, sizeof(ccl_parallelizer_prologue_ctx)),
                         sizeof(void*) + sizeof(size_t) + sizeof(ccl::datatype),
-                        ccl_datatype_char);
+                        ccl_datatype_int8);
                 }
             }
 
@@ -560,7 +558,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
                     param.send_buf = ccl_buffer();
                     param.recv_buf = ccl_buffer();
                     param.count = 0;
-                    param.dtype = ccl_datatype_char;
+                    param.dtype = ccl_datatype_int8;
                 }
                 param.reduction = coll_param.reduction;
                 param.comm = comm;
@@ -591,7 +589,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
                                                                      coll_param.count * dtype_size,
                                                                      ccl_buffer_type::INDIRECT),
                                                           0, /* count */
-                                                          ccl_datatype_char);
+                                                          ccl_datatype_int8);
                 entry->set_field_fn<ccl_sched_entry_field_in_buf>(
                     ccl_parallelizer_prologue_get_buf, main_ctx, false);
                 entry->set_field_fn<ccl_sched_entry_field_cnt>(
@@ -629,7 +627,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
             /* convert sycl buffer */
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
                 sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_host_to_device_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
                     part_scheds[0].get(),
                     ccl_buffer(coll_param.recv_buf, coll_param.count * dtype_size),
                     ccl_buffer(&(coll_param.sycl_recv_buf),
@@ -650,11 +648,11 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
                 size_t sycl_buf_offset = 0;
                 if (coll_param.sycl_send_buf == coll_param.sycl_recv_buf) {
                     for (int i = 0; i < my_rank; i++) {
-                        sycl_buf_offset += coll_param.recv_counts[i] * dtype_size;
+                        sycl_buf_offset += coll_param.recv_counts[i];
                     }
                     LOG_TRACE("sycl_buf_offset = ", sycl_buf_offset);
                 }
-                entry_factory::make_entry<sycl_copy_device_to_host_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
                     part_scheds[0].get(),
                     ccl_buffer(&(coll_param.sycl_send_buf),
                                coll_param.send_count * dtype_size,
@@ -792,7 +790,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
             /* convert sycl buffer */
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
                 sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_host_to_device_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
                     part_scheds[0].get(),
                     ccl_buffer(coll_param.recv_buf, ag_recv_bytes),
                     ccl_buffer(
@@ -809,7 +807,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
 #ifdef CCL_ENABLE_SYCL
             /* convert sycl buffer */
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
-                entry_factory::make_entry<sycl_copy_device_to_host_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::d2h>>(
                     part_scheds[0].get(),
                     ccl_buffer(
                         &(coll_param.sycl_send_buf), a2av_send_bytes, ccl_buffer_type::INDIRECT),
@@ -858,7 +856,7 @@ ccl_status_t ccl_parallelizer::process(ccl_master_sched* sched) {
             /* convert sycl buffer */
             if (coll_param.stream && coll_param.stream->is_sycl_device_stream()) {
                 sched->sync_partial_scheds();
-                entry_factory::make_entry<sycl_copy_host_to_device_entry>(
+                entry_factory::make_entry<sycl_copy_entry<sycl_copy_direction::h2d>>(
                     part_scheds[0].get(),
                     ccl_buffer(coll_param.recv_buf, a2av_recv_bytes),
                     ccl_buffer(
diff --git a/src/parallelizer/parallelizer.hpp b/src/parallelizer/parallelizer.hpp
index 63fe697b8..008bccd42 100644
--- a/src/parallelizer/parallelizer.hpp
+++ b/src/parallelizer/parallelizer.hpp
@@ -13,25 +13,26 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
-#pragma once
-
-#include "sched/master_sched.hpp"
-
-class ccl_parallelizer {
-public:
-    ccl_parallelizer(size_t max_data_partition_count)
-            : max_data_partition_count(max_data_partition_count) {}
-
-    ~ccl_parallelizer() = default;
-
-    ccl_parallelizer(const ccl_parallelizer& other) = delete;
-    ccl_parallelizer& operator=(const ccl_parallelizer& other) = delete;
-
-    ccl_parallelizer(ccl_parallelizer&& other) = delete;
-    ccl_parallelizer& operator=(ccl_parallelizer&& other) = delete;
-
-    ccl_status_t process(ccl_master_sched* sched);
-
-private:
-    size_t max_data_partition_count;
-};
+#pragma once
+
+#include "sched/master_sched.hpp"
+#include "internal_types.hpp"
+
+class ccl_parallelizer { // holds one partition-count setting; copy and move are deleted
+public:
+    ccl_parallelizer(size_t max_data_partition_count) // value is stored unchanged
+            : max_data_partition_count(max_data_partition_count) {}
+
+    ~ccl_parallelizer() = default;
+
+    ccl_parallelizer(const ccl_parallelizer& other) = delete; // no copy construction
+    ccl_parallelizer& operator=(const ccl_parallelizer& other) = delete; // no copy assignment
+
+    ccl_parallelizer(ccl_parallelizer&& other) = delete; // no move construction
+    ccl_parallelizer& operator=(ccl_parallelizer&& other) = delete; // no move assignment
+
+    ccl::status process(ccl_master_sched* sched); // defined in parallelizer.cpp; asserts sched
+
+private:
+    size_t max_data_partition_count; // read by process() when picking part_count
+};
diff --git a/src/sched/cache/key.cpp b/src/sched/cache/key.cpp
index bf4c3e155..7196e78ac 100644
--- a/src/sched/cache/key.cpp
+++ b/src/sched/cache/key.cpp
@@ -111,8 +111,7 @@ bool ccl_sched_key::check(const ccl_coll_param& param, const ccl_coll_attr& attr
                        param.sparse_param.send_val_count == f.count2 &&
                        param.sparse_param.recv_ind_count == f.count3 &&
                        param.sparse_param.recv_val_count == f.count4 &&
-                       param.sparse_param.itype.idx() == f.itype &&
-                       param.reduction == f.reduction);
+                       param.sparse_param.itype.idx() == f.itype && param.reduction == f.reduction);
             break;
         default: CCL_THROW("unexpected coll_type ", f.ctype);
     }
@@ -174,11 +173,12 @@ size_t ccl_sched_key_hasher::operator()(const ccl_sched_key& k) const {
 
     size_t hash_value = string_hasher(k.match_id);
     if (ccl::global_data::env().cache_key_type == ccl_cache_key_full) {
-        hash_value += k.f.ctype + utils::enum_to_underlying(k.f.dtype) + 
-                      utils::enum_to_underlying(k.f.itype) + utils::enum_to_underlying(k.f.reduction) +
-                      k.f.count1 + k.f.count2 + k.f.root + (size_t)k.f.buf1 + (size_t)k.f.buf2 +
-                      (size_t)k.f.count3 + (size_t)k.f.count4 + (size_t)k.f.comm +
-                      (size_t)k.f.prologue_fn + (size_t)k.f.epilogue_fn + (size_t)k.f.reduction_fn;
+        hash_value += k.f.ctype + utils::enum_to_underlying(k.f.dtype) +
+                      utils::enum_to_underlying(k.f.itype) +
+                      utils::enum_to_underlying(k.f.reduction) + k.f.count1 + k.f.count2 +
+                      k.f.root + (size_t)k.f.buf1 + (size_t)k.f.buf2 + (size_t)k.f.count3 +
+                      (size_t)k.f.count4 + (size_t)k.f.comm + (size_t)k.f.prologue_fn +
+                      (size_t)k.f.epilogue_fn + (size_t)k.f.reduction_fn;
     }
 
     const_cast<ccl_sched_key&>(k).set_hasher_result(hash_value);
diff --git a/src/sched/cache/key.hpp b/src/sched/cache/key.hpp
index 8d173dd02..71776d4c2 100644
--- a/src/sched/cache/key.hpp
+++ b/src/sched/cache/key.hpp
@@ -60,13 +60,14 @@ class ccl_sched_key {
         void* buf1 = nullptr; /* non-data buffer which can be used for caching */
         void* buf2 = nullptr; /* non-data buffer which can be used for caching */
         ccl::datatype dtype = ccl::datatype::int8;
-        ccl::datatype itype = ccl::datatype::int8; /* used in sparse collective to store index type */
+        ccl::datatype itype =
+            ccl::datatype::int8; /* used in sparse collective to store index type */
         ccl::reduction reduction = ccl::reduction::sum;
         size_t count1 = 0;
         size_t count2 = 0;
         size_t count3 = 0; /* used in sparse collective to store recv index count */
         size_t count4 = 0; /* used in sparse collective to store recv value count */
-        size_t root = 0;
+        int root = 0;
         const ccl_comm* comm = nullptr;
         ccl::prologue_fn prologue_fn = nullptr;
         ccl::epilogue_fn epilogue_fn = nullptr;
diff --git a/src/sched/entry/coll/coll_entry.cpp b/src/sched/entry/coll/coll_entry.cpp
index 2ba7d39af..0fc91107c 100644
--- a/src/sched/entry/coll/coll_entry.cpp
+++ b/src/sched/entry/coll/coll_entry.cpp
@@ -30,7 +30,7 @@ void coll_entry::start() {
         coll_sched->set_op_id(coll_sched_op_id);
 
         auto res = coll_entry_helper::build_schedule(coll_sched.get(), sched, param);
-        CCL_ASSERT(res == ccl_status_success, "error during build_schedule, res ", res);
+        CCL_ASSERT(res == ccl::status::success, "error during build_schedule, res ", res);
     }
 
     LOG_DEBUG("starting COLL entry: ", this, ", subsched: ", coll_sched.get());
diff --git a/src/sched/entry/coll/coll_entry_helper.cpp b/src/sched/entry/coll/coll_entry_helper.cpp
index db284da76..426e3a1aa 100644
--- a/src/sched/entry/coll/coll_entry_helper.cpp
+++ b/src/sched/entry/coll/coll_entry_helper.cpp
@@ -15,10 +15,10 @@
 */
 #include "sched/entry/coll/coll_entry_helper.hpp"
 
-ccl_status_t coll_entry_helper::build_schedule(ccl_sched* sched,
-                                               const ccl_sched* parent_sched,
-                                               const ccl_coll_entry_param& param) {
-    ccl_status_t res = ccl_status_success;
+ccl::status coll_entry_helper::build_schedule(ccl_sched* sched,
+                                              const ccl_sched* parent_sched,
+                                              const ccl_coll_entry_param& param) {
+    ccl::status res = ccl::status::success;
 
     if (param.ctype == ccl_coll_allreduce || param.ctype == ccl_coll_reduce ||
         param.ctype == ccl_coll_reduce_scatter) {
diff --git a/src/sched/entry/coll/coll_entry_helper.hpp b/src/sched/entry/coll/coll_entry_helper.hpp
index eab649abb..ce222d855 100644
--- a/src/sched/entry/coll/coll_entry_helper.hpp
+++ b/src/sched/entry/coll/coll_entry_helper.hpp
@@ -47,12 +47,13 @@ class coll_entry_helper {
                         for direct MPI algo with prologue will use regular coll_entry
                         to simplify work with postponed fields
                     */
-                    sched->strict_start_order = true;
+                    sched->strict_order = true;
                 }
                 else {
                     /* otherwise will place entry directly into schedule due to performance reasons */
                     auto res = coll_entry_helper::build_schedule(sched, sched, param);
-                    CCL_ASSERT(res == ccl_status_success, "error during build_schedule, res ", res);
+                    CCL_ASSERT(
+                        res == ccl::status::success, "error during build_schedule, res ", res);
                     return nullptr; /* coll_entry ptr is required for prologue case only */
                 }
             }
@@ -62,7 +63,7 @@ class coll_entry_helper {
         return entry_factory::make_entry<coll_entry>(sched, param);
     }
 
-    static ccl_status_t build_schedule(ccl_sched* sched,
-                                       const ccl_sched* parent_sched,
-                                       const ccl_coll_entry_param& param);
+    static ccl::status build_schedule(ccl_sched* sched,
+                                      const ccl_sched* parent_sched,
+                                      const ccl_coll_entry_param& param);
 };
diff --git a/src/sched/entry/coll/coll_entry_param.hpp b/src/sched/entry/coll/coll_entry_param.hpp
index 4e10f8d87..64ce1d97d 100644
--- a/src/sched/entry/coll/coll_entry_param.hpp
+++ b/src/sched/entry/coll/coll_entry_param.hpp
@@ -28,7 +28,7 @@ struct ccl_coll_entry_param {
     const size_t* recv_counts;
     ccl_datatype dtype;
     ccl::reduction reduction;
-    size_t root;
+    int root;
     ccl_comm* comm;
 
     ccl_coll_param to_coll_param() const {
diff --git a/src/sched/entry/coll/direct/allgatherv_entry.hpp b/src/sched/entry/coll/direct/allgatherv_entry.hpp
index 4468cecdd..bf9871c0c 100644
--- a/src/sched/entry/coll/direct/allgatherv_entry.hpp
+++ b/src/sched/entry/coll/direct/allgatherv_entry.hpp
@@ -45,8 +45,8 @@ class allgatherv_entry : public base_coll_entry {
     void start() override {
         size_t dt_size = dtype.size();
         size_t send_bytes = send_cnt * dt_size;
-        size_t comm_size = comm->size();
-        size_t i;
+        int comm_size = comm->size();
+        int i;
 
         if (!recv_bytes && !offsets) {
             recv_bytes = static_cast<int*>(CCL_MALLOC(comm_size * sizeof(int), "recv_bytes"));
diff --git a/src/sched/entry/coll/direct/alltoallv_entry.hpp b/src/sched/entry/coll/direct/alltoallv_entry.hpp
index f9ea21176..8e13f7a20 100644
--- a/src/sched/entry/coll/direct/alltoallv_entry.hpp
+++ b/src/sched/entry/coll/direct/alltoallv_entry.hpp
@@ -47,8 +47,8 @@ class alltoallv_entry : public base_coll_entry {
 
     void start() override {
         size_t dt_size = dtype.size();
-        size_t comm_size = comm->size();
-        size_t i;
+        int comm_size = comm->size();
+        int i;
         sum_recv_bytes = 0;
         sum_send_bytes = 0;
 
diff --git a/src/sched/entry/coll/direct/base_coll_entry.hpp b/src/sched/entry/coll/direct/base_coll_entry.hpp
index 5d986596f..7c8284e12 100644
--- a/src/sched/entry/coll/direct/base_coll_entry.hpp
+++ b/src/sched/entry/coll/direct/base_coll_entry.hpp
@@ -21,7 +21,7 @@ class base_coll_entry : public sched_entry {
 public:
     base_coll_entry() = delete;
     base_coll_entry(ccl_sched* sched) : sched_entry(sched) {
-        sched->strict_start_order = true;
+        sched->strict_order = true;
     }
 
     bool is_strict_order_satisfied() override {
diff --git a/src/sched/entry/coll/direct/bcast_entry.hpp b/src/sched/entry/coll/direct/bcast_entry.hpp
index 99f627db6..d41d79b0b 100644
--- a/src/sched/entry/coll/direct/bcast_entry.hpp
+++ b/src/sched/entry/coll/direct/bcast_entry.hpp
@@ -28,7 +28,7 @@ class bcast_entry : public base_coll_entry {
                 ccl_buffer buf,
                 size_t cnt,
                 const ccl_datatype& dtype,
-                size_t root,
+                int root,
                 ccl_comm* comm)
             : base_coll_entry(sched),
               buf(buf),
@@ -92,7 +92,7 @@ class bcast_entry : public base_coll_entry {
 private:
     ccl_buffer buf;
     size_t cnt;
-    size_t root;
+    int root;
     ccl_datatype dtype;
     ccl_comm* comm;
     atl_req_t req{};
diff --git a/src/sched/entry/coll/direct/reduce_entry.hpp b/src/sched/entry/coll/direct/reduce_entry.hpp
index 896d1a7c8..a537b2637 100644
--- a/src/sched/entry/coll/direct/reduce_entry.hpp
+++ b/src/sched/entry/coll/direct/reduce_entry.hpp
@@ -30,7 +30,7 @@ class reduce_entry : public base_coll_entry {
                  size_t cnt,
                  const ccl_datatype& dtype,
                  ccl::reduction reduction,
-                 size_t root,
+                 int root,
                  ccl_comm* comm)
             : base_coll_entry(sched),
               send_buf(send_buf),
@@ -108,7 +108,7 @@ class reduce_entry : public base_coll_entry {
     size_t cnt;
     ccl_datatype dtype;
     ccl::reduction op;
-    size_t root;
+    int root;
     ccl_comm* comm;
     atl_req_t req{};
 };
diff --git a/src/sched/entry/coll/direct/reduce_scatter_entry.hpp b/src/sched/entry/coll/direct/reduce_scatter_entry.hpp
index a49df553c..ed29f28b9 100644
--- a/src/sched/entry/coll/direct/reduce_scatter_entry.hpp
+++ b/src/sched/entry/coll/direct/reduce_scatter_entry.hpp
@@ -25,12 +25,12 @@ class reduce_scatter_entry : public base_coll_entry {
 
     reduce_scatter_entry() = delete;
     reduce_scatter_entry(ccl_sched* sched,
-                 const ccl_buffer send_buf,
-                 ccl_buffer recv_buf,
-                 size_t recv_cnt,
-                 const ccl_datatype& dtype,
-                 ccl::reduction reduction,
-                 ccl_comm* comm)
+                         const ccl_buffer send_buf,
+                         ccl_buffer recv_buf,
+                         size_t recv_cnt,
+                         const ccl_datatype& dtype,
+                         ccl::reduction reduction,
+                         ccl_comm* comm)
             : base_coll_entry(sched),
               send_buf(send_buf),
               recv_buf(recv_buf),
@@ -50,13 +50,14 @@ class reduce_scatter_entry : public base_coll_entry {
         size_t send_bytes = send_cnt * dtype.size();
         size_t recv_bytes = recv_cnt * dtype.size();
 
-        atl_status_t atl_status = comm->atl->atl_ep_reduce_scatter(sched->bin->get_atl_ep(),
-                                                                   send_buf.get_ptr(send_bytes),
-                                                                   recv_buf.get_ptr(recv_bytes),
-                                                                   recv_cnt,
-                                                                   static_cast<atl_datatype_t>(dtype.idx()),
-                                                                   static_cast<atl_reduction_t>(op),
-                                                                   &req);
+        atl_status_t atl_status =
+            comm->atl->atl_ep_reduce_scatter(sched->bin->get_atl_ep(),
+                                             send_buf.get_ptr(send_bytes),
+                                             recv_buf.get_ptr(recv_bytes),
+                                             recv_cnt,
+                                             static_cast<atl_datatype_t>(dtype.idx()),
+                                             static_cast<atl_reduction_t>(op),
+                                             &req);
 
         if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
             CCL_THROW("REDUCE_SCATTER entry failed. atl_status: ", atl_status_to_str(atl_status));
diff --git a/src/sched/entry/copy_entry.hpp b/src/sched/entry/copy_entry.hpp
index 551e5114a..68508e08d 100644
--- a/src/sched/entry/copy_entry.hpp
+++ b/src/sched/entry/copy_entry.hpp
@@ -44,7 +44,7 @@ class copy_entry : public sched_entry,
 
         size_t bytes = cnt * dtype.size();
         auto comp_status = ccl_comp_copy(in_buf.get_ptr(bytes), out_buf.get_ptr(bytes), cnt, dtype);
-        CCL_ASSERT(comp_status == ccl_status_success, "bad status ", comp_status);
+        CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
         status = ccl_sched_entry_status_complete;
     }
 
diff --git a/src/sched/entry/entry.hpp b/src/sched/entry/entry.hpp
index 39cbaa8bb..24fb2ec38 100644
--- a/src/sched/entry/entry.hpp
+++ b/src/sched/entry/entry.hpp
@@ -19,10 +19,11 @@
 #include "common/datatype/datatype.hpp"
 #include "common/utils/utils.hpp"
 #include "sched/entry/postponed_fields.hpp"
+#include "internal_types.hpp"
 #include <chrono>
 #include <memory>
 
-typedef ccl_status_t (*ccl_sched_entry_function_t)(const void*);
+typedef ccl::status (*ccl_sched_entry_function_t)(const void*);
 
 class ccl_sched;
 
diff --git a/src/sched/entry/factory/chunked_entry_factory.cpp b/src/sched/entry/factory/chunked_entry_factory.cpp
index 9ea49eec6..44060dd17 100644
--- a/src/sched/entry/factory/chunked_entry_factory.cpp
+++ b/src/sched/entry/factory/chunked_entry_factory.cpp
@@ -20,7 +20,7 @@ void make_chunked_send_entry(ccl_sched* sched,
                              const ccl_buffer buf,
                              size_t cnt,
                              const ccl_datatype& dtype,
-                             size_t dst,
+                             int dst,
                              ccl_comm* comm) {
     CCL_CHUNKED_ENTRY_FUNCTION(
         "send",
@@ -34,7 +34,7 @@ void make_chunked_recv_entry(ccl_sched* sched,
                              const ccl_buffer buf,
                              size_t cnt,
                              const ccl_datatype& dtype,
-                             size_t src,
+                             int src,
                              ccl_comm* comm) {
     CCL_CHUNKED_ENTRY_FUNCTION(
         "recv",
@@ -50,7 +50,7 @@ void make_chunked_recv_reduce_entry(ccl_sched* sched,
                                     size_t* out_cnt,
                                     const ccl_datatype& dtype,
                                     ccl::reduction reduction_op,
-                                    size_t src,
+                                    int src,
                                     ccl_buffer comm_buf,
                                     ccl_comm* comm,
                                     ccl_recv_reduce_result_buf_type result_buf_type) {
@@ -75,7 +75,7 @@ void make_chunked_send_entry(std::vector<ccl_sched*>& scheds,
                              const ccl_buffer buf,
                              size_t cnt,
                              const ccl_datatype& dtype,
-                             size_t dst,
+                             int dst,
                              ccl_comm* comm) {
     CCL_CHUNKED_ENTRY_FUNCTION(
         "send",
@@ -90,7 +90,7 @@ void make_chunked_recv_entry(std::vector<ccl_sched*>& scheds,
                              const ccl_buffer buf,
                              size_t cnt,
                              const ccl_datatype& dtype,
-                             size_t src,
+                             int src,
                              ccl_comm* comm) {
     CCL_CHUNKED_ENTRY_FUNCTION(
         "recv",
diff --git a/src/sched/entry/factory/chunked_entry_factory.hpp b/src/sched/entry/factory/chunked_entry_factory.hpp
index 3b032cff3..5a5b90a07 100644
--- a/src/sched/entry/factory/chunked_entry_factory.hpp
+++ b/src/sched/entry/factory/chunked_entry_factory.hpp
@@ -54,14 +54,14 @@ void make_chunked_send_entry(ccl_sched* sched,
                              const ccl_buffer buf,
                              size_t cnt,
                              const ccl_datatype& dtype,
-                             size_t dst,
+                             int dst,
                              ccl_comm* comm);
 
 void make_chunked_recv_entry(ccl_sched* sched,
                              const ccl_buffer buf,
                              size_t cnt,
                              const ccl_datatype& dtype,
-                             size_t src,
+                             int src,
                              ccl_comm* comm);
 
 void make_chunked_recv_reduce_entry(
@@ -71,7 +71,7 @@ void make_chunked_recv_reduce_entry(
     size_t* out_cnt,
     const ccl_datatype& dtype,
     ccl::reduction reduction_op,
-    size_t src,
+    int src,
     ccl_buffer comm_buf,
     ccl_comm* comm,
     ccl_recv_reduce_result_buf_type result_buf_type = ccl_recv_reduce_local_buf);
@@ -82,7 +82,7 @@ void make_chunked_send_entry(std::vector<ccl_sched*>& scheds,
                              const ccl_buffer buf,
                              size_t cnt,
                              const ccl_datatype& dtype,
-                             size_t dst,
+                             int dst,
                              ccl_comm* comm);
 
 void make_chunked_recv_entry(std::vector<ccl_sched*>& scheds,
@@ -90,7 +90,7 @@ void make_chunked_recv_entry(std::vector<ccl_sched*>& scheds,
                              const ccl_buffer buf,
                              size_t cnt,
                              const ccl_datatype& dtype,
-                             size_t src,
+                             int src,
                              ccl_comm* comm);
 
 void make_chunked_copy_entry(std::vector<ccl_sched*>& scheds,
diff --git a/src/sched/entry/factory/entry_factory.hpp b/src/sched/entry/factory/entry_factory.hpp
index c0ab68df4..2d32e3b0c 100644
--- a/src/sched/entry/factory/entry_factory.hpp
+++ b/src/sched/entry/factory/entry_factory.hpp
@@ -44,8 +44,7 @@
 #include "sched/entry/coll/direct/reduce_scatter_entry.hpp"
 
 #ifdef CCL_ENABLE_SYCL
-#include "sched/entry/sycl_copy_device_to_host_entry.hpp"
-#include "sched/entry/sycl_copy_host_to_device_entry.hpp"
+#include "sched/entry/sycl_copy_entry.hpp"
 #endif /* CCL_ENABLE_SYCL */
 
 #include "sched/sched.hpp"
diff --git a/src/sched/entry/l0/l0_allgather_handles_entry.hpp b/src/sched/entry/l0/l0_allgather_handles_entry.hpp
index d6a9fdef8..ce50423e5 100644
--- a/src/sched/entry/l0/l0_allgather_handles_entry.hpp
+++ b/src/sched/entry/l0/l0_allgather_handles_entry.hpp
@@ -17,7 +17,7 @@
 
 #include <initializer_list>
 #include <iterator>
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/datatype/datatype.hpp"
 #include "comp/comp.hpp"
 #include "common/comm/l0/devices/devices_declaration.hpp"
@@ -62,6 +62,7 @@ class l0_allgather_handles_entry : public base_coll_entry {
                                std::shared_ptr<gpu_comm> comm,
                                std::shared_ptr<ccl::host_communicator> ccl_comm,
                                device_storage& global_device_storage,
+                               ccl_driver_context_ptr in_ctx,
                                std::vector<ccl_device::device_ipc_memory_handle>&& send_data)
             : base_coll_entry(sched),
               comm_addr(
@@ -73,7 +74,7 @@ class l0_allgather_handles_entry : public base_coll_entry {
     }
 
     void start() override {
-        size_t comm_size = ccl_communicator->size();
+        int comm_size = ccl_communicator->size();
         LOG_INFO(class_name(), " entry req ", &req, ", rank: ", comm_addr.to_string());
 
         // serialize data for native allgather algo
@@ -116,9 +117,13 @@ class l0_allgather_handles_entry : public base_coll_entry {
                  plain_recv_data.size());
 
         ccl::stream::impl_value_t empty{};
-        event = ccl_communicator->allgatherv_impl(
-            (char*)plain_send_data.data(), send_bytes, (char*)plain_recv_data.data(), recv_bytes,
-            empty, ccl::default_allgatherv_attr, {});
+        event = ccl_communicator->allgatherv_impl((int8_t*)plain_send_data.data(),
+                                                  send_bytes,
+                                                  (int8_t*)plain_recv_data.data(),
+                                                  recv_bytes,
+                                                  empty,
+                                                  ccl::default_allgatherv_attr,
+                                                  {});
         status = ccl_sched_entry_status_started;
 
         //TODO prepare foreign_device_ipc_mem_storage handles array
@@ -197,7 +202,7 @@ class l0_allgather_handles_entry : public base_coll_entry {
                               native::to_string(recv_ip_handle->get()));
 
                     // create IPC memory object & remember in shared storage
-                    
+
                     // TODO: resolve issue to provide ctx correctly
                     std::shared_ptr<ccl_context> ctx;
                     foreign_device_ipc_mem_storage[ipc_mem_owner].push_back(
@@ -297,7 +302,7 @@ class l0_allgather_handles_entry : public base_coll_entry {
     size_t cnt;
     ccl_datatype dtype;
 
-    ccl::communicator::coll_request_t event;
+    ccl::event event;
     atl_req_t req{};
 };
 } // namespace native
diff --git a/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp b/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp
index 3c2ead052..f488fd29a 100644
--- a/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_allgatherv_typed_entry.hpp
@@ -43,6 +43,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
     using base::status;
     using base::launch_args;
     using base::kernel_router;
+    using base::get_ctx;
     using kernel_main_typed = ring_allgatherv_kernel<native_type>;
     using kernel_ipc_typed = ring_allgatherv_ipc<native_type>;
 
@@ -68,6 +69,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
         ccl_sched* sched,
         std::shared_ptr<gpu_comm_impl> comm,
         specific_indexed_device_storage& available_devices,
+        ccl_driver_context_ptr in_ctx,
         const ccl_buffer send_buf,
         size_t send_count,
         ccl_buffer recv_buf,
@@ -75,28 +77,33 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
         std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
             : base(sched,
                    comm,
+                   in_ctx,
                    send_buf,
-                   ccl::native_type_info<native_type>::ccl_type_value,
+                   ccl::native_type_info<native_type>::dtype,
                    device_stream),
               // left_wrote_to_me_flag
               income_data_flag(parent_communicator->get_device()
                                    .template alloc_memory<income_data_flag_gpu_type>(
                                        1,
-                                       sizeof(income_data_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                                       sizeof(income_data_flag_gpu_type),
+                                       get_ctx())),
               // ready_to_recv_flag_arg
               ready_to_recv_flag(parent_communicator->get_device()
                                      .template alloc_memory<ready_to_recv_flag_gpu_type>(
                                          1,
-                                         sizeof(ready_to_recv_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                                         sizeof(ready_to_recv_flag_gpu_type),
+                                         get_ctx())),
               recv_counts_buf(parent_communicator->get_device()
                                   .template alloc_memory<recv_counts_typed_entry_type>(
                                       send_count,
-                                      sizeof(recv_counts_typed_entry_type), std::shared_ptr<ccl_context> { })),
+                                      sizeof(recv_counts_typed_entry_type),
+                                      get_ctx())),
 
               recv_offsets_buf(parent_communicator->get_device()
                                    .template alloc_memory<recv_offsets_typed_entry_type>(
                                        comm_addr.size,
-                                       sizeof(recv_offsets_typed_entry_type), std::shared_ptr<ccl_context> { }))
+                                       sizeof(recv_offsets_typed_entry_type),
+                                       get_ctx()))
 
     {
         // copy recv_buf into alloced recv_buf_entry
@@ -105,10 +112,10 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
         // same as parent_communicator->template
         //                    get_comm_data<base::get_topology(),
         //                    base::get_topology_class()>().size;
-        size_t local_topology_size = comm_addr.size;
+        int local_topology_size = comm_addr.size;
         std::vector<size_t> recv_offsets_v(local_topology_size, 0);
 
-        for (size_t idx = 0; idx < local_topology_size; idx++) {
+        for (int idx = 0; idx < local_topology_size; idx++) {
             if (idx > 0)
                 recv_offsets_v[idx] += recv_offsets_v[idx - 1] + recv_counts[idx - 1];
         }
@@ -116,16 +123,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
         recv_counts_buf.enqueue_write_sync(recv_counts, local_topology_size);
         recv_offsets_buf.enqueue_write_sync(recv_offsets_v);
 
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", cnt ",
-                  cnt_entry,
-                  ", rank: ",
-                  "local topology size:",
-                  local_topology_size,
-                  comm_addr.to_string());
-        size_t next_rank = (comm_addr.rank + 1) % comm_addr.size;
+        int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
             l0_allgatherv_typed_entry<native_type, gpu_comm_impl, topology>>(
             *this, next_rank, available_devices);
@@ -148,6 +146,16 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
             std::unique_lock<std::mutex> lock(global_mutex);
             registered_thread.insert(std::this_thread::get_id());
         }
+
+        // snapshot the current list_closed_epoch counter into list_closed_epoch_id
+        list_closed_epoch_id = list_closed_epoch.load();
+
+        ENTRY_LOG_DEBUG("Created, next_rank:",
+                        next_rank,
+                        " ,WaitCount: ",
+                        wait_count.load(),
+                        ", ListClosedEpoch: ",
+                        list_closed_epoch_id);
     }
 
     ~l0_allgatherv_typed_entry() {
@@ -169,14 +177,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
     }
 
     void start() override {
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", rank: ",
-                  comm_addr.to_string(),
-                  ", cnt ",
-                  cnt_entry);
-
+        ENTRY_LOG_DEBUG("Start entry, cnt ", cnt_entry);
         //Create base primitives
         base::start();
 
@@ -234,14 +235,14 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
         //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(3);
-        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), ctx));
-        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), ctx));
+        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
+        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
         return ret;
     }
 
 protected:
     bool finalize_entry() override {
-        LOG_TRACE("entry: ", class_name(), ", rank: ", comm_addr.to_string());
+        ENTRY_LOG_TRACE("Try to finalize");
         ccl_device& device = parent_communicator->get_device();
 
         kernel_main_typed& main_entry_function =
@@ -249,100 +250,131 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
                                                          topology,
                                                          ccl::device_topology_type::ring,
                                                          native_type>();
-        if ((*kernel_router)(main_entry_function)) {
-            ze_result_t result;
-            //TODO L0 Workaround
-            if (!is_kernel_added) {
-                std::unique_lock<std::mutex> lock(global_mutex);
-                exec_count++;
-                cur_index = exec_count;
-                result = zeCommandListAppendLaunchKernel(device.get_cmd_list(ctx).get(),
-                                                         main_entry_function.handle,
-                                                         &launch_args,
-                                                         nullptr,
-                                                         0,
-                                                         nullptr);
-                if (result != ZE_RESULT_SUCCESS) {
-                    LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
-                    throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
-                }
-                is_kernel_added = true;
+        if (!(*kernel_router)(main_entry_function)) {
+            return false;
+        }
 
-                LOG_DEBUG("entry: ",
-                          class_name(),
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          ". Kernel added: ",
-                          main_entry_function.to_string(),
-                          " in list");
-            }
+        auto& cmd_list = device.get_cmd_list(get_ctx());
+        ze_result_t result;
+        //TODO L0 Workaround
+        if (!is_kernel_added) {
+            std::unique_lock<std::mutex> lock(global_mutex);
+            exec_count++;
+            (void)cur_index;
+
+            kernel_bind_epoch_id = exec_count;
 
-            while (exec_count < registered_thread.size()) {
+            //L0 workaround: append the launch kernel while global_mutex is held
+            result = zeCommandListAppendLaunchKernel(
+                cmd_list.get(), main_entry_function.handle, &launch_args, nullptr, 0, nullptr);
+            if (result != ZE_RESULT_SUCCESS) {
+                LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
+                throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
             }
+            is_kernel_added = true;
 
-            //TODO L0 workaround
-            LOG_INFO("Check L0 Workaround: WaitCount: ",
-                     wait_count,
-                     ", ExecCount: ",
-                     exec_count,
-                     ", CurIndex: ",
-                     cur_index);
-            if (cur_index == wait_count /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
-                if (topology == ccl::group_split_type::cluster) {
-                    // TODO: implement process communicator case
-                    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + "TODO: implement process communicator case");
-                    // auto c = ccl::environment::instance().create_communicator();
-                    // if (c.rank() == 0) {
-                        // LOG_INFO("L0 Workaround: one device close list!!!",
-                        //          "WaitCount: ",
-                        //          wait_count,
-                        //          ", ExecCount: ",
-                        //          exec_count,
-                        //          ", CurIndex: ",
-                        //          cur_index);
-                        // result = zeCommandListClose(device.get_cmd_list().get());
-                        // if (result != ZE_RESULT_SUCCESS) {
-                        //     LOG_ERROR("zeCommandListClose failed, error: ",
-                        //               native::to_string(result));
-                        //     throw std::runtime_error("zeCommandListClose failed");
-                        // }
-                    // }
-                }
-                else {
-                    LOG_INFO("L0 Workaround: one device close list!!!",
-                             "WaitCount: ",
-                             wait_count,
-                             ", ExecCount: ",
-                             exec_count,
-                             ", CurIndex: ",
-                             cur_index);
-                    result = zeCommandListClose(device.get_cmd_list(ctx).get());
+            ENTRY_LOG_DEBUG("Append kernel successfully: ",
+                            main_entry_function.to_string(),
+                            " in list: ",
+                            cmd_list.get());
+        }
+
+        while (exec_count < registered_thread.size()) { //spin until exec_count catches up
+            ENTRY_LOG_TRACE("waiting thread counts, exec_count: ", exec_count);
+        }
+
+        //TODO L0 workaround
+        ENTRY_LOG_INFO("Check L0 Workaround: WaitCount: ",
+                       wait_count,
+                       ", ExecCount: ",
+                       exec_count,
+                       ", CurIndex: ",
+                       kernel_bind_epoch_id);
+
+        if (kernel_bind_epoch_id % wait_count ==
+            0 /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
+            if (topology == ccl::group_split_type::cluster) {
+                // TODO: implement process communicator case
+                throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                                     "TODO: implement process communicator case");
+                // auto c = ccl::environment::instance().create_communicator();
+                // if (c.rank() == 0) {
+                // LOG_INFO("L0 Workaround: one device close list!!!",
+                //          "WaitCount: ",
+                //          wait_count,
+                //          ", ExecCount: ",
+                //          exec_count,
+                //          ", CurIndex: ",
+                //          cur_index);
+                // result = zeCommandListClose(device.get_cmd_list().get());
+                // if (result != ZE_RESULT_SUCCESS) {
+                //     LOG_ERROR("zeCommandListClose failed, error: ",
+                //               native::to_string(result));
+                //     throw std::runtime_error("zeCommandListClose failed");
+                // }
+                // }
+            }
+            else {
+                ENTRY_LOG_INFO("L0 Workaround: one device close list!!!\n",
+                               "WaitCount: ",
+                               wait_count,
+                               ", ExecCount: ",
+                               exec_count,
+                               ", CurIndex: ",
+                               kernel_bind_epoch_id);
+
+                {
+                    std::unique_lock<std::mutex> lock(global_mutex);
+                    result = zeCommandListClose(cmd_list.get());
                     if (result != ZE_RESULT_SUCCESS) {
                         LOG_ERROR("zeCommandListClose failed, error: ", native::to_string(result));
                         throw std::runtime_error("zeCommandListClose failed");
                     }
-                }
 
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
-            }
-            else if (cur_index > wait_count) {
-                LOG_INFO("L0 Workaround: one device should close list before!!! ",
-                         "WaitCount: ",
-                         wait_count,
-                         ", ExecCount: ",
-                         exec_count,
-                         ", CurIndex: ",
-                         cur_index);
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
+                    auto queue_prop = ccl_device::get_default_queue_desc();
+                    auto& cmd_queue = device.get_cmd_queue(queue_prop, get_ctx());
+                    ENTRY_LOG_INFO("Execute list:",
+                                   cmd_list.get(),
+                                   ", queue: ",
+                                   cmd_queue.get(),
+                                   ", go to submit entry");
+                    ze_result_t ret = zeCommandQueueExecuteCommandLists(
+                        cmd_queue.get(), 1, cmd_list.get_ptr(), this->fence);
+                    if (ret != ZE_RESULT_SUCCESS) {
+                        throw ccl::exception(std::string("cannot execute command list, error: ") +
+                                             native::to_string(ret)); //symbolic name, not raw int
+                    }
+
+                    ret = zeFenceQueryStatus(this->fence);
+                    ENTRY_LOG_DEBUG("Fence query status: ",
+                                    native::to_string(ret),
+                                    ", queue: ",
+                                    cmd_queue.get());
+                }
             }
+
+            list_closed_epoch.fetch_add(1);
+            ENTRY_LOG_INFO("List closed:", cmd_list.get(), ", go to submit entry");
+            return true;
+        }
+        else if (kernel_bind_epoch_id > wait_count ||
+                 list_closed_epoch.load() != list_closed_epoch_id /* epoch changed */) {
+            ENTRY_LOG_INFO("L0 Workaround: one device should close list before!!! ",
+                           "WaitCount: ",
+                           wait_count,
+                           ", ExecCount: ",
+                           exec_count,
+                           ", CurIndex: ",
+                           kernel_bind_epoch_id);
+            ENTRY_LOG_INFO(
+                "Different entry closed the list:", cmd_list.get(), ", go to submit entry");
+            return true;
         }
         return false;
     }
 
     void dump_detail(std::stringstream& str) const override {
-        ccl_logger::format(str, class_name(), "TODO\n");
+        base::dump_detail(str);
     }
 
 private:
@@ -355,6 +387,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
     ccl_device::device_memory<recv_offsets_typed_entry_type> recv_offsets_buf;
     size_t cnt_entry;
     std::shared_ptr<ccl_context> ctx;
+
 public:
     bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
         //Check argument binding in kernels for next rank
@@ -364,13 +397,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
                                             typename kernel_main_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
             if (is_kernel_added) {
-                LOG_DEBUG("entry: ",
-                          class_name(),
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          ". Function: ",
-                          main_entry_function.to_string(),
-                          " - binded already");
+                ENTRY_LOG_TRACE("Function: ", main_entry_function.to_string(), " - binded already");
                 return true;
             }
 
@@ -384,27 +411,22 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
                     right_kernel
                         .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
 
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_output_buf.first,
-                      ", ",
-                      right_output_buf.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
+            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
+            ENTRY_LOG_TRACE("Args: \n{ ",
+                            right_output_buf.first,
+                            ", ",
+                            right_output_buf.second,
+                            "}\n",
+                            "{ ",
+                            right_income_data_flag_arg.first,
+                            ", ",
+                            right_income_data_flag_arg.second,
+                            "}\n",
+                            "{ ",
+                            right_ready_to_recv_flag_arg.first,
+                            ", ",
+                            right_ready_to_recv_flag_arg.second,
+                            "}\n");
 
             //TODO register argument for current device kernel: use array-version
             main_entry_function
@@ -414,15 +436,8 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
                     right_output_buf.second,
                     right_income_data_flag_arg.second,
                     right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_output_buf",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+
+            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
         }
         return is_right_kernel_ready;
     }
@@ -497,5 +512,7 @@ class l0_allgatherv_typed_entry : public base_gpu_entry<native_type,
         }
         return is_right_kernel_ready;
     }
+    size_t list_closed_epoch_id = 0; //list_closed_epoch value captured by this entry
+    size_t kernel_bind_epoch_id = 0; //exec_count value when this entry appended its kernel
 };
 } // namespace native
diff --git a/src/sched/entry/l0/l0_allreduce_typed_entry.hpp b/src/sched/entry/l0/l0_allreduce_typed_entry.hpp
index d57f8f6fa..4de6e6225 100644
--- a/src/sched/entry/l0/l0_allreduce_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_allreduce_typed_entry.hpp
@@ -27,6 +27,8 @@ static thread_local size_t cur_index = 0;
 static std::atomic<size_t> wait_count{};
 static std::set<std::thread::id> registered_thread;
 
+static std::atomic<size_t> list_closed_epoch{}; //incremented after an entry closes the list
+
 namespace native {
 template <class native_type, class gpu_comm_impl, ccl::group_split_type topology>
 class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
@@ -49,6 +51,8 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
     using base::status;
     using base::launch_args;
     using base::kernel_router;
+    using base::get_ctx;
+    using base::alloc_memory_wrap;
     using kernel_main_typed = ring_allreduce_kernel<native_type>;
     using kernel_ipc_typed = ring_allreduce_ipc<native_type>;
 
@@ -72,6 +76,7 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
         ccl_sched* sched,
         std::shared_ptr<gpu_comm_impl> comm,
         specific_indexed_device_storage& available_devices,
+        ccl_driver_context_ptr in_ctx,
         const ccl_buffer send_buf,
         ccl_buffer recv_buf,
         size_t cnt,
@@ -79,39 +84,36 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
         std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
             : base(sched,
                    comm,
+                   in_ctx,
                    send_buf,
-                   ccl::native_type_info<native_type>::ccl_type_value,
+                   ccl::native_type_info<native_type>::dtype,
                    device_stream),
 
-              temp_buffer(parent_communicator->get_device().template alloc_memory<native_type>(
-                  cnt,
-                  sizeof(native_type), std::shared_ptr<ccl_context> { })),
-              income_data_flag(parent_communicator->get_device()
-                                   .template alloc_memory<income_data_flag_gpu_type>(
-                                       1,
-                                       sizeof(income_data_flag_gpu_type), std::shared_ptr<ccl_context> { })),
-              ready_to_recv_flag(parent_communicator->get_device()
-                                     .template alloc_memory<ready_to_recv_flag_gpu_type>(
-                                         1,
-                                         sizeof(ready_to_recv_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+              temp_buffer(
+                  this->template alloc_memory_wrap(typename kernel_main_typed::tmp_recv_buf_arg{},
+                                                   parent_communicator,
+                                                   cnt,
+                                                   get_ctx())),
+              income_data_flag(this->template alloc_memory_wrap(
+                  typename kernel_main_typed::income_data_flag_arg{},
+                  parent_communicator,
+                  1,
+                  get_ctx())),
+              ready_to_recv_flag(this->template alloc_memory_wrap(
+                  typename kernel_main_typed::ready_to_recv_flag_arg{},
+                  parent_communicator,
+                  1,
+                  get_ctx())),
               local_barrier_flag(parent_communicator->get_device()
                                      .template alloc_memory<local_barrier_flag_gpu_type>(
                                          1,
-                                         sizeof(local_barrier_flag_gpu_type), std::shared_ptr<ccl_context> { })) {
+                                         sizeof(local_barrier_flag_gpu_type),
+                                         get_ctx())) {
         recv_buf_typed_entry = recv_buf;
         op_typed_entry = op;
         cnt_entry = cnt;
 
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", cnt ",
-                  cnt_entry,
-                  ", op ",
-                  (int)(op),
-                  ", rank: ",
-                  comm_addr.to_string());
-        size_t next_rank = (comm_addr.rank + 1) % comm_addr.size;
+        int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
             l0_allreduce_typed_entry<native_type, gpu_comm_impl, topology>>(
             *this, next_rank, available_devices);
@@ -134,6 +136,16 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
             std::unique_lock<std::mutex> lock(global_mutex);
             registered_thread.insert(std::this_thread::get_id());
         }
+
+        //snapshot the list_closed_epoch counter; finalize_entry() compares against this value
+        list_closed_epoch_id = list_closed_epoch.load();
+
+        ENTRY_LOG_DEBUG("Created, next_rank:",
+                        next_rank,
+                        " ,WaitCount: ",
+                        wait_count.load(),
+                        ", ListClosedEpoch: ",
+                        list_closed_epoch_id);
     }
 
     ~l0_allreduce_typed_entry() {
@@ -155,13 +167,7 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
     }
 
     void start() override {
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", rank: ",
-                  comm_addr.to_string(),
-                  ", cnt ",
-                  cnt_entry);
+        ENTRY_LOG_DEBUG("Start entry, cnt ", cnt_entry);
 
         //Create base primitives
         base::start();
@@ -221,15 +227,15 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
         //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(3);
-        ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), ctx));
-        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), ctx));
-        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), ctx));
+        ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx()));
+        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
+        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
         return ret;
     }
 
 protected:
     bool finalize_entry() override {
-        LOG_TRACE("entry: ", class_name(), ", rank: ", comm_addr.to_string());
+        ENTRY_LOG_TRACE("Try to finalize");
         ccl_device& device = parent_communicator->get_device();
 
         kernel_main_typed& main_entry_function =
@@ -237,105 +243,135 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
                                                          topology,
                                                          ccl::device_topology_type::ring,
                                                          native_type>();
-        if ((*kernel_router)(main_entry_function)) {
-            ze_result_t result;
-            //TODO L0 Workaround
-            if (!is_kernel_added) {
-                std::unique_lock<std::mutex> lock(global_mutex);
-                exec_count++;
-                cur_index = exec_count;
-                result = zeCommandListAppendLaunchKernel(device.get_cmd_list(ctx).get(),
-                                                         main_entry_function.handle,
-                                                         &launch_args,
-                                                         nullptr,
-                                                         0,
-                                                         nullptr);
-                if (result != ZE_RESULT_SUCCESS) {
-                    LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
-                    throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
-                }
-                is_kernel_added = true;
+        if (!(*kernel_router)(main_entry_function)) {
+            return false;
+        }
 
-                LOG_DEBUG("entry: ",
-                          class_name(),
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          ". Kernel added: ",
-                          main_entry_function.to_string(),
-                          " in list");
-            }
+        auto& cmd_list = device.get_cmd_list(get_ctx());
+        ze_result_t result;
+        //TODO L0 Workaround
+        if (!is_kernel_added) {
+            std::unique_lock<std::mutex> lock(global_mutex);
+            exec_count++;
+            (void)cur_index;
+
+            kernel_bind_epoch_id = exec_count;
 
-            while (exec_count < registered_thread.size()) {
+            //L0 workaround: append the launch kernel while global_mutex is held
+            result = zeCommandListAppendLaunchKernel(
+                cmd_list.get(), main_entry_function.handle, &launch_args, nullptr, 0, nullptr);
+            if (result != ZE_RESULT_SUCCESS) {
+                LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
+                throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
             }
+            is_kernel_added = true;
 
-            //TODO L0 workaround
-            LOG_INFO("Check L0 Workaround: WaitCount: ",
-                     wait_count,
-                     ", ExecCount: ",
-                     exec_count,
-                     ", CurIndex: ",
-                     cur_index);
-            if (cur_index == wait_count /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
-                if (topology == ccl::group_split_type::cluster) {
-                    // TODO: implement process communicator case
-                    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + "TODO: implement process communicator case");
-                    // auto c = ccl::environment::instance().create_communicator();
-                    // if (c.rank() == 0) {
-                        // LOG_INFO("L0 Workaround: one device close list!!!",
-                        //          "WaitCount: ",
-                        //          wait_count,
-                        //          ", ExecCount: ",
-                        //          exec_count,
-                        //          ", CurIndex: ",
-                        //          cur_index);
-                        // result = zeCommandListClose(device.get_cmd_list().get());
-                        // if (result != ZE_RESULT_SUCCESS) {
-                        //     LOG_ERROR("zeCommandListClose failed, error: ",
-                        //               native::to_string(result));
-                        //     throw std::runtime_error("zeCommandListClose failed");
-                        // }
-                    // }
-                }
-                else {
-                    LOG_INFO("L0 Workaround: one device close list!!!",
-                             "WaitCount: ",
-                             wait_count,
-                             ", ExecCount: ",
-                             exec_count,
-                             ", CurIndex: ",
-                             cur_index);
-                    result = zeCommandListClose(device.get_cmd_list(ctx).get());
+            ENTRY_LOG_DEBUG("Append kernel successfully: ",
+                            main_entry_function.to_string(),
+                            " in list: ",
+                            cmd_list.get());
+        }
+
+        while (exec_count < registered_thread.size()) { //spin until exec_count catches up
+            ENTRY_LOG_TRACE("waiting thread counts, exec_count: ", exec_count);
+        }
+
+        //TODO L0 workaround
+        ENTRY_LOG_INFO("Check L0 Workaround: WaitCount: ",
+                       wait_count,
+                       ", ExecCount: ",
+                       exec_count,
+                       ", CurIndex: ",
+                       kernel_bind_epoch_id);
+
+        if (kernel_bind_epoch_id % wait_count ==
+            0 /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
+            if (topology == ccl::group_split_type::cluster) {
+                // TODO: implement process communicator case
+                throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                                     "TODO: implement process communicator case");
+                // auto c = ccl::detail::environment::instance().create_communicator();
+                // if (c.rank() == 0) {
+                // LOG_INFO("L0 Workaround: one device close list!!!",
+                //          "WaitCount: ",
+                //          wait_count,
+                //          ", ExecCount: ",
+                //          exec_count,
+                //          ", CurIndex: ",
+                //          cur_index);
+                // result = zeCommandListClose(device.get_cmd_list().get());
+                // if (result != ZE_RESULT_SUCCESS) {
+                //     LOG_ERROR("zeCommandListClose failed, error: ",
+                //               native::to_string(result));
+                //     throw std::runtime_error("zeCommandListClose failed");
+                // }
+                // }
+            }
+            else {
+                ENTRY_LOG_INFO("L0 Workaround: one device close list!!!\n",
+                               "WaitCount: ",
+                               wait_count,
+                               ", ExecCount: ",
+                               exec_count,
+                               ", CurIndex: ",
+                               kernel_bind_epoch_id);
+
+                {
+                    std::unique_lock<std::mutex> lock(global_mutex);
+                    result = zeCommandListClose(cmd_list.get());
                     if (result != ZE_RESULT_SUCCESS) {
                         LOG_ERROR("zeCommandListClose failed, error: ", native::to_string(result));
                         throw std::runtime_error("zeCommandListClose failed");
                     }
-                }
 
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
-            }
-            else if (cur_index > wait_count) {
-                LOG_INFO("L0 Workaround: one device should close list before!!! ",
-                         "WaitCount: ",
-                         wait_count,
-                         ", ExecCount: ",
-                         exec_count,
-                         ", CurIndex: ",
-                         cur_index);
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
+                    auto queue_prop = ccl_device::get_default_queue_desc();
+                    auto& cmd_queue = device.get_cmd_queue(queue_prop, get_ctx());
+                    ENTRY_LOG_INFO("Execute list:",
+                                   cmd_list.get(),
+                                   ", queue: ",
+                                   cmd_queue.get(),
+                                   ", go to submit entry");
+                    ze_result_t ret = zeCommandQueueExecuteCommandLists(
+                        cmd_queue.get(), 1, cmd_list.get_ptr(), this->fence);
+                    if (ret != ZE_RESULT_SUCCESS) {
+                        throw ccl::exception(std::string("cannot execute command list, error: ") +
+                                             native::to_string(ret)); //symbolic name, not raw int
+                    }
+
+                    ret = zeFenceQueryStatus(this->fence);
+                    ENTRY_LOG_DEBUG("Fence query status: ",
+                                    native::to_string(ret),
+                                    ", queue: ",
+                                    cmd_queue.get());
+                }
             }
+
+            list_closed_epoch.fetch_add(1);
+            ENTRY_LOG_INFO("List closed:", cmd_list.get(), ", go to submit entry");
+            return true;
+        }
+        else if (kernel_bind_epoch_id > wait_count ||
+                 list_closed_epoch.load() != list_closed_epoch_id /* epoch changed */) {
+            ENTRY_LOG_INFO("L0 Workaround: one device should close list before!!! ",
+                           "WaitCount: ",
+                           wait_count,
+                           ", ExecCount: ",
+                           exec_count,
+                           ", CurIndex: ",
+                           kernel_bind_epoch_id);
+            ENTRY_LOG_INFO(
+                "Different entry closed the list:", cmd_list.get(), ", go to submit entry");
+            return true;
         }
         return false;
     }
 
     void dump_detail(std::stringstream& str) const override {
-        ccl_logger::format(str, class_name(), "TODO\n");
+        base::dump_detail(str);
     }
 
 private:
     bool is_kernel_added = false; //TODO L0 workaround - one dev close list
-    std::shared_ptr<ccl_context> ctx;
     ccl_device::device_memory<native_type> temp_buffer;
     ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag;
     ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag;
@@ -353,13 +389,7 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
                                             typename kernel_main_typed::ready_to_recv_flag_arg>();
         if (is_right_kernel_ready) {
             if (is_kernel_added) {
-                LOG_DEBUG("entry: ",
-                          class_name(),
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          ". Function: ",
-                          main_entry_function.to_string(),
-                          " - binded already");
+                ENTRY_LOG_TRACE("Function: ", main_entry_function.to_string(), " - binded already");
                 return true;
             }
 
@@ -373,27 +403,22 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
                     right_kernel
                         .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
 
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_tmp_recv_buf_arg.first,
-                      ", ",
-                      right_tmp_recv_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
+            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
+            ENTRY_LOG_TRACE("Args: \n{ ",
+                            right_tmp_recv_buf_arg.first,
+                            ", ",
+                            right_tmp_recv_buf_arg.second,
+                            "}\n",
+                            "{ ",
+                            right_income_data_flag_arg.first,
+                            ", ",
+                            right_income_data_flag_arg.second,
+                            "}\n",
+                            "{ ",
+                            right_ready_to_recv_flag_arg.first,
+                            ", ",
+                            right_ready_to_recv_flag_arg.second,
+                            "}\n");
 
             //TODO register argument for current device kernel: use array-version
             main_entry_function
@@ -403,17 +428,7 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
                     right_tmp_recv_buf_arg.second,
                     right_income_data_flag_arg.second,
                     right_ready_to_recv_flag_arg.second);
-
-            LOG_TRACE("Set right_tmp_recv_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
         }
         return is_right_kernel_ready;
     }
@@ -491,5 +506,7 @@ class l0_allreduce_typed_entry : public base_gpu_entry<native_type,
         }
         return is_right_kernel_ready;
     }
+    size_t list_closed_epoch_id = 0;
+    size_t kernel_bind_epoch_id = 0;
 };
 } // namespace native
diff --git a/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp b/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp
index 5ab3b6987..b6239c52d 100644
--- a/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_alltoallv_typed_entry.hpp
@@ -43,6 +43,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
     using base::status;
     using base::launch_args;
     using base::kernel_router;
+    using base::get_ctx;
     using kernel_main_typed = ring_alltoallv_kernel<native_type>;
     using kernel_ipc_typed = ring_alltoallv_ipc<native_type>;
 
@@ -77,6 +78,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
         ccl_sched* sched,
         std::shared_ptr<gpu_comm_impl> comm,
         specific_indexed_device_storage& available_devices,
+        ccl_driver_context_ptr in_ctx,
         const ccl_buffer send_buf,
         const size_t* send_counts,
         ccl_buffer recv_buf,
@@ -84,42 +86,51 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
         std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
             : base(sched,
                    comm,
+                   in_ctx,
                    send_buf,
-                   ccl::native_type_info<native_type>::ccl_type_value,
+                   ccl::native_type_info<native_type>::dtype,
                    device_stream),
               temp_buffer(parent_communicator->get_device().template alloc_memory<native_type>(
                   512,
-                  sizeof(native_type), std::shared_ptr<ccl_context> { })),
+                  sizeof(native_type),
+                  get_ctx())),
               // left_wrote_to_me_flag
               income_data_flag(parent_communicator->get_device()
                                    .template alloc_memory<income_data_flag_gpu_type>(
                                        1,
-                                       sizeof(income_data_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                                       sizeof(income_data_flag_gpu_type),
+                                       get_ctx())),
               // ready_to_recv_flag_arg
               ready_to_recv_flag(parent_communicator->get_device()
                                      .template alloc_memory<ready_to_recv_flag_gpu_type>(
                                          1,
-                                         sizeof(ready_to_recv_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                                         sizeof(ready_to_recv_flag_gpu_type),
+                                         get_ctx())),
               proxy_size_flag_entry(
                   parent_communicator->get_device().template alloc_memory<proxy_size_flag_gpu_type>(
                       1,
-                      sizeof(proxy_size_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                      sizeof(proxy_size_flag_gpu_type),
+                      get_ctx())),
               recv_counts_buf(parent_communicator->get_device()
                                   .template alloc_memory<recv_counts_typed_entry_type>(
                                       512,
-                                      sizeof(recv_counts_typed_entry_type), std::shared_ptr<ccl_context> { })),
+                                      sizeof(recv_counts_typed_entry_type),
+                                      get_ctx())),
               recv_offsets_buf(parent_communicator->get_device()
                                    .template alloc_memory<recv_offsets_typed_entry_type>(
                                        comm_addr.size,
-                                       sizeof(recv_offsets_typed_entry_type), std::shared_ptr<ccl_context> { })),
+                                       sizeof(recv_offsets_typed_entry_type),
+                                       get_ctx())),
               send_counts_buf(parent_communicator->get_device()
                                   .template alloc_memory<recv_counts_typed_entry_type>(
                                       512,
-                                      sizeof(recv_counts_typed_entry_type), std::shared_ptr<ccl_context> { })),
+                                      sizeof(recv_counts_typed_entry_type),
+                                      get_ctx())),
               send_offsets_buf(parent_communicator->get_device()
                                    .template alloc_memory<send_offsets_typed_entry_type>(
                                        comm_addr.size,
-                                       sizeof(send_offsets_typed_entry_type), std::shared_ptr<ccl_context> { }))
+                                       sizeof(send_offsets_typed_entry_type),
+                                       get_ctx()))
 
     {
         // copy recv_buf into recv_buf_entry
@@ -128,16 +139,16 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
         // same as parent_communicator->template
         //                    get_comm_data<base::get_topology(),
         //                    base::get_topology_class()>().size;
-        size_t local_topology_size = comm_addr.size;
+        int local_topology_size = comm_addr.size;
         std::vector<size_t> recv_offsets_v(local_topology_size, 0);
 
-        for (size_t idx = 0; idx < local_topology_size; idx++) {
+        for (int idx = 0; idx < local_topology_size; idx++) {
             if (idx > 0)
                 recv_offsets_v[idx] += recv_offsets_v[idx - 1] + recv_counts[idx - 1];
         }
 
         std::vector<size_t> send_offsets_v(local_topology_size, 0);
-        for (size_t idx = 0; idx < local_topology_size; idx++) {
+        for (int idx = 0; idx < local_topology_size; idx++) {
             if (idx > 0)
                 send_offsets_v[idx] += send_offsets_v[idx - 1] + send_counts[idx - 1];
         }
@@ -150,14 +161,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
         // flag
         proxy_size_flag_entry.enqueue_write_sync({ (int)0 });
 
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", rank: ",
-                  "local topology size:",
-                  local_topology_size,
-                  comm_addr.to_string());
-        size_t next_rank = (comm_addr.rank + 1) % comm_addr.size;
+        int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
             l0_alltoallv_typed_entry<native_type, gpu_comm_impl, topology>>(
             *this, next_rank, available_devices);
@@ -180,6 +184,16 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
             std::unique_lock<std::mutex> lock(global_mutex);
             registered_thread.insert(std::this_thread::get_id());
         }
+
+        //remember the current list_closed epoch index
+        list_closed_epoch_id = list_closed_epoch.load();
+
+        ENTRY_LOG_DEBUG("Created, next_rank:",
+                        next_rank,
+                        " ,WaitCount: ",
+                        wait_count.load(),
+                        ", ListClosedEpoch: ",
+                        list_closed_epoch_id);
     }
 
     ~l0_alltoallv_typed_entry() {
@@ -264,14 +278,14 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
         //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(3);
-        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), ctx));
-        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), ctx));
+        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
+        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
         return ret;
     }
 
 protected:
     bool finalize_entry() override {
-        LOG_TRACE("entry: ", class_name(), ", rank: ", comm_addr.to_string());
+        ENTRY_LOG_TRACE("Try to finalize");
         ccl_device& device = parent_communicator->get_device();
 
         kernel_main_typed& main_entry_function =
@@ -279,100 +293,131 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
                                                          topology,
                                                          ccl::device_topology_type::ring,
                                                          native_type>();
-        if ((*kernel_router)(main_entry_function)) {
-            ze_result_t result;
-            //TODO L0 Workaround
-            if (!is_kernel_added) {
-                std::unique_lock<std::mutex> lock(global_mutex);
-                exec_count++;
-                cur_index = exec_count;
-                result = zeCommandListAppendLaunchKernel(device.get_cmd_list(ctx).get(),
-                                                         main_entry_function.handle,
-                                                         &launch_args,
-                                                         nullptr,
-                                                         0,
-                                                         nullptr);
-                if (result != ZE_RESULT_SUCCESS) {
-                    LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
-                    throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
-                }
-                is_kernel_added = true;
+        if (!(*kernel_router)(main_entry_function)) {
+            return false;
+        }
 
-                LOG_DEBUG("entry: ",
-                          class_name(),
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          ". Kernel added: ",
-                          main_entry_function.to_string(),
-                          " in list");
-            }
+        auto& cmd_list = device.get_cmd_list(get_ctx());
+        ze_result_t result;
+        //TODO L0 Workaround
+        if (!is_kernel_added) {
+            std::unique_lock<std::mutex> lock(global_mutex);
+            exec_count++;
+            (void)cur_index;
+
+            kernel_bind_epoch_id = exec_count;
 
-            while (exec_count < registered_thread.size()) {
+            //L0 Workaround: kernel launch requires a critical section
+            result = zeCommandListAppendLaunchKernel(
+                cmd_list.get(), main_entry_function.handle, &launch_args, nullptr, 0, nullptr);
+            if (result != ZE_RESULT_SUCCESS) {
+                LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
+                throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
             }
+            is_kernel_added = true;
 
-            //TODO L0 workaround
-            LOG_INFO("Check L0 Workaround: WaitCount: ",
-                     wait_count,
-                     ", ExecCount: ",
-                     exec_count,
-                     ", CurIndex: ",
-                     cur_index);
-            if (cur_index == wait_count /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
-                if (topology == ccl::group_split_type::cluster) {
-                    // TODO: implement process communicator case
-                    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + "TODO: implement process communicator case");
-                    // auto c = ccl::environment::instance().create_communicator();
-                    // if (c.rank() == 0) {
-                        // LOG_INFO("L0 Workaround: one device close list!!!",
-                        //          "WaitCount: ",
-                        //          wait_count,
-                        //          ", ExecCount: ",
-                        //          exec_count,
-                        //          ", CurIndex: ",
-                        //          cur_index);
-                        // result = zeCommandListClose(device.get_cmd_list().get());
-                        // if (result != ZE_RESULT_SUCCESS) {
-                        //     LOG_ERROR("zeCommandListClose failed, error: ",
-                        //               native::to_string(result));
-                        //     throw std::runtime_error("zeCommandListClose failed");
-                        // }
-                    // }
-                }
-                else {
-                    LOG_INFO("L0 Workaround: one device close list!!!",
-                             "WaitCount: ",
-                             wait_count,
-                             ", ExecCount: ",
-                             exec_count,
-                             ", CurIndex: ",
-                             cur_index);
-                    result = zeCommandListClose(device.get_cmd_list(ctx).get());
+            ENTRY_LOG_DEBUG("Append kernel successfully: ",
+                            main_entry_function.to_string(),
+                            " in list: ",
+                            cmd_list.get());
+        }
+
+        while (exec_count < registered_thread.size()) {
+            ENTRY_LOG_TRACE("waiting thread counts, exec_cont: ", exec_count);
+        }
+
+        //TODO L0 workaround
+        ENTRY_LOG_INFO("Check L0 Workaround: WaitCount: ",
+                       wait_count,
+                       ", ExecCount: ",
+                       exec_count,
+                       ", CurIndex: ",
+                       kernel_bind_epoch_id);
+
+        if (kernel_bind_epoch_id % wait_count ==
+            0 /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
+            if (topology == ccl::group_split_type::cluster) {
+                // TODO: implement process communicator case
+                throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                                     "TODO: implement process communicator case");
+                // auto c = ccl::detail::environment::instance().create_communicator();
+                // if (c.rank() == 0) {
+                // LOG_INFO("L0 Workaround: one device close list!!!",
+                //          "WaitCount: ",
+                //          wait_count,
+                //          ", ExecCount: ",
+                //          exec_count,
+                //          ", CurIndex: ",
+                //          cur_index);
+                // result = zeCommandListClose(device.get_cmd_list().get());
+                // if (result != ZE_RESULT_SUCCESS) {
+                //     LOG_ERROR("zeCommandListClose failed, error: ",
+                //               native::to_string(result));
+                //     throw std::runtime_error("zeCommandListClose failed");
+                // }
+                // }
+            }
+            else {
+                ENTRY_LOG_INFO("L0 Workaround: one device close list!!!\n",
+                               "WaitCount: ",
+                               wait_count,
+                               ", ExecCount: ",
+                               exec_count,
+                               ", CurIndex: ",
+                               kernel_bind_epoch_id);
+
+                {
+                    std::unique_lock<std::mutex> lock(global_mutex);
+                    result = zeCommandListClose(cmd_list.get());
                     if (result != ZE_RESULT_SUCCESS) {
                         LOG_ERROR("zeCommandListClose failed, error: ", native::to_string(result));
                         throw std::runtime_error("zeCommandListClose failed");
                     }
-                }
 
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
-            }
-            else if (cur_index > wait_count) {
-                LOG_INFO("L0 Workaround: one device should close list before!!! ",
-                         "WaitCount: ",
-                         wait_count,
-                         ", ExecCount: ",
-                         exec_count,
-                         ", CurIndex: ",
-                         cur_index);
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
+                    auto queue_prop = ccl_device::get_default_queue_desc();
+                    auto& cmd_queue = device.get_cmd_queue(queue_prop, get_ctx());
+                    ENTRY_LOG_INFO("Execute list:",
+                                   cmd_list.get(),
+                                   ", queue: ",
+                                   cmd_queue.get(),
+                                   ", go to submit entry");
+                    ze_result_t ret = zeCommandQueueExecuteCommandLists(
+                        cmd_queue.get(), 1, cmd_list.get_ptr(), this->fence);
+                    if (ret != ZE_RESULT_SUCCESS) {
+                        throw ccl::exception(std::string("cannot execute command list, error: ") +
+                                             std::to_string(ret));
+                    }
+
+                    ret = zeFenceQueryStatus(this->fence);
+                    ENTRY_LOG_DEBUG("Fence query status: ",
+                                    native::to_string(ret),
+                                    ", queue: ",
+                                    cmd_queue.get());
+                }
             }
+
+            list_closed_epoch.fetch_add(1);
+            ENTRY_LOG_INFO("List closed:", cmd_list.get(), ", go to submit entry");
+            return true;
+        }
+        else if (kernel_bind_epoch_id > wait_count ||
+                 list_closed_epoch.load() != list_closed_epoch_id /* epoch changed */) {
+            ENTRY_LOG_INFO("L0 Workaround: one device should close list before!!! ",
+                           "WaitCount: ",
+                           wait_count,
+                           ", ExecCount: ",
+                           exec_count,
+                           ", CurIndex: ",
+                           kernel_bind_epoch_id);
+            ENTRY_LOG_INFO(
+                "Dirfferent entry closed the list:", cmd_list.get(), ", go to submit entry");
+            return true;
         }
         return false;
     }
 
     void dump_detail(std::stringstream& str) const override {
-        ccl_logger::format(str, class_name(), "TODO\n");
+        base::dump_detail(str);
     }
 
 private:
@@ -388,6 +433,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
     ccl_device::device_memory<send_counts_typed_entry_type> send_counts_buf;
     ccl_device::device_memory<send_offsets_typed_entry_type> send_offsets_buf;
     std::shared_ptr<ccl_context> ctx;
+
 public:
     bool execute(kernel_main_typed& main_entry_function, kernel_main_typed& right_kernel) {
         //Check argument binding in kernels for next rank
@@ -398,13 +444,7 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
                                             typename kernel_main_typed::proxy_size_flag_arg>();
         if (is_right_kernel_ready) {
             if (is_kernel_added) {
-                LOG_DEBUG("entry: ",
-                          class_name(),
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          ". Function: ",
-                          main_entry_function.to_string(),
-                          " - binded already");
+                ENTRY_LOG_TRACE("Function: ", main_entry_function.to_string(), " - binded already");
                 return true;
             }
 
@@ -421,27 +461,22 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
             typename kernel_main_typed::proxy_size_flag_arg::return_t right_proxy_size_flag_arg =
                 right_kernel.template get_arg<typename kernel_main_typed::proxy_size_flag_arg>();
 
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_tmp_recv_buf_arg.first,
-                      ", ",
-                      right_tmp_recv_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
+            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
+            ENTRY_LOG_TRACE("Args: \n{ ",
+                            right_tmp_recv_buf_arg.first,
+                            ", ",
+                            right_tmp_recv_buf_arg.second,
+                            "}\n",
+                            "{ ",
+                            right_income_data_flag_arg.first,
+                            ", ",
+                            right_income_data_flag_arg.second,
+                            "}\n",
+                            "{ ",
+                            right_ready_to_recv_flag_arg.first,
+                            ", ",
+                            right_ready_to_recv_flag_arg.second,
+                            "}\n");
 
             //TODO register argument for current device kernel: use array-version
             main_entry_function
@@ -453,20 +488,12 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
                     right_income_data_flag_arg.second,
                     right_ready_to_recv_flag_arg.second,
                     right_proxy_size_flag_arg.second);
-            LOG_TRACE("Set right_tmp_recv_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
         }
         return is_right_kernel_ready;
     }
 
-    /*bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
+    bool execute(kernel_main_typed& main_entry_function, kernel_ipc_typed& right_kernel) {
         //Check argument binding in kernels for next rank
         bool is_right_kernel_ready =
             right_kernel.template test_args< //typename kernel_ipc_typed::right_output_buf_arg,
@@ -535,6 +562,8 @@ class l0_alltoallv_typed_entry : public base_gpu_entry<native_type,
                       main_entry_function.to_string());
         }
         return is_right_kernel_ready;
-    }*/
+    }
+    size_t list_closed_epoch_id = 0;
+    size_t kernel_bind_epoch_id = 0;
 };
 } // namespace native
diff --git a/src/sched/entry/l0/l0_bcast_typed_entry.hpp b/src/sched/entry/l0/l0_bcast_typed_entry.hpp
index 34acc631f..e69126015 100644
--- a/src/sched/entry/l0/l0_bcast_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_bcast_typed_entry.hpp
@@ -42,6 +42,7 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
     using base::status;
     using base::launch_args;
     using base::kernel_router;
+    using base::get_ctx;
     using kernel_main_typed = ring_bcast_kernel<native_type>;
     using kernel_ipc_typed = ring_bcast_ipc<native_type>;
 
@@ -64,40 +65,37 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
     l0_bcast_typed_entry(ccl_sched* sched,
                          std::shared_ptr<gpu_comm_impl> comm,
                          specific_indexed_device_storage& available_devices,
+                         ccl_driver_context_ptr in_ctx,
                          ccl_buffer buf,
                          size_t cnt,
-                         size_t root,
+                         int root,
                          std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
             : base(sched,
                    comm,
+                   in_ctx,
                    buf,
-                   ccl::native_type_info<native_type>::ccl_type_value,
+                   ccl::native_type_info<native_type>::dtype,
                    device_stream),
 
               income_data_flag(parent_communicator->get_device()
                                    .template alloc_memory<income_data_flag_gpu_type>(
                                        1,
-                                       sizeof(income_data_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                                       sizeof(income_data_flag_gpu_type),
+                                       get_ctx())),
               ready_to_recv_flag(parent_communicator->get_device()
                                      .template alloc_memory<ready_to_recv_flag_gpu_type>(
                                          1,
-                                         sizeof(ready_to_recv_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                                         sizeof(ready_to_recv_flag_gpu_type),
+                                         get_ctx())),
               local_barrier_flag(parent_communicator->get_device()
                                      .template alloc_memory<local_barrier_flag_gpu_type>(
                                          1,
-                                         sizeof(local_barrier_flag_gpu_type), std::shared_ptr<ccl_context> { })) {
+                                         sizeof(local_barrier_flag_gpu_type),
+                                         get_ctx())) {
         root_typed_entry = root;
         cnt_entry = cnt;
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", cnt ",
-                  cnt_entry,
-                  ", root",
-                  root,
-                  ", rank: ",
-                  comm_addr.to_string());
-        size_t next_rank = (comm_addr.rank + 1) % comm_addr.size;
+
+        int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
             l0_bcast_typed_entry<native_type, gpu_comm_impl, topology>>(
             *this, next_rank, available_devices);
@@ -120,18 +118,22 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
             std::unique_lock<std::mutex> lock(global_mutex);
             registered_thread.insert(std::this_thread::get_id());
         }
+
+        //remember the current list_closed epoch index
+        list_closed_epoch_id = list_closed_epoch.load();
+
+        ENTRY_LOG_DEBUG("Created, next_rank:",
+                        next_rank,
+                        " ,WaitCount: ",
+                        wait_count.load(),
+                        ", ListClosedEpoch: ",
+                        list_closed_epoch_id);
     }
 
     ~l0_bcast_typed_entry() {}
 
     void start() override {
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", rank: ",
-                  comm_addr.to_string(),
-                  ", cnt ",
-                  cnt_entry);
+        ENTRY_LOG_DEBUG("Start entry, cnt ", cnt_entry);
 
         //Create base primitives
         base::start();
@@ -158,7 +160,7 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
         /* TRY To APPEND Kernel HERE!!! Not in update
          *
          * ze_result_t result = zeCommandListAppendLaunchKernel(exec_cmd_list->handle, main_entry_function.handle, &launch_args, nullptr, 0, nullptr);
-      
+
 
         / * result = zeCommandListClose(exec_cmd_list->handle);
         if(result != ZE_RESULT_SUCCESS)
@@ -182,14 +184,14 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
         //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(2);
-        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), ctx));
-        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), ctx));
+        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
+        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
         return ret;
     }
 
 protected:
     bool finalize_entry() override {
-        LOG_TRACE("entry: ", class_name(), ", rank: ", comm_addr.to_string());
+        ENTRY_LOG_TRACE("Try to finalize");
         ccl_device& device = parent_communicator->get_device();
 
         kernel_main_typed& main_entry_function =
@@ -197,100 +199,131 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
                                                          topology,
                                                          ccl::device_topology_type::ring,
                                                          native_type>();
-        if ((*kernel_router)(main_entry_function)) {
-            ze_result_t result;
-            //TODO L0 Workaround
-            if (!is_kernel_added) {
-                std::unique_lock<std::mutex> lock(global_mutex);
-                exec_count++;
-                cur_index = exec_count;
-                result = zeCommandListAppendLaunchKernel(device.get_cmd_list(ctx).get(),
-                                                         main_entry_function.handle,
-                                                         &launch_args,
-                                                         nullptr,
-                                                         0,
-                                                         nullptr);
-                if (result != ZE_RESULT_SUCCESS) {
-                    LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
-                    throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
-                }
-                is_kernel_added = true;
+        if (!(*kernel_router)(main_entry_function)) {
+            return false;
+        }
 
-                LOG_DEBUG("entry: ",
-                          class_name(),
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          ". Kernel added: ",
-                          main_entry_function.to_string(),
-                          " in list");
-            }
+        auto& cmd_list = device.get_cmd_list(get_ctx());
+        ze_result_t result;
+        //TODO L0 Workaround
+        if (!is_kernel_added) {
+            std::unique_lock<std::mutex> lock(global_mutex);
+            exec_count++;
+            (void)cur_index;
+
+            kernel_bind_epoch_id = exec_count;
 
-            while (exec_count < registered_thread.size()) {
+            //L0 Workaround launch kernel require critical section
+            result = zeCommandListAppendLaunchKernel(
+                cmd_list.get(), main_entry_function.handle, &launch_args, nullptr, 0, nullptr);
+            if (result != ZE_RESULT_SUCCESS) {
+                LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
+                throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
             }
+            is_kernel_added = true;
 
-            //TODO L0 workaround
-            LOG_INFO("Check L0 Workaround: WaitCount: ",
-                     wait_count,
-                     ", ExecCount: ",
-                     exec_count,
-                     ", CurIndex: ",
-                     cur_index);
-            if (cur_index == wait_count /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
-                if (topology == ccl::group_split_type::cluster) {
-                    // TODO: implement process communicator case
-                    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + "TODO: implement process communicator case");
-                    // auto c = ccl::environment::instance().create_communicator();
-                    // if (c.rank() == 0) {
-                        // LOG_INFO("L0 Workaround: one device close list!!!",
-                        //          "WaitCount: ",
-                        //          wait_count,
-                        //          ", ExecCount: ",
-                        //          exec_count,
-                        //          ", CurIndex: ",
-                        //          cur_index);
-                        // result = zeCommandListClose(device.get_cmd_list().get());
-                        // if (result != ZE_RESULT_SUCCESS) {
-                        //     LOG_ERROR("zeCommandListClose failed, error: ",
-                        //               native::to_string(result));
-                        //     throw std::runtime_error("zeCommandListClose failed");
-                        // }
-                    // }
-                }
-                else {
-                    LOG_INFO("L0 Workaround: one device close list!!!",
-                             "WaitCount: ",
-                             wait_count,
-                             ", ExecCount: ",
-                             exec_count,
-                             ", CurIndex: ",
-                             cur_index);
-                    result = zeCommandListClose(device.get_cmd_list(ctx).get());
+            ENTRY_LOG_DEBUG("Append kernel successfully: ",
+                            main_entry_function.to_string(),
+                            " in list: ",
+                            cmd_list.get());
+        }
+
+        while (exec_count < registered_thread.size()) {
+            ENTRY_LOG_TRACE("waiting thread counts, exec_count: ", exec_count);
+        }
+
+        //TODO L0 workaround
+        ENTRY_LOG_INFO("Check L0 Workaround: WaitCount: ",
+                       wait_count,
+                       ", ExecCount: ",
+                       exec_count,
+                       ", CurIndex: ",
+                       kernel_bind_epoch_id);
+
+        if (kernel_bind_epoch_id % wait_count ==
+            0 /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
+            if (topology == ccl::group_split_type::cluster) {
+                // TODO: implement process communicator case
+                throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                                     "TODO: implement process communicator case");
+                // auto c = ccl::detail::environment::instance().create_communicator();
+                // if (c.rank() == 0) {
+                // LOG_INFO("L0 Workaround: one device close list!!!",
+                //          "WaitCount: ",
+                //          wait_count,
+                //          ", ExecCount: ",
+                //          exec_count,
+                //          ", CurIndex: ",
+                //          cur_index);
+                // result = zeCommandListClose(device.get_cmd_list().get());
+                // if (result != ZE_RESULT_SUCCESS) {
+                //     LOG_ERROR("zeCommandListClose failed, error: ",
+                //               native::to_string(result));
+                //     throw std::runtime_error("zeCommandListClose failed");
+                // }
+                // }
+            }
+            else {
+                ENTRY_LOG_INFO("L0 Workaround: one device close list!!!\n",
+                               "WaitCount: ",
+                               wait_count,
+                               ", ExecCount: ",
+                               exec_count,
+                               ", CurIndex: ",
+                               kernel_bind_epoch_id);
+
+                {
+                    std::unique_lock<std::mutex> lock(global_mutex);
+                    result = zeCommandListClose(cmd_list.get());
                     if (result != ZE_RESULT_SUCCESS) {
                         LOG_ERROR("zeCommandListClose failed, error: ", native::to_string(result));
                         throw std::runtime_error("zeCommandListClose failed");
                     }
-                }
 
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
-            }
-            else if (cur_index > wait_count) {
-                LOG_INFO("L0 Workaround: one device should close list before!!! ",
-                         "WaitCount: ",
-                         wait_count,
-                         ", ExecCount: ",
-                         exec_count,
-                         ", CurIndex: ",
-                         cur_index);
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
+                    auto queue_prop = ccl_device::get_default_queue_desc();
+                    auto& cmd_queue = device.get_cmd_queue(queue_prop, get_ctx());
+                    ENTRY_LOG_INFO("Execute list:",
+                                   cmd_list.get(),
+                                   ", queue: ",
+                                   cmd_queue.get(),
+                                   ", go to submit entry");
+                    ze_result_t ret = zeCommandQueueExecuteCommandLists(
+                        cmd_queue.get(), 1, cmd_list.get_ptr(), this->fence);
+                    if (ret != ZE_RESULT_SUCCESS) {
+                        throw ccl::exception(std::string("cannot execute command list, error: ") +
+                                             native::to_string(ret));
+                    }
+
+                    ret = zeFenceQueryStatus(this->fence);
+                    ENTRY_LOG_DEBUG("Fence query status: ",
+                                    native::to_string(ret),
+                                    ", queue: ",
+                                    cmd_queue.get());
+                }
             }
+
+            list_closed_epoch.fetch_add(1);
+            ENTRY_LOG_INFO("List closed:", cmd_list.get(), ", go to submit entry");
+            return true;
+        }
+        else if (kernel_bind_epoch_id > wait_count ||
+                 list_closed_epoch.load() != list_closed_epoch_id /* epoch changed */) {
+            ENTRY_LOG_INFO("L0 Workaround: one device should close list before!!! ",
+                           "WaitCount: ",
+                           wait_count,
+                           ", ExecCount: ",
+                           exec_count,
+                           ", CurIndex: ",
+                           kernel_bind_epoch_id);
+            ENTRY_LOG_INFO(
+                "Different entry closed the list:", cmd_list.get(), ", go to submit entry");
+            return true;
         }
         return false;
     }
 
     void dump_detail(std::stringstream& str) const override {
-        ccl_logger::format(str, class_name(), "TODO\n");
+        base::dump_detail(str);
     }
 
 private:
@@ -299,7 +332,7 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
     ccl_device::device_memory<income_data_flag_gpu_type> income_data_flag;
     ccl_device::device_memory<ready_to_recv_flag_gpu_type> ready_to_recv_flag;
     ccl_device::device_memory<local_barrier_flag_gpu_type> local_barrier_flag;
-    size_t root_typed_entry;
+    int root_typed_entry;
     size_t cnt_entry;
     std::shared_ptr<ccl_context> ctx;
 
@@ -332,27 +365,22 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
                     right_kernel
                         .template get_arg<typename kernel_main_typed::ready_to_recv_flag_arg>();
 
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ", bind elapsed arguments for kernel: ",
-                      kernel_main_typed::name());
-            LOG_TRACE("Args: \n{ ",
-                      right_buf_arg.first,
-                      ", ",
-                      right_buf_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_income_data_flag_arg.first,
-                      ", ",
-                      right_income_data_flag_arg.second,
-                      "}\n",
-                      "{ ",
-                      right_ready_to_recv_flag_arg.first,
-                      ", ",
-                      right_ready_to_recv_flag_arg.second,
-                      "}\n");
+            ENTRY_LOG_DEBUG("Bind final arguments for kernel: ", kernel_main_typed::name());
+            ENTRY_LOG_TRACE("Args: \n{ ",
+                            right_buf_arg.first,
+                            ", ",
+                            right_buf_arg.second,
+                            "}\n",
+                            "{ ",
+                            right_income_data_flag_arg.first,
+                            ", ",
+                            right_income_data_flag_arg.second,
+                            "}\n",
+                            "{ ",
+                            right_ready_to_recv_flag_arg.first,
+                            ", ",
+                            right_ready_to_recv_flag_arg.second,
+                            "}\n");
 
             //TODO register argument for current device kernel: use array-version
             main_entry_function
@@ -362,16 +390,7 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
                     right_buf_arg.second,
                     right_income_data_flag_arg.second,
                     right_ready_to_recv_flag_arg.second);
-            LOG_TRACE("Set right_buf_arg",
-                      "Set right_income_data_flag_arg",
-                      "Set right_ready_to_recv_flag_arg");
-
-            LOG_DEBUG("entry: ",
-                      class_name(),
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      ". Function: ",
-                      main_entry_function.to_string());
+            ENTRY_LOG_DEBUG("Final Function: ", main_entry_function.to_string());
         }
         return is_right_kernel_ready;
     }
@@ -446,5 +465,7 @@ class l0_bcast_typed_entry : public base_gpu_entry<native_type,
         }
         return is_right_kernel_ready;
     }
+    size_t list_closed_epoch_id = 0;
+    size_t kernel_bind_epoch_id = 0;
 };
 } // namespace native
diff --git a/src/sched/entry/l0/l0_entry.hpp b/src/sched/entry/l0/l0_entry.hpp
index c496f64f5..e3a5d1d79 100644
--- a/src/sched/entry/l0/l0_entry.hpp
+++ b/src/sched/entry/l0/l0_entry.hpp
@@ -14,9 +14,11 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/datatype/datatype.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/native_device_api/l0/primitives.hpp"
+#include "common/comm/l0/modules/kernel_functions.hpp"
 
 #include "oneapi/ccl.hpp"
 
@@ -31,6 +33,54 @@
 #include <unistd.h>
 static std::mutex global_fence_mutex;
 
+// Logs at TRACE level, prefixing the message with this entry's dump_detail() text.
+// The message is only assembled when TRACE is enabled. Wrapped in do/while(0) so the
+// expansion is a single statement (safe in unbraced if/else); end each use with ';'.
+#define ENTRY_LOG_TRACE(...) \
+    do { \
+        if (unlikely(logger.get_log_level() >= ccl_log_level::TRACE)) { \
+            std::stringstream entry_log_ss; \
+            this->dump_detail(entry_log_ss); \
+            logger.trace("|TRACE| ", \
+                         basedir_static(__FILE__), \
+                         ":", __LINE__, "  ", \
+                         entry_log_ss.str(), \
+                         " - ", ##__VA_ARGS__); \
+        } \
+    } while (0)
+
+// Logs at DEBUG level, prefixing the message with this entry's dump_detail() text.
+// The message is only assembled when DEBUG is enabled. Wrapped in do/while(0) so the
+// expansion is a single statement (safe in unbraced if/else); end each use with ';'.
+#define ENTRY_LOG_DEBUG(...) \
+    do { \
+        if (unlikely(logger.get_log_level() >= ccl_log_level::DEBUG)) { \
+            std::stringstream entry_log_ss; \
+            this->dump_detail(entry_log_ss); \
+            logger.debug("|DEBUG| ", \
+                         basedir_static(__FILE__), \
+                         ":", __LINE__, "  ", \
+                         entry_log_ss.str(), \
+                         " - ", ##__VA_ARGS__); \
+        } \
+    } while (0)
+
+// Logs at INFO level, prefixing the message with this entry's dump_detail() text.
+// The message is only assembled when INFO is enabled. Wrapped in do/while(0) so the
+// expansion is a single statement (safe in unbraced if/else); end each use with ';'.
+#define ENTRY_LOG_INFO(...) \
+    do { \
+        if (unlikely(logger.get_log_level() >= ccl_log_level::INFO)) { \
+            std::stringstream entry_log_ss; \
+            this->dump_detail(entry_log_ss); \
+            logger.info("|INFO| ", \
+                        basedir_static(__FILE__), \
+                        ":", __LINE__, "  ", \
+                        entry_log_ss.str(), \
+                        " - ", ##__VA_ARGS__); \
+        } \
+    } while (0)
+
 namespace native {
 template <class native_type,
           class gpu_comm_impl,
@@ -45,11 +95,13 @@ class base_gpu_entry : public sched_entry {
         typename gpu_comm::template gpu_kernel_t<type_op, group_id, class_id, native_type>;
     // using kernel_ipc_typed = ring_allreduce_ipc<native_type>;
 
+    template <class elem_t>
+    using device_memory = memory<elem_t, ccl_device, ccl_context>;
+
     friend class ccl_gpu_comm;
     friend class ccl_virtual_gpu_comm;
     static constexpr const char *class_name() noexcept {
         return ccl_coll_type_to_str(type_op);
-        ;
     }
     static constexpr ccl_coll_type type() noexcept {
         return type_op;
@@ -66,6 +118,7 @@ class base_gpu_entry : public sched_entry {
     base_gpu_entry() = delete;
     base_gpu_entry(ccl_sched *sched,
                    std::shared_ptr<gpu_comm> comm,
+                   ccl_driver_context_ptr in_ctx,
                    const ccl_buffer send_buf,
                    ccl::datatype dtype_in,
                    std::shared_ptr<ccl_stream> &stream)
@@ -75,21 +128,25 @@ class base_gpu_entry : public sched_entry {
                             ->template get_comm_data<get_topology(), get_topology_class()>()),
               send_buf(send_buf),
               dtype(dtype_in),
-              device_stream(stream) {
-    }
+              device_stream(stream),
+              ctx(in_ctx) {}
 
     virtual ~base_gpu_entry() {}
 
     virtual void start() override {
-        LOG_DEBUG(class_name(), " entry req ", &req, ", rank: ", comm_addr.to_string());
-
         ccl_device &device = parent_communicator->get_device();
         {
-            LOG_DEBUG(class_name(), " entry req ", &req, " - create initial gpu primitives");
             //TODO make check, that device_stream belong to the device
             auto queue_prop = ccl_device::get_default_queue_desc();
             auto &cmd_queue = device.get_cmd_queue(queue_prop, ctx);
-            fence = device.create_or_get_fence(cmd_queue, ctx);
+            fence = device.get_fence(cmd_queue, ctx).get();
+
+            ENTRY_LOG_DEBUG("start base entry initialization, ctx: ",
+                            ctx.get(),
+                            ", queue: ",
+                            cmd_queue.get(),
+                            ", fence: ",
+                            fence);
         }
         //else
         //{
@@ -124,6 +181,7 @@ class base_gpu_entry : public sched_entry {
         //make sure, that kernel ready for launch
 
         status = ccl_sched_entry_status_started;
+        ENTRY_LOG_DEBUG("started");
     }
 
     bool submit_for_execution() {
@@ -133,14 +191,22 @@ class base_gpu_entry : public sched_entry {
             //if(std::is_same<gpu_comm_impl, ccl_gpu_comm>::value)
             if (gpu_comm_impl::type_idx() == ccl_gpu_comm::type_idx() or
                 gpu_comm_impl::type_idx() == ccl_ipc_source_gpu_comm<ccl_gpu_comm>::type_idx()) {
+                ccl_device &device = parent_communicator->get_device();
+                auto queue_prop = ccl_device::get_default_queue_desc();
+                auto &cmd_queue = device.get_cmd_queue(queue_prop, ctx);
+                auto &cmd_list = device.get_cmd_list(ctx);
+                ENTRY_LOG_DEBUG("Start submit for execution: main device: ",
+                                parent_communicator->to_string(),
+                                ", queue: ",
+                                cmd_queue.get(),
+                                ", list: ",
+                                cmd_list.get());
                 if (group_id == ccl::group_split_type::cluster) {
-                    //auto c = ccl::environment::instance().create_communicator();
+                    //auto c = ccl::detail::environment::instance().create_communicator();
                     //(void)c;
                     //if(c->rank() == 0)
                     {
                         // Execute command list in command queue
-                        ccl_device &device = parent_communicator->get_device();
-                        auto queue_prop = ccl_device::get_default_queue_desc();
                         //TODO SPECIAL FOR VIRTUAL
                         /*
                     if(std::is_same<gpu_comm, ccl_virtual_gpu_comm>::value)
@@ -148,21 +214,9 @@ class base_gpu_entry : public sched_entry {
                         queue_prop.ordinal = parent_communicator->get_rank(); //TODO SPECIAL FOR VIRTUAL
                     }
                     queue_prop.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;*/
-                        auto &cmd_queue = device.get_cmd_queue(queue_prop, ctx);
-
-                        LOG_DEBUG(class_name(),
-                                  " entry req ",
-                                  &req,
-                                  ", rank: ",
-                                  comm_addr.to_string(),
-                                  " - ready for execution on: ",
-                                  device.handle,
-                                  ", queue:",
-                                  cmd_queue.get(),
-                                  ", list: ",
-                                  device.get_cmd_list(ctx).get());
+
                         ze_result_t ret = zeCommandQueueExecuteCommandLists(
-                            cmd_queue.get(), 1, device.get_cmd_list(ctx).get_ptr(), fence);
+                            cmd_queue.get(), 1, cmd_list.get_ptr(), fence);
                         if (ret != ZE_RESULT_SUCCESS) {
                             throw ccl::exception(
                                 std::string("cannot execute command list, error: ") +
@@ -171,38 +225,18 @@ class base_gpu_entry : public sched_entry {
                     }
                 }
                 else {
-                    // Execute command list in command queue
-                    ccl_device &device = parent_communicator->get_device();
-                    auto queue_prop = ccl_device::get_default_queue_desc();
-                    //TODO SPECIAL FOR VIRTUAL
-                    /*
-            if(std::is_same<gpu_comm, ccl_virtual_gpu_comm>::value)
-            {
-                queue_prop.ordinal = parent_communicator->get_rank(); //TODO SPECIAL FOR VIRTUAL
-            }
-            queue_prop.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;*/
-                    auto &cmd_queue = device.get_cmd_queue(queue_prop, ctx);
-
-                    LOG_DEBUG(class_name(),
-                              " entry req ",
-                              &req,
-                              ", rank: ",
-                              comm_addr.to_string(),
-                              " - ready for execution on: ",
-                              device.handle,
-                              ", queue:",
-                              cmd_queue.get(),
-                              ", list: ",
-                              device.get_cmd_list(ctx).get());
+                    /*-S-
                     ze_result_t ret = zeCommandQueueExecuteCommandLists(
-                        cmd_queue.get(), 1, device.get_cmd_list(ctx).get_ptr(), fence);
+                        cmd_queue.get(), 1, cmd_list.get_ptr(), fence);
                     if (ret != ZE_RESULT_SUCCESS) {
                         throw ccl::exception(std::string("cannot execute command list, error: ") +
                                              std::to_string(ret));
                     }
+                    */
                 }
             }
         }
+        ENTRY_LOG_TRACE("submission result: ", ready_to_exec);
         return ready_to_exec;
     }
 
@@ -214,26 +248,15 @@ class base_gpu_entry : public sched_entry {
             //wait execution
             ccl_device &device = parent_communicator->get_device();
             auto queue_prop = ccl_device::get_default_queue_desc();
-            //TODO SPECIAL FOR VIRTUAL
-            /*
-            if(std::is_same<gpu_comm, ccl_virtual_gpu_comm>::value)
-            {
-                queue_prop.ordinal = parent_communicator->get_rank(); //TODO SPECIAL FOR VIRTUAL
-            }*/
-            //queue_prop.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
             auto &cmd_queue = device.get_cmd_queue(queue_prop, ctx);
 
-            LOG_TRACE(class_name(),
-                      " entry req ",
-                      &req,
-                      ", rank: ",
-                      comm_addr.to_string(),
-                      " waiting for finished execution, queue: ",
-                      cmd_queue.get());
+            ENTRY_LOG_TRACE(" waiting for finished execution, queue: ", cmd_queue.get());
             /* TODO fence!
             ze_result_t ret = zeCommandQueueSynchronize(cmd_queue.handle,
                                                           std::numeric_limits<uint32_t>::max());*/
             ze_result_t ret = zeFenceQueryStatus(fence);
+            ENTRY_LOG_TRACE(
+                "Fence query status: ", native::to_string(ret), ", queue: ", cmd_queue.get());
             if (ret != ZE_RESULT_SUCCESS) {
                 if (ret != ZE_RESULT_NOT_READY) {
                     //TODO L0 workaround: Virtual Device may execute this part before fence actually queued
@@ -243,12 +266,13 @@ class base_gpu_entry : public sched_entry {
                             ccl_ipc_source_gpu_comm<ccl_gpu_comm>::type_idx()) {
                         if (group_id == ccl::group_split_type::cluster) {
                             // TODO: implement process communicator case
-                            throw ccl::exception(std::string(__PRETTY_FUNCTION__) + "TODO: implement process communicator case");
-                            // auto c = ccl::environment::instance().create_communicator();
+                            throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                                                 "TODO: implement process communicator case");
+                            // auto c = ccl::detail::environment::instance().create_communicator();
                             // if (c.rank() == 0) {
-                                // throw ccl::exception(
-                                //     std::string("cannot sync queue from real device, error: ") +
-                                //     native::to_string(ret));
+                            // throw ccl::exception(
+                            //     std::string("cannot sync queue from real device, error: ") +
+                            //     native::to_string(ret));
                             // }
                         }
                         else {
@@ -265,22 +289,10 @@ class base_gpu_entry : public sched_entry {
                         }
                     }
                 }
-                LOG_TRACE(class_name(),
-                          " entry req ",
-                          &req,
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          " not completed yet, reason: ",
-                          native::to_string(ret));
             }
             else {
                 status = ccl_sched_entry_status_complete;
-                LOG_DEBUG(class_name(),
-                          " entry req ",
-                          &req,
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          " completed");
+                ENTRY_LOG_DEBUG(" Completed on queue: ", cmd_queue.get());
             }
         }
     }
@@ -296,33 +308,70 @@ class base_gpu_entry : public sched_entry {
 protected:
     virtual bool finalize_entry() = 0;
     virtual void dump_detail(std::stringstream &str) const override {
-        ccl_logger::format(str,
-                           class_name(),
-                           ", dt ",
-                           ccl::global_data::get().dtypes->name(dtype),
-                           ", send_buf ",
-                           send_buf,
-                           ", comm_id ",
-                           sched->coll_param.comm->id(),
-                           ", req ",
-                           &req,
-                           "\n");
+        ccl_logger::format(str, "{", name(), ", addr: ", comm_addr.to_string(), "}");
     }
 
 protected:
+    ccl_driver_context_ptr get_ctx() const {
+        return ctx;
+    }
+
+    // Default allocation path: forwards cnt, sizeof(element) and ctx to the communicator
+    // device's alloc_memory for the argument's pointee type. `arg` only drives deduction.
+    template <template <size_t pos, class Policy> class KernelArg, size_t POS, class POL>
+    device_memory<typename std::remove_pointer<typename KernelArg<POS, POL>::arg_type>::type>
+    alloc_memory_wrap(const KernelArg<POS, POL> &arg,
+                      std::shared_ptr<gpu_comm> parent_communicator,
+                      size_t cnt,
+                      std::shared_ptr<ccl_context> ctx) {
+        using elem_type =
+            typename std::remove_pointer<typename KernelArg<POS, POL>::arg_type>::type;
+        auto &target_device = parent_communicator->get_device();
+        auto allocated =
+            target_device.template alloc_memory<elem_type>(cnt, sizeof(elem_type), ctx);
+        LOG_DEBUG("Allocation memory by default: ", POS,
+                  ", ctx: ", (void *)ctx.get(),
+                  ", memory: ", (void *)allocated.get());
+        return allocated;
+    }
+
+    // Overload selected for kernel arguments using arg_access_policy_atomic_uncached:
+    // same call as the default path, but additionally passes an allocation descriptor
+    // carrying ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED. `arg` only drives overload choice.
+    template <template <size_t pos, class> class KernelArg, size_t POS, class Type, bool B>
+    device_memory<typename std::remove_pointer<
+        typename KernelArg<POS, arg_access_policy_atomic_uncached<POS, Type, B>>::arg_type>::type>
+    alloc_memory_wrap(const KernelArg<POS, arg_access_policy_atomic_uncached<POS, Type, B>> &arg,
+                      std::shared_ptr<gpu_comm> parent_communicator,
+                      size_t cnt,
+                      std::shared_ptr<ccl_context> ctx) {
+        using alloc_type = typename std::remove_pointer<
+            typename KernelArg<POS,
+                               arg_access_policy_atomic_uncached<POS, Type, B>>::arg_type>::type;
+        // Member-wise setup instead of designated initializers, which are not standard
+        // C++ before C++20; value-initialization zeroes any remaining fields.
+        ze_device_mem_alloc_desc_t mem_descr{};
+        mem_descr.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
+        mem_descr.pNext = nullptr;
+        mem_descr.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED;
+        mem_descr.ordinal = 0;
+        auto memory = parent_communicator->get_device().template alloc_memory<alloc_type>(
+            cnt, sizeof(alloc_type), ctx, mem_descr);
+        LOG_DEBUG("Allocation memory with bias uncached flag: ", POS,
+                  ", ctx: ", (void *)ctx.get(),
+                  ", memory: ", (void *)memory.get(),
+                  " mem_descr: ", native::to_string(mem_descr));
+        return memory;
+    }
+
     std::shared_ptr<gpu_comm> parent_communicator;
     topology_addr<group_id, class_id> comm_addr;
     ccl_buffer send_buf;
     ccl::datatype dtype;
     atl_req_t req{};
     std::shared_ptr<ccl_stream> device_stream;
-    std::shared_ptr<ccl_context> ctx;
     // GPU
     bool ready_to_exec = false;
-
-    //std::unique_ptr<ccl_device::device_cmd_list> copy_send_cmd_list;
-    //std::unique_ptr<ccl_device::device_cmd_list> copy_recv_cmd_list;
-    //std::unique_ptr<ccl_device::device_cmd_list> exec_cmd_list;
     ze_fence_handle_t fence;
 
     //TODO
@@ -331,7 +380,7 @@ class base_gpu_entry : public sched_entry {
     template <class executor>
     static std::unique_ptr<base_connector_interface<kernel_main_typed>>
     create_kernel_router_for_rank(executor &exec,
-                                  size_t next_rank,
+                                  int next_rank,
                                   specific_indexed_device_storage &group_devices) {
         std::unique_ptr<base_connector_interface<kernel_main_typed>> kernel_router;
         while (!kernel_router) {
@@ -477,6 +526,9 @@ class base_gpu_entry : public sched_entry {
     }
 
     std::unique_ptr<base_connector_interface<kernel_main_typed>> kernel_router;
+
+private:
+    ccl_driver_context_ptr ctx;
 };
 
 } // namespace native
diff --git a/src/sched/entry/l0/l0_reduce_typed_entry.hpp b/src/sched/entry/l0/l0_reduce_typed_entry.hpp
index eb913a8c5..7a5247f46 100644
--- a/src/sched/entry/l0/l0_reduce_typed_entry.hpp
+++ b/src/sched/entry/l0/l0_reduce_typed_entry.hpp
@@ -43,6 +43,7 @@ class l0_reduce_typed_entry : public base_gpu_entry<native_type,
     using base::status;
     using base::launch_args;
     using base::kernel_router;
+    using base::get_ctx;
     using kernel_main_typed = ring_reduce_kernel<native_type>;
     using kernel_ipc_typed = ring_reduce_ipc<native_type>;
 
@@ -65,47 +66,45 @@ class l0_reduce_typed_entry : public base_gpu_entry<native_type,
     l0_reduce_typed_entry(ccl_sched* sched,
                           std::shared_ptr<gpu_comm_impl> comm,
                           specific_indexed_device_storage& available_devices,
+                          ccl_driver_context_ptr in_ctx,
                           const ccl_buffer send_buf,
                           ccl_buffer recv_buf,
                           size_t cnt,
                           ccl::reduction op,
-                          size_t root,
+                          int root,
                           std::shared_ptr<ccl_stream> device_stream = std::shared_ptr<ccl_stream>())
             : base(sched,
                    comm,
+                   in_ctx,
                    send_buf,
-                   ccl::native_type_info<native_type>::ccl_type_value,
+                   ccl::native_type_info<native_type>::dtype,
                    device_stream),
 
               temp_buffer(parent_communicator->get_device().template alloc_memory<native_type>(
                   cnt,
-                  sizeof(native_type), std::shared_ptr<ccl_context> { })),
+                  sizeof(native_type),
+                  get_ctx())),
               income_data_flag(parent_communicator->get_device()
                                    .template alloc_memory<income_data_flag_gpu_type>(
                                        1,
-                                       sizeof(income_data_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                                       sizeof(income_data_flag_gpu_type),
+                                       get_ctx())),
               ready_to_recv_flag(parent_communicator->get_device()
                                      .template alloc_memory<ready_to_recv_flag_gpu_type>(
                                          1,
-                                         sizeof(ready_to_recv_flag_gpu_type), std::shared_ptr<ccl_context> { })),
+                                         sizeof(ready_to_recv_flag_gpu_type),
+                                         get_ctx())),
               local_barrier_flag(parent_communicator->get_device()
                                      .template alloc_memory<local_barrier_flag_gpu_type>(
                                          1,
-                                         sizeof(local_barrier_flag_gpu_type), std::shared_ptr<ccl_context> { })) {
+                                         sizeof(local_barrier_flag_gpu_type),
+                                         get_ctx())) {
         recv_buf_typed_entry = recv_buf;
         op_typed_entry = op;
         root_typed_entry = root;
         cnt_entry = cnt;
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", cnt ",
-                  cnt_entry,
-                  ", op ",
-                  (int)(op),
-                  ", rank: ",
-                  comm_addr.to_string());
-        size_t next_rank = (comm_addr.rank + 1) % comm_addr.size;
+
+        int next_rank = (comm_addr.rank + 1) % comm_addr.size;
         kernel_router = base::template create_kernel_router_for_rank<
             l0_reduce_typed_entry<native_type, gpu_comm_impl, topology>>(
             *this, next_rank, available_devices);
@@ -128,6 +127,16 @@ class l0_reduce_typed_entry : public base_gpu_entry<native_type,
             std::unique_lock<std::mutex> lock(global_mutex);
             registered_thread.insert(std::this_thread::get_id());
         }
+
+        //remember list_closed event index
+        list_closed_epoch_id = list_closed_epoch.load();
+
+        ENTRY_LOG_DEBUG("Created, next_rank:",
+                        next_rank,
+                        " ,WaitCount: ",
+                        wait_count.load(),
+                        ", ListClosedEpoch: ",
+                        list_closed_epoch_id);
     }
 
     ~l0_reduce_typed_entry() {
@@ -149,13 +158,7 @@ class l0_reduce_typed_entry : public base_gpu_entry<native_type,
     }
 
     void start() override {
-        LOG_DEBUG(class_name(),
-                  " entry req ",
-                  &req,
-                  ", rank: ",
-                  comm_addr.to_string(),
-                  ", cnt ",
-                  cnt_entry);
+        ENTRY_LOG_DEBUG("Start entry, cnt ", cnt_entry);
 
         //Create base primitives
         base::start();
@@ -216,15 +219,15 @@ class l0_reduce_typed_entry : public base_gpu_entry<native_type,
         //TODO
         std::vector<ccl_device::device_ipc_memory_handle> ret;
         ret.reserve(3);
-        ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(). ctx));
-        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), ctx));
-        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), ctx));
+        ret.push_back(owned_device.create_ipc_memory_handle(temp_buffer.get(), get_ctx()));
+        ret.push_back(owned_device.create_ipc_memory_handle(income_data_flag.get(), get_ctx()));
+        ret.push_back(owned_device.create_ipc_memory_handle(ready_to_recv_flag.get(), get_ctx()));
         return ret;
     }
 
 protected:
     bool finalize_entry() override {
-        LOG_TRACE("entry: ", class_name(), ", rank: ", comm_addr.to_string());
+        ENTRY_LOG_TRACE("Try to finalize");
         ccl_device& device = parent_communicator->get_device();
 
         kernel_main_typed& main_entry_function =
@@ -232,100 +235,131 @@ class l0_reduce_typed_entry : public base_gpu_entry<native_type,
                                                          topology,
                                                          ccl::device_topology_type::ring,
                                                          native_type>();
-        if ((*kernel_router)(main_entry_function)) {
-            ze_result_t result;
-            //TODO L0 Workaround
-            if (!is_kernel_added) {
-                std::unique_lock<std::mutex> lock(global_mutex);
-                exec_count++;
-                cur_index = exec_count;
-                result = zeCommandListAppendLaunchKernel(device.get_cmd_list(ctx).get(),
-                                                         main_entry_function.handle,
-                                                         &launch_args,
-                                                         nullptr,
-                                                         0,
-                                                         nullptr);
-                if (result != ZE_RESULT_SUCCESS) {
-                    LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
-                    throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
-                }
-                is_kernel_added = true;
+        if (!(*kernel_router)(main_entry_function)) {
+            return false;
+        }
 
-                LOG_DEBUG("entry: ",
-                          class_name(),
-                          ", rank: ",
-                          comm_addr.to_string(),
-                          ". Kernel added: ",
-                          main_entry_function.to_string(),
-                          " in list");
-            }
+        auto& cmd_list = device.get_cmd_list(get_ctx());
+        ze_result_t result;
+        //TODO L0 Workaround
+        if (!is_kernel_added) {
+            std::unique_lock<std::mutex> lock(global_mutex);
+            exec_count++;
+            (void)cur_index;
 
-            while (exec_count < registered_thread.size()) {
+            kernel_bind_epoch_id = exec_count;
+
+            //L0 Workaround launch kernel require critical section
+            result = zeCommandListAppendLaunchKernel(
+                cmd_list.get(), main_entry_function.handle, &launch_args, nullptr, 0, nullptr);
+            if (result != ZE_RESULT_SUCCESS) {
+                LOG_ERROR("zeCommandListAppendLaunchKernel failed, error: ", to_string(result));
+                throw std::runtime_error("zeCommandListAppendLaunchKernel failed");
             }
+            is_kernel_added = true;
 
-            //TODO L0 workaround
-            LOG_INFO("Check L0 Workaround: WaitCount: ",
-                     wait_count,
-                     ", ExecCount: ",
-                     exec_count,
-                     ", CurIndex: ",
-                     cur_index);
-            if (cur_index == wait_count /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
-                if (topology == ccl::group_split_type::cluster) {
-                    // TODO: implement process communicator case
-                    throw ccl::exception(std::string(__PRETTY_FUNCTION__) + "TODO: implement process communicator case");
-                    // auto c = ccl::environment::instance().create_communicator();
-                    // if (c.rank() == 0) {
-                        // LOG_INFO("L0 Workaround: one device close list!!!",
-                        //          "WaitCount: ",
-                        //          wait_count,
-                        //          ", ExecCount: ",
-                        //          exec_count,
-                        //          ", CurIndex: ",
-                        //          cur_index);
-                        // result = zeCommandListClose(device.get_cmd_list().get());
-                        // if (result != ZE_RESULT_SUCCESS) {
-                        //     LOG_ERROR("zeCommandListClose failed, error: ",
-                        //               native::to_string(result));
-                        //     throw std::runtime_error("zeCommandListClose failed");
-                        // }
-                    // }
-                }
-                else {
-                    LOG_INFO("L0 Workaround: one device close list!!!",
-                             "WaitCount: ",
-                             wait_count,
-                             ", ExecCount: ",
-                             exec_count,
-                             ", CurIndex: ",
-                             cur_index);
-                    result = zeCommandListClose(device.get_cmd_list(ctx).get());
+            ENTRY_LOG_DEBUG("Append kernel successfully: ",
+                            main_entry_function.to_string(),
+                            " in list: ",
+                            cmd_list.get());
+        }
+
+        while (exec_count < registered_thread.size()) {
+            ENTRY_LOG_TRACE("waiting thread counts, exec_cont: ", exec_count);
+        }
+
+        //TODO L0 workaround
+        ENTRY_LOG_INFO("Check L0 Workaround: WaitCount: ",
+                       wait_count,
+                       ", ExecCount: ",
+                       exec_count,
+                       ", CurIndex: ",
+                       kernel_bind_epoch_id);
+
+        if (kernel_bind_epoch_id % wait_count ==
+            0 /*std::is_same<gpu_comm_impl, ccl_gpu_comm>::value*/) {
+            if (topology == ccl::group_split_type::cluster) {
+                // TODO: implement process communicator case
+                throw ccl::exception(std::string(__PRETTY_FUNCTION__) +
+                                     "TODO: implement process communicator case");
+                // auto c = ccl::detail::environment::instance().create_communicator();
+                // if (c.rank() == 0) {
+                // LOG_INFO("L0 Workaround: one device close list!!!",
+                //          "WaitCount: ",
+                //          wait_count,
+                //          ", ExecCount: ",
+                //          exec_count,
+                //          ", CurIndex: ",
+                //          cur_index);
+                // result = zeCommandListClose(device.get_cmd_list().get());
+                // if (result != ZE_RESULT_SUCCESS) {
+                //     LOG_ERROR("zeCommandListClose failed, error: ",
+                //               native::to_string(result));
+                //     throw std::runtime_error("zeCommandListClose failed");
+                // }
+                // }
+            }
+            else {
+                ENTRY_LOG_INFO("L0 Workaround: one device close list!!!\n",
+                               "WaitCount: ",
+                               wait_count,
+                               ", ExecCount: ",
+                               exec_count,
+                               ", CurIndex: ",
+                               kernel_bind_epoch_id);
+
+                {
+                    std::unique_lock<std::mutex> lock(global_mutex);
+                    result = zeCommandListClose(cmd_list.get());
                     if (result != ZE_RESULT_SUCCESS) {
                         LOG_ERROR("zeCommandListClose failed, error: ", native::to_string(result));
                         throw std::runtime_error("zeCommandListClose failed");
                     }
-                }
 
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
-            }
-            else if (cur_index > wait_count) {
-                LOG_INFO("L0 Workaround: one device should close list before!!! ",
-                         "WaitCount: ",
-                         wait_count,
-                         ", ExecCount: ",
-                         exec_count,
-                         ", CurIndex: ",
-                         cur_index);
-                LOG_INFO("entry: ", class_name(), ", rank: ", comm_addr.to_string(), " finalized!");
-                return true;
+                    auto queue_prop = ccl_device::get_default_queue_desc();
+                    auto& cmd_queue = device.get_cmd_queue(queue_prop, get_ctx());
+                    ENTRY_LOG_INFO("Execute list:",
+                                   cmd_list.get(),
+                                   ", queue: ",
+                                   cmd_queue.get(),
+                                   ", go to submit entry");
+                    ze_result_t ret = zeCommandQueueExecuteCommandLists(
+                        cmd_queue.get(), 1, cmd_list.get_ptr(), this->fence);
+                    if (ret != ZE_RESULT_SUCCESS) { // format like the zeCommandListClose check
+                        throw ccl::exception(std::string("cannot execute command list, error: ") +
+                                             native::to_string(ret));
+                    }
+
+                    ret = zeFenceQueryStatus(this->fence);
+                    ENTRY_LOG_DEBUG("Fence query status: ",
+                                    native::to_string(ret),
+                                    ", queue: ",
+                                    cmd_queue.get());
+                }
             }
+
+            list_closed_epoch.fetch_add(1);
+            ENTRY_LOG_INFO("List closed:", cmd_list.get(), ", go to submit entry");
+            return true;
+        }
+        else if (kernel_bind_epoch_id > wait_count ||
+                 list_closed_epoch.load() != list_closed_epoch_id /* epoch changed */) {
+            ENTRY_LOG_INFO("L0 Workaround: one device should close list before!!! ",
+                           "WaitCount: ",
+                           wait_count,
+                           ", ExecCount: ",
+                           exec_count,
+                           ", CurIndex: ",
+                           kernel_bind_epoch_id);
+            ENTRY_LOG_INFO( // taken when another entry already closed the command list
+                "Different entry closed the list:", cmd_list.get(), ", go to submit entry");
+            return true;
         }
         return false;
     }
 
     void dump_detail(std::stringstream& str) const override {
-        ccl_logger::format(str, class_name(), "TODO\n");
+        base::dump_detail(str);
     }
 
 private:
@@ -337,7 +371,7 @@ class l0_reduce_typed_entry : public base_gpu_entry<native_type,
     ccl_device::device_memory<local_barrier_flag_gpu_type> local_barrier_flag;
     ccl::reduction op_typed_entry;
     ccl_buffer recv_buf_typed_entry;
-    size_t root_typed_entry;
+    int root_typed_entry;
     size_t cnt_entry;
     std::shared_ptr<ccl_context> ctx;
 
@@ -483,5 +517,7 @@ class l0_reduce_typed_entry : public base_gpu_entry<native_type,
         }
         return is_right_kernel_ready;
     }
+    size_t list_closed_epoch_id = 0;
+    size_t kernel_bind_epoch_id = 0;
 };
 } // namespace native
diff --git a/src/sched/entry/postponed_fields.hpp b/src/sched/entry/postponed_fields.hpp
index e4622665c..c2d62cc8e 100644
--- a/src/sched/entry/postponed_fields.hpp
+++ b/src/sched/entry/postponed_fields.hpp
@@ -19,9 +19,10 @@
 #include <set>
 #include <tuple>
 
-#include "oneapi/ccl/ccl_types.hpp"
+#include "oneapi/ccl/types.hpp"
 #include "common/log/log.hpp"
 #include "common/utils/tuple.hpp"
+#include "internal_types.hpp"
 
 enum ccl_sched_entry_field_id {
     ccl_sched_entry_field_buf,
@@ -41,7 +42,7 @@ enum ccl_sched_entry_field_id {
     ccl_sched_entry_field_send_count
 };
 
-typedef ccl_status_t (*ccl_sched_entry_field_function_t)(const void*, void*);
+using ccl_sched_entry_field_function_t = ccl::status (*)(const void*, void*);
 
 template <ccl_sched_entry_field_id id>
 using field_id_t = std::integral_constant<ccl_sched_entry_field_id, id>;
diff --git a/src/sched/entry/probe_entry.hpp b/src/sched/entry/probe_entry.hpp
index 7d37c6a7d..43a9367e4 100644
--- a/src/sched/entry/probe_entry.hpp
+++ b/src/sched/entry/probe_entry.hpp
@@ -25,14 +25,14 @@ class probe_entry : public sched_entry {
     }
 
     probe_entry() = delete;
-    probe_entry(ccl_sched* sched, size_t src, size_t* recv_len, ccl_comm* comm)
+    probe_entry(ccl_sched* sched, int src, size_t* recv_len, ccl_comm* comm)
             : sched_entry(sched),
               src(src),
               recv_len(recv_len),
               comm(comm) {}
 
     void start() override {
-        size_t global_src = comm->get_global_rank(src);
+        int global_src = comm->get_global_rank(src);
         atl_tag = comm->atl->tag->create(
             sched->get_comm_id(), global_src, sched->sched_id, sched->get_op_id());
         LOG_DEBUG("PROBE entry src ", src, ", tag ", atl_tag);
@@ -43,7 +43,7 @@ class probe_entry : public sched_entry {
         int found = 0;
         size_t len = 0;
 
-        size_t global_src = comm->get_global_rank(src);
+        int global_src = comm->get_global_rank(src);
 
         atl_status_t atl_status =
             comm->atl->atl_ep_probe(sched->bin->get_atl_ep(), global_src, atl_tag, &found, &len);
@@ -77,7 +77,7 @@ class probe_entry : public sched_entry {
     }
 
 private:
-    size_t src;
+    int src;
     size_t* recv_len;
     ccl_comm* comm;
     uint64_t atl_tag = 0;
diff --git a/src/sched/entry/recv_entry.hpp b/src/sched/entry/recv_entry.hpp
index b7463bfe4..a267dd3e4 100644
--- a/src/sched/entry/recv_entry.hpp
+++ b/src/sched/entry/recv_entry.hpp
@@ -33,7 +33,7 @@ class recv_entry : public sched_entry,
                ccl_buffer buf,
                size_t cnt,
                const ccl_datatype& dtype,
-               size_t src,
+               int src,
                ccl_comm* comm)
             : sched_entry(sched),
               buf(buf),
@@ -53,7 +53,7 @@ class recv_entry : public sched_entry,
     void start() override {
         update_fields();
 
-        size_t global_src = comm->get_global_rank(src);
+        int global_src = comm->get_global_rank(src);
         atl_tag = comm->atl->tag->create(
             sched->get_comm_id(), global_src, sched->sched_id, sched->get_op_id());
         size_t bytes = cnt * dtype.size();
@@ -118,7 +118,7 @@ class recv_entry : public sched_entry,
     ccl_buffer buf;
     size_t cnt;
     ccl_datatype dtype;
-    size_t src;
+    int src;
     ccl_comm* comm;
     uint64_t atl_tag = 0;
     atl_req_t req{};
diff --git a/src/sched/entry/recv_reduce_entry.hpp b/src/sched/entry/recv_reduce_entry.hpp
index 9b196152a..f93f9c1ab 100644
--- a/src/sched/entry/recv_reduce_entry.hpp
+++ b/src/sched/entry/recv_reduce_entry.hpp
@@ -37,7 +37,7 @@ class recv_reduce_entry final : public sched_entry {
                       size_t* out_cnt,
                       const ccl_datatype& dtype,
                       ccl::reduction reduction_op,
-                      size_t src,
+                      int src,
                       ccl_buffer comm_buf,
                       ccl_comm* comm,
                       ccl_recv_reduce_result_buf_type result_buf_type = ccl_recv_reduce_local_buf)
@@ -81,7 +81,7 @@ class recv_reduce_entry final : public sched_entry {
     }
 
     void start() override {
-        size_t global_src = comm->get_global_rank(src);
+        int global_src = comm->get_global_rank(src);
         atl_tag = comm->atl->tag->create(
             sched->get_comm_id(), global_src, sched->sched_id, sched->get_op_id());
         size_t bytes = in_cnt * dtype.size();
@@ -122,16 +122,16 @@ class recv_reduce_entry final : public sched_entry {
             ccl_buffer reduce_inout_buf =
                 (result_buf_type == ccl_recv_reduce_local_buf) ? inout_buf : comm_buf;
 
-            ccl_status_t comp_status = ccl_comp_reduce(reduce_in_buf.get_ptr(bytes),
-                                                       in_cnt,
-                                                       reduce_inout_buf.get_ptr(bytes),
-                                                       out_cnt,
-                                                       dtype,
-                                                       op,
-                                                       fn,
-                                                       &context);
+            ccl::status comp_status = ccl_comp_reduce(reduce_in_buf.get_ptr(bytes),
+                                                      in_cnt,
+                                                      reduce_inout_buf.get_ptr(bytes),
+                                                      out_cnt,
+                                                      dtype,
+                                                      op,
+                                                      fn,
+                                                      &context);
 
-            CCL_ASSERT(comp_status == ccl_status_success, "bad status ", comp_status);
+            CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
             status = ccl_sched_entry_status_complete;
             LOG_DEBUG("completed REDUCE in RECV_REDUCE entry");
         }
@@ -177,7 +177,7 @@ class recv_reduce_entry final : public sched_entry {
     size_t* out_cnt;
     ccl_datatype dtype;
     ccl::reduction op;
-    size_t src;
+    int src;
     ccl_buffer comm_buf;
     ccl_comm* comm;
     bool own_comm_buff = false;
diff --git a/src/sched/entry/reduce_local_entry.hpp b/src/sched/entry/reduce_local_entry.hpp
index 1d919e7d3..0eb133e35 100644
--- a/src/sched/entry/reduce_local_entry.hpp
+++ b/src/sched/entry/reduce_local_entry.hpp
@@ -48,15 +48,15 @@ class reduce_local_entry : public sched_entry {
         size_t bytes = in_cnt * dtype.size();
         size_t offset = inout_buf.get_offset();
         const ccl::fn_context context = { sched->coll_attr.match_id.c_str(), offset };
-        ccl_status_t comp_status = ccl_comp_reduce(in_buf.get_ptr(bytes),
-                                                   in_cnt,
-                                                   inout_buf.get_ptr(bytes),
-                                                   out_cnt,
-                                                   dtype,
-                                                   op,
-                                                   fn,
-                                                   &context);
-        CCL_ASSERT(comp_status == ccl_status_success, "bad status ", comp_status);
+        ccl::status comp_status = ccl_comp_reduce(in_buf.get_ptr(bytes),
+                                                  in_cnt,
+                                                  inout_buf.get_ptr(bytes),
+                                                  out_cnt,
+                                                  dtype,
+                                                  op,
+                                                  fn,
+                                                  &context);
+        CCL_ASSERT(comp_status == ccl::status::success, "bad status ", comp_status);
 
         status = ccl_sched_entry_status_complete;
     }
diff --git a/src/sched/entry/send_entry.hpp b/src/sched/entry/send_entry.hpp
index df3fcd3d3..5250a8c4f 100644
--- a/src/sched/entry/send_entry.hpp
+++ b/src/sched/entry/send_entry.hpp
@@ -33,7 +33,7 @@ class send_entry : public sched_entry,
                const ccl_buffer buf,
                size_t cnt,
                const ccl_datatype& dtype,
-               size_t dst,
+               int dst,
                ccl_comm* comm)
             : sched_entry(sched),
               buf(buf),
@@ -45,8 +45,8 @@ class send_entry : public sched_entry,
     void start() override {
         update_fields();
 
-        size_t global_dst = comm->get_global_rank(dst);
-        size_t global_rank = comm->get_global_rank(comm->rank());
+        int global_dst = comm->get_global_rank(dst);
+        int global_rank = comm->get_global_rank(comm->rank());
 
         atl_tag = comm->atl->tag->create(
             sched->get_comm_id(), global_rank, sched->sched_id, sched->get_op_id());
@@ -112,7 +112,7 @@ class send_entry : public sched_entry,
     ccl_buffer buf;
     size_t cnt;
     ccl_datatype dtype;
-    size_t dst;
+    int dst;
     ccl_comm* comm;
     uint64_t atl_tag = 0;
     atl_req_t req{};
diff --git a/src/sched/entry/sycl_copy_device_to_host_entry.hpp b/src/sched/entry/sycl_copy_entry.hpp
similarity index 53%
rename from src/sched/entry/sycl_copy_device_to_host_entry.hpp
rename to src/sched/entry/sycl_copy_entry.hpp
index b07c5f8c8..1495700b1 100644
--- a/src/sched/entry/sycl_copy_device_to_host_entry.hpp
+++ b/src/sched/entry/sycl_copy_entry.hpp
@@ -15,50 +15,47 @@
 */
 #pragma once
 
+#ifdef CCL_ENABLE_SYCL
+
 #include "sched/entry/entry.hpp"
 #include "sched/entry/sycl_entry_helper.hpp"
 
 #include <CL/sycl.hpp>
 
-class sycl_copy_device_to_host_entry : public sched_entry {
+template <sycl_copy_direction direction>
+class sycl_copy_entry : public sched_entry {
 public:
-    static constexpr const char* class_name() noexcept {
-        return "SYCL_COPY_D2H";
-    }
+    static constexpr const char* class_name() noexcept;
 
-    sycl_copy_device_to_host_entry() = delete;
-    sycl_copy_device_to_host_entry(ccl_sched* sched,
-                                   ccl_buffer in_buf,
-                                   ccl_buffer out_buf,
-                                   size_t cnt,
-                                   const ccl_datatype& dtype,
-                                   const ccl_stream* stream,
-                                   size_t offset = 0)
+    sycl_copy_entry() = delete;
+    sycl_copy_entry(ccl_sched* sched,
+                    ccl_buffer in_buf,
+                    ccl_buffer out_buf,
+                    size_t count,
+                    const ccl_datatype& dtype,
+                    const ccl_stream* stream,
+                    size_t offset = 0)
             : sched_entry(sched),
               in_buf(in_buf),
               out_buf(out_buf),
-              cnt(cnt),
+              count(count),
               dtype(dtype),
               stream(stream),
-              offset(offset) {}
+              offset(offset),
+              copier(sycl_copier<direction>(in_buf, out_buf, count, dtype, offset)) {}
 
     void start() override {
+        LOG_DEBUG(class_name(), ": in_buf ", in_buf, ", out_buf ", out_buf, ", count ", count);
 
-        // LOG_DEBUG(class_name(), ": in_buf ", in_buf, ", out_buf ", out_buf, ", cnt ", cnt);
-        // cl::sycl::usm::alloc usm_kind = get_pointer_type(in_buf, stream.get().get_context());
-        // CCL_THROW_IF_NOT(usm_kind == cl::sycl::usm::alloc::shared, "usm_kind should be shared");
-
-        //fill visitor with actual ccl_buffer data
-        auto visitor = make_reader_visitor<cl::sycl::access::mode::read>(
-            dtype, cnt, offset, in_buf, stream->get_native_stream(), std::ref(out_buf), [this](char* sycl_pointer, size_t bytes) {
-                (void)sycl_pointer;
-                (void)bytes;
-                (void)this;
-                // TODO remove function callback
-            });
+        copier.set_queue(stream->get_native_stream(sched->queue->get_idx()));
+        ccl_tuple_for_each_indexed<ccl_sycle_buffer_one_dim_types>(copier);
+        status = ccl_sched_entry_status_started;
+    }
 
-        ccl_tuple_for_each_indexed<ccl_sycle_buffer_one_dim_types>(visitor);
-        status = ccl_sched_entry_status_complete;
+    void update() override {
+        if (copier.is_completed()) {
+            status = ccl_sched_entry_status_complete;
+        }
     }
 
     const char* name() const override {
@@ -70,22 +67,37 @@ class sycl_copy_device_to_host_entry : public sched_entry {
         ccl_logger::format(str,
                            "  dtype ",
                            ccl::global_data::get().dtypes->name(dtype),
-                           ", cnt ",
-                           cnt,
+                           ", count ",
+                           count,
                            ", in_buf ",
                            in_buf,
                            ", out_buf ",
                            out_buf,
                            ", native_stream ",
                            stream->to_string(),
+                           ", offset ",
+                           offset,
                            "\n");
     }
 
 private:
     ccl_buffer in_buf;
     ccl_buffer out_buf;
-    size_t cnt;
+    size_t count;
     ccl_datatype dtype;
     const ccl_stream* stream;
     size_t offset;
+    sycl_copier<direction> copier;
 };
+
+template <>
+constexpr const char* sycl_copy_entry<sycl_copy_direction::d2h>::class_name() noexcept {
+    return "SYCL_COPY_D2H"; // name previously used by sycl_copy_device_to_host_entry
+}
+
+template <>
+constexpr const char* sycl_copy_entry<sycl_copy_direction::h2d>::class_name() noexcept {
+    return "SYCL_COPY_H2D"; // name previously used by sycl_copy_host_to_device_entry
+}
+
+#endif /* CCL_ENABLE_SYCL */
diff --git a/src/sched/entry/sycl_copy_host_to_device_entry.hpp b/src/sched/entry/sycl_copy_host_to_device_entry.hpp
deleted file mode 100644
index 59b6fbc43..000000000
--- a/src/sched/entry/sycl_copy_host_to_device_entry.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- Copyright 2016-2020 Intel Corporation
- 
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- 
-     http://www.apache.org/licenses/LICENSE-2.0
- 
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-#pragma once
-
-#include "sched/entry/entry.hpp"
-#include "sched/entry/sycl_entry_helper.hpp"
-
-#include <CL/sycl.hpp>
-
-class sycl_copy_host_to_device_entry : public sched_entry {
-public:
-    static constexpr const char* class_name() noexcept {
-        return "SYCL_COPY_H2D";
-    }
-
-    sycl_copy_host_to_device_entry() = delete;
-    sycl_copy_host_to_device_entry(ccl_sched* sched,
-                                   ccl_buffer in_buf,
-                                   ccl_buffer out_buf,
-                                   size_t cnt,
-                                   const ccl_datatype& dtype,
-                                   const ccl_stream* stream)
-            : sched_entry(sched),
-              in_buf(in_buf),
-              out_buf(out_buf),
-              cnt(cnt),
-              dtype(dtype),
-              stream(stream) {}
-
-    void start() override {
-
-        LOG_DEBUG(class_name(), "in_buf ", in_buf, ", out_buf ", out_buf, ", cnt ", cnt);
-
-        //fill visitor with actual ccl_buffer data
-
-
-        auto visitor = make_writer_visitor<cl::sycl::access::mode::write>(
-            dtype, cnt, 0, in_buf, stream->get_native_stream(), std::ref(out_buf), [this](void* sycl_pointer, size_t bytes) {
-                (void)this;
-                (void)sycl_pointer;
-                (void)bytes;
-                // TODO remove fucntion callback
-            });
-        ccl_tuple_for_each_indexed<ccl_sycle_buffer_one_dim_types>(visitor);
-
-
-        status = ccl_sched_entry_status_complete;
-    }
-
-    const char* name() const override {
-        return class_name();
-    }
-
-protected:
-    void dump_detail(std::stringstream& str) const override {
-        ccl_logger::format(str,
-                           "  dtype ",
-                           ccl::global_data::get().dtypes->name(dtype),
-                           ", cnt ",
-                           cnt,
-                           ", in_buf ",
-                           in_buf,
-                           ", out_buf ",
-                           out_buf,
-                           ", native_stream ",
-                           stream->to_string(),
-                           "\n");
-    }
-
-private:
-    ccl_buffer in_buf;
-    ccl_buffer out_buf;
-    size_t cnt;
-    ccl_datatype dtype;
-    const ccl_stream* stream;
-};
diff --git a/src/sched/entry/sycl_entry_helper.cpp b/src/sched/entry/sycl_entry_helper.cpp
new file mode 100644
index 000000000..d5beb2552
--- /dev/null
+++ b/src/sched/entry/sycl_entry_helper.cpp
@@ -0,0 +1,22 @@
+/*
+ Copyright 2016-2020 Intel Corporation
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+#include "sched/entry/sycl_entry_helper.hpp"
+
+/* converts a copy direction to its short name; "UNKNOWN" is the fallback given to choose() */
+std::string to_string(sycl_copy_direction val) {
+    using names_t = utils::enum_to_str<utils::enum_to_underlying(sycl_copy_direction::h2d) + 1>;
+    return names_t({ "D2H", "H2D" }).choose(val, "UNKNOWN");
+}
diff --git a/src/sched/entry/sycl_entry_helper.hpp b/src/sched/entry/sycl_entry_helper.hpp
index 0f63d1698..84b70179e 100644
--- a/src/sched/entry/sycl_entry_helper.hpp
+++ b/src/sched/entry/sycl_entry_helper.hpp
@@ -15,193 +15,158 @@
 */
 #pragma once
 
+#include "common/datatype/datatype.hpp"
+#include "common/global/global.hpp"
+#include "common/utils/buffer.hpp"
+#include "common/utils/enums.hpp"
 #include "common/utils/tuple.hpp"
+#include "oneapi/ccl/native_device_api/interop_utils.hpp"
 
-template <class Func, cl::sycl::access::mode access_mode>
-struct sycl_buffer_reader_visitor {
-    sycl_buffer_reader_visitor(const ccl_datatype& dtype,
-                        size_t cnt,
-                        size_t offset,
-                        const ccl_buffer& buf,
-                        cl::sycl::queue queue,
-                        ccl_buffer& dest_buf,
-                        Func f)
-            : requested_dtype(dtype),
-              requested_cnt(cnt),
-              requested_offset(offset),
-              requested_buf(buf),
-              q(queue),
-              to_buf(dest_buf),
-              callback(f) {}
+enum class sycl_copy_direction { d2h, h2d };
 
-    template <size_t index, class specific_sycl_buffer>
-    void invoke() {
-        if (index == (int)(requested_dtype.idx())) {
-            LOG_DEBUG("visitor matched index: ",
-                      index,
-                      ", ccl: ",
-                      ccl::global_data::get().dtypes->name(requested_dtype),
-                      ", in: ",
-                      __PRETTY_FUNCTION__);
+std::string to_string(sycl_copy_direction val);
 
-            size_t bytes = requested_cnt * requested_dtype.size();
-            auto out_buf_acc = static_cast<specific_sycl_buffer*>(requested_buf.get_ptr(bytes));
-            auto* dst_ptr = static_cast<typename specific_sycl_buffer::value_type*>(to_buf.get_ptr(bytes));
-            {
-                specific_sycl_buffer sycl_out(dst_ptr, requested_cnt);
-                LOG_DEBUG("requested_cnt: ",
-                      requested_cnt,
-                      ", requested_dtype.size(): ",
-                      requested_dtype.size(),
-                      ", requested_offset in bytes: ",
-                      requested_offset,
-                      ", bytes: ",
-                      bytes,
-                      ", src_buf.get_count(): ",
-                      out_buf_acc->get_count(),
-                      ", dst_buf.get_count(): ",
-                      sycl_out.get_count());
-                size_t offset = requested_offset / requested_dtype.size();
-                auto e = q.submit([&](cl::sycl::handler &cgh) {
-                    auto recv_buf_acc = sycl_out.template get_access<cl::sycl::access::mode::write>(cgh);
-                    auto in_buf_acc = out_buf_acc-> template get_access<cl::sycl::access::mode::read>(cgh);
-                    cgh.parallel_for<class sycl_copy_device_to_host_entry_kernel>(
-                                    cl::sycl::range<1>{ requested_cnt }, [=](cl::sycl::item<1> id) {
-                            recv_buf_acc[id] = in_buf_acc[id + offset];
-                    });
-                    });
-                    e.wait();
-            }
-        }
-        else {
-            LOG_TRACE("visitor skipped index: ",
-                      index,
-                      ", ccl: ",
-                      ccl::global_data::get().dtypes->name(requested_dtype),
-                      ", in: ",
-                      __PRETTY_FUNCTION__);
-        }
+#ifdef CCL_ENABLE_SYCL
+
+template <sycl_copy_direction direction>
+struct sycl_copier {
+    sycl_copier(ccl_buffer in_buf,
+                ccl_buffer out_buf,
+                size_t count,
+                const ccl_datatype& dtype,
+                size_t in_buf_offset)
+            : in_buf(in_buf),
+              out_buf(out_buf),
+              count(count),
+              dtype(dtype),
+              in_buf_offset(in_buf_offset) {}
+
+    bool is_completed() {
+        /* query the execution status of the event stored by the last copy submission */
+        const auto exec_status =
+            e.get_info<sycl::info::event::command_execution_status>();
+        return exec_status == sycl::info::event_command_status::complete;
+    }
-    const ccl_datatype& requested_dtype;
-    size_t requested_cnt;
-    size_t requested_offset;
-    const ccl_buffer& requested_buf;
-    cl::sycl::queue q;
-    ccl_buffer& to_buf;
-    Func callback;
-};
 
-template <cl::sycl::access::mode access_mode, class Func>
-sycl_buffer_reader_visitor<Func, access_mode> make_reader_visitor(const ccl_datatype& dtype,
-                                                    size_t cnt,
-                                                    size_t offset,
-                                                    const ccl_buffer& buf,
-                                                    cl::sycl::queue queue,
-                                                    ccl_buffer& dst,
-                                                    Func f) {
-    return sycl_buffer_reader_visitor<Func, access_mode>(dtype, cnt, offset, buf, queue, dst, f);
-}
-
-
-//
-template <class Func, cl::sycl::access::mode access_mode>
-struct sycl_buffer_writer_visitor {
-    sycl_buffer_writer_visitor(const ccl_datatype& dtype,
-                        size_t cnt,
-                        size_t offset,
-                        const ccl_buffer& in_buf,
-                        cl::sycl::queue queue,
-                        ccl_buffer& dest_buf,
-                        Func f)
-            : requested_dtype(dtype),
-              requested_cnt(cnt),
-              requested_offset(offset),
-              requested_buf(in_buf),
-              q(queue),
-              to_buf(dest_buf),
-              callback(f) {}
+    void set_queue(sycl::queue external_q) {
+        q = external_q;
+    }
 
     template <size_t index, class specific_sycl_buffer>
     void invoke() {
-        if (index == (int)(requested_dtype.idx())) {
+        if (index == (int)(dtype.idx())) {
             LOG_DEBUG("visitor matched index: ",
                       index,
                       ", ccl: ",
-                      ccl::global_data::get().dtypes->name(requested_dtype),
+                      ccl::global_data::get().dtypes->name(dtype),
                       ", in: ",
                       __PRETTY_FUNCTION__);
 
-            size_t bytes = requested_cnt * requested_dtype.size();
+            size_t bytes = count * dtype.size();
+
+            void* in_buf_ptr = in_buf.get_ptr(bytes);
+            void* out_buf_ptr = out_buf.get_ptr(bytes);
+
+            void* void_device_ptr =
+                (direction == sycl_copy_direction::h2d) ? out_buf_ptr : in_buf_ptr;
+
             /*
-            auto out_buf_acc = static_cast<specific_sycl_buffer*>(requested_buf.get_ptr(bytes))
-                                   ->template get_access<access_mode>();
-            void* out_pointer = out_buf_acc.get_pointer();
-            LOG_DEBUG("requested_cnt: ",
-                      requested_cnt,
-                      ", requested_dtype.size(): ",
-                      requested_dtype.size(),
-                      ", requested_offset: ",
-                      requested_offset,
-                      ", bytes: ",
-                      bytes,
-                      ", out_buf_acc.get_count(): ",
-                      out_buf_acc.get_count());
-            CCL_ASSERT(requested_cnt <= out_buf_acc.get_count());
-            callback((char*)out_pointer + requested_offset, bytes);
-            * */
-            auto in_buf_host = static_cast<typename specific_sycl_buffer::value_type*>(requested_buf.get_ptr(bytes));
-            auto* dst_ptr = static_cast<specific_sycl_buffer*>(to_buf.get_ptr(bytes));
-            //size_t total_bytes = requested_cnt * requested_dtype.size();
-            {
-                specific_sycl_buffer sycl_in(in_buf_host, requested_cnt);
-                LOG_DEBUG("requested_cnt: ",
-                      requested_cnt,
-                      ", requested_dtype.size(): ",
-                      requested_dtype.size(),
-                      ", requested_offset in bytes: ",
-                      requested_offset,
+              do not pass the typed device_ptr to the CCL logger or to error messages:
+              for char/int8_t it would be interpreted as a C string
+              and the logger would try to read the memory it points to;
+              print void_device_ptr instead
+            */
+            typename specific_sycl_buffer::value_type* device_ptr =
+                static_cast<typename specific_sycl_buffer::value_type*>(void_device_ptr);
+
+            auto device_ptr_type = cl::sycl::get_pointer_type(device_ptr, q.get_context());
+
+            CCL_THROW_IF_NOT((device_ptr_type == cl::sycl::usm::alloc::device ||
+                              device_ptr_type == cl::sycl::usm::alloc::unknown),
+                             "unexpected USM type ",
+                             native::detail::usm_to_string(device_ptr_type),
+                             " for device_ptr ",
+                             void_device_ptr);
+
+            specific_sycl_buffer* device_buf_ptr = nullptr;
+
+            if (device_ptr_type == cl::sycl::usm::alloc::device) {
+                /* do nothing, provided device USM pointer can be used as is in copy kernel */
+            }
+            else {
+                /* cast pointer into SYCL buffer */
+                device_buf_ptr = static_cast<specific_sycl_buffer*>(void_device_ptr);
+            }
+
+            LOG_DEBUG("count: ",
+                      count,
+                      ", in_buf_offset: ",
+                      in_buf_offset,
+                      ", dtype_size: ",
+                      dtype.size(),
                       ", bytes: ",
                       bytes,
-                      ", src_buf.get_count(): ",
-                      sycl_in.get_count(),
-                      ", dst_buf.get_count(): ",
-                      dst_ptr->get_count());
-                size_t offset = requested_offset;
-                auto e = q.submit([&](cl::sycl::handler &cgh) {
-                    auto send_buf_acc = sycl_in.template get_access<cl::sycl::access::mode::read>(cgh);
-                    auto out_buf_acc = dst_ptr-> template get_access<cl::sycl::access::mode::write>(cgh);
-                    cgh.parallel_for<class sycl_copy_device_to_host_entry_kernel>(
-                                    cl::sycl::range<1>{ requested_cnt }, [=](cl::sycl::item<1> id) {
-                            out_buf_acc[id] = send_buf_acc[id + offset];
-                    });
-                    });
-                    e.wait();
+                      ", direction: ",
+                      to_string(direction),
+                      ", in_buf_ptr: ",
+                      in_buf_ptr,
+                      ", out_buf_ptr: ",
+                      out_buf_ptr,
+                      ", device_ptr: ",
+                      void_device_ptr,
+                      ", is_device_usm: ",
+                      (device_buf_ptr) ? "no" : "yes",
+                      ", device_ptr usm_type: ",
+                      native::detail::usm_to_string(device_ptr_type));
+
+            size_t offset = in_buf_offset;
+
+            if (device_buf_ptr) {
+                specific_sycl_buffer host_buf(
+                    static_cast<typename specific_sycl_buffer::value_type*>(
+                        (direction == sycl_copy_direction::h2d) ? in_buf_ptr : out_buf_ptr),
+                    count,
+                    cl::sycl::property::buffer::use_host_ptr{});
+
+                e = q.submit([&](cl::sycl::handler& h) {
+                    auto& src_buf =
+                        (direction == sycl_copy_direction::h2d) ? host_buf : *device_buf_ptr;
+                    auto& dst_buf =
+                        (direction == sycl_copy_direction::h2d) ? *device_buf_ptr : host_buf;
+                    auto src_buf_acc =
+                        src_buf.template get_access<cl::sycl::access::mode::read>(h, count, offset);
+                    auto dst_buf_acc =
+                        dst_buf.template get_access<cl::sycl::access::mode::write>(h);
+                    h.copy(src_buf_acc, dst_buf_acc);
+                });
+            }
+            else {
+                e = q.memcpy(
+                    out_buf_ptr,
+                    static_cast<typename specific_sycl_buffer::value_type*>(in_buf_ptr) + offset,
+                    count * dtype.size());
+
+                /* TODO: remove explicit wait */
+                e.wait();
             }
         }
         else {
             LOG_TRACE("visitor skipped index: ",
                       index,
                       ", ccl: ",
-                      ccl::global_data::get().dtypes->name(requested_dtype),
+                      ccl::global_data::get().dtypes->name(dtype),
                       ", in: ",
                       __PRETTY_FUNCTION__);
         }
     }
-    const ccl_datatype& requested_dtype;
-    size_t requested_cnt;
-    size_t requested_offset;
-    const ccl_buffer& requested_buf;
+
+    ccl_buffer in_buf;
+    ccl_buffer out_buf;
+    size_t count;
+    const ccl_datatype& dtype;
     cl::sycl::queue q;
-    ccl_buffer& to_buf;
-    Func callback;
+    size_t in_buf_offset;
+    sycl::event e;
 };
-template <cl::sycl::access::mode access_mode, class Func>
-sycl_buffer_writer_visitor<Func, access_mode> make_writer_visitor(const ccl_datatype& dtype,
-                                                    size_t cnt,
-                                                    size_t offset,
-                                                    const ccl_buffer& in_buf,
-                                                    cl::sycl::queue queue,
-                                                    ccl_buffer& dst_buf,
-                                                    Func f) {
-    return sycl_buffer_writer_visitor<Func, access_mode>(dtype, cnt, offset, in_buf, queue, dst_buf, f);
-}
+
+#endif /* CCL_ENABLE_SYCL */
diff --git a/src/sched/entry/write_entry.hpp b/src/sched/entry/write_entry.hpp
index 2564d370d..443e32350 100644
--- a/src/sched/entry/write_entry.hpp
+++ b/src/sched/entry/write_entry.hpp
@@ -33,7 +33,7 @@ class write_entry : public sched_entry,
                 atl_mr_t* src_mr,
                 size_t cnt,
                 const ccl_datatype& dtype,
-                size_t dst,
+                int dst,
                 atl_mr_t* dst_mr,
                 size_t dst_buf_off,
                 ccl_comm* comm)
@@ -66,7 +66,7 @@ class write_entry : public sched_entry,
             return;
         }
 
-        size_t global_dst = comm->get_global_rank(dst);
+        int global_dst = comm->get_global_rank(dst);
 
         size_t bytes = cnt * dtype.size();
         atl_status_t atl_status = comm->atl->atl_ep_write(sched->bin->get_atl_ep(),
@@ -136,7 +136,7 @@ class write_entry : public sched_entry,
     atl_mr_t* src_mr;
     size_t cnt;
     ccl_datatype dtype;
-    size_t dst;
+    int dst;
     atl_mr_t* dst_mr;
     size_t dst_buf_off;
     ccl_comm* comm;
diff --git a/src/sched/extra_sched.cpp b/src/sched/extra_sched.cpp
index 92ad5d01a..7a0d7e673 100644
--- a/src/sched/extra_sched.cpp
+++ b/src/sched/extra_sched.cpp
@@ -30,21 +30,21 @@ void ccl_extra_sched::dump(std::ostream& out) const {
                        ", num_entries: ",
                        entries.size(),
                        "\n");
-    
+
     std::stringstream msg;
     for (size_t i = 0; i < entries.size(); ++i) {
         entries[i]->dump(msg, i);
     }
     out << msg.str();
 #ifdef ENABLE_TIMERS
-    ccl_logger::format(out,
-                       "\nlife time [us] ",
-                       std::setw(5),
-                       std::setbase(10),
-                       std::chrono::duration_cast<std::chrono::microseconds>(
-                          exec_complete_time - exec_start_time)
-                          .count(),
-                       "\n");
+    ccl_logger::format(
+        out,
+        "\nlife time [us] ",
+        std::setw(5),
+        std::setbase(10),
+        std::chrono::duration_cast<std::chrono::microseconds>(exec_complete_time - exec_start_time)
+            .count(),
+        "\n");
 #endif
 
     ccl_logger::format(out, "--------------------------------\n");
diff --git a/src/sched/master_sched.cpp b/src/sched/master_sched.cpp
index 3a85c76e5..b93e44e91 100644
--- a/src/sched/master_sched.cpp
+++ b/src/sched/master_sched.cpp
@@ -164,6 +164,8 @@ ccl_master_sched::ccl_master_sched_ptr ccl_master_sched::create(const ccl_coll_p
     CCL_THROW_IF_NOT(param.ctype == ccl_coll_allgatherv || !(attr.vector_buf),
                      "vector buffer is supported for allgatherv only");
 
+    CCL_THROW_IF_NOT(param.dtype.idx() != ccl::datatype::float16, "FP16 is unsupported yet");
+
     if (param.ctype == ccl_coll_sparse_allreduce) {
         CCL_THROW_IF_NOT(
             ccl::global_data::env().sparse_allreduce_algo_raw != "mask" || !(attr.reduction_fn),
diff --git a/src/sched/queue/queue.cpp b/src/sched/queue/queue.cpp
index f34582adc..1a9a4839d 100644
--- a/src/sched/queue/queue.cpp
+++ b/src/sched/queue/queue.cpp
@@ -16,6 +16,10 @@
 #include "common/global/global.hpp"
 #include "sched/queue/queue.hpp"
 
+size_t ccl_sched_queue::get_idx() const {
+    return idx;
+}
+
 void ccl_sched_bin::add(ccl_sched* sched) {
     if (ccl::global_data::env().priority_mode != ccl_priority_none) {
         CCL_ASSERT(sched->coll_attr.priority == priority,
@@ -51,8 +55,15 @@ size_t ccl_sched_bin::erase(size_t idx, size_t& next_idx) {
     return size;
 }
 
-ccl_sched_queue::ccl_sched_queue(std::vector<size_t> atl_eps) : atl_eps(atl_eps) {
-    LOG_DEBUG("created sched_queue, atl_eps count ", atl_eps.size(), ", atl_eps[0] ", atl_eps[0]);
+ccl_sched_queue::ccl_sched_queue(size_t idx, std::vector<size_t> atl_eps)
+        : idx(idx),
+          atl_eps(atl_eps) {
+    LOG_DEBUG("created sched_queue, idx ",
+              idx,
+              ", atl_eps count ",
+              atl_eps.size(),
+              ", atl_eps[0] ",
+              atl_eps[0]);
 
     if (ccl::global_data::env().priority_mode != ccl_priority_none) {
         CCL_ASSERT(atl_eps.size() == CCL_PRIORITY_BUCKET_COUNT,
diff --git a/src/sched/queue/queue.hpp b/src/sched/queue/queue.hpp
index bb0d77144..010453b48 100644
--- a/src/sched/queue/queue.hpp
+++ b/src/sched/queue/queue.hpp
@@ -193,13 +193,15 @@ class ccl_sched_bin {
 
 class ccl_sched_queue {
 public:
-    ccl_sched_queue(std::vector<size_t> atl_eps);
+    ccl_sched_queue(size_t idx, std::vector<size_t> atl_eps);
 
     ccl_sched_queue() = delete;
     ccl_sched_queue(const ccl_sched_queue& other) = delete;
     ccl_sched_queue& operator=(const ccl_sched_queue& other) = delete;
     ~ccl_sched_queue();
 
+    size_t get_idx() const;
+
     void add(ccl_sched* sched);
     size_t erase(ccl_sched_bin* bin, size_t idx);
     void clear();
@@ -230,6 +232,7 @@ class ccl_sched_queue {
 private:
     mutable sched_queue_lock_t bins_guard{};
 
+    size_t idx;
     std::vector<size_t> atl_eps;
     sched_bin_list_t bins{ CCL_SCHED_QUEUE_INITIAL_BIN_COUNT };
     size_t max_priority = 0;
diff --git a/src/sched/queue/strict_queue.cpp b/src/sched/queue/strict_queue.cpp
index a0be6b6bb..25d8c852c 100644
--- a/src/sched/queue/strict_queue.cpp
+++ b/src/sched/queue/strict_queue.cpp
@@ -18,7 +18,7 @@
 void ccl_strict_sched_queue::add(ccl_sched* sched) {
     CCL_ASSERT(sched);
     CCL_ASSERT(!sched->bin, "sched ", sched, ", bin ", sched->bin);
-    CCL_ASSERT(sched->strict_start_order);
+    CCL_ASSERT(sched->strict_order);
 
     std::lock_guard<sched_queue_lock_t> lock{ queue_guard };
     queue.push_back(sched);
diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp
index ea0c301d8..52f340544 100644
--- a/src/sched/sched.cpp
+++ b/src/sched/sched.cpp
@@ -21,6 +21,12 @@
 #include "sched/queue/queue.hpp"
 #include "sched/sched.hpp"
 
+ccl_sched::ccl_sched(const ccl_coll_param& coll_param, ccl_request* master_request)
+        : ccl_sched_base(coll_param),
+          strict_order(ccl::global_data::env().enable_strict_order) {
+    req = master_request;
+}
+
 ccl_sched::~ccl_sched() {
     if (in_bin_status == ccl_sched_in_bin_added)
         LOG_DEBUG("in_bin_status == ccl_sched_in_bin_added");
diff --git a/src/sched/sched.hpp b/src/sched/sched.hpp
index b3de9990b..d8f700cd9 100644
--- a/src/sched/sched.hpp
+++ b/src/sched/sched.hpp
@@ -15,6 +15,7 @@
 */
 #pragma once
 #include "sched/sched_base.hpp"
+#include "internal_types.hpp"
 
 //todo: sequence diagram
 //workflow:
@@ -33,7 +34,7 @@ enum ccl_sched_in_bin_status {
     ccl_sched_in_bin_erased
 };
 
-typedef ccl_status_t (*ccl_sched_finalize_fn_t)(ccl_sched*, const void*);
+typedef ccl::status (*ccl_sched_finalize_fn_t)(ccl_sched*, const void*);
 
 class ccl_extra_sched;
 
@@ -43,11 +44,7 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base {
         return "worker_sched";
     }
 
-    ccl_sched(const ccl_coll_param& coll_param, ccl_request* master_request)
-            : ccl_sched_base(coll_param) {
-        req = master_request;
-    }
-
+    ccl_sched(const ccl_coll_param& coll_param, ccl_request* master_request);
     ccl_sched() = delete;
     ccl_sched(const ccl_sched& other) = delete;
     ccl_sched& operator=(const ccl_sched& other) = delete;
@@ -158,8 +155,9 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base {
     using sched_entry_ptr = std::unique_ptr<sched_entry>;
     std::deque<sched_entry_ptr> entries{};
 
-    /* whether sched should be started in the same order as in user code */
-    bool strict_start_order = false;
+    /* whether sched should be executed in the same order as in user code */
+    /* currently applicable for start phase only */
+    bool strict_order;
 
     void set_finalize_fn(ccl_sched_finalize_fn_t fn, void* ctx) {
         finalize_fn = fn;
diff --git a/src/sched/sched_base.cpp b/src/sched/sched_base.cpp
index 57e5dac13..eb8f1c3e6 100644
--- a/src/sched/sched_base.cpp
+++ b/src/sched/sched_base.cpp
@@ -48,12 +48,12 @@ void ccl_sched_base::update_coll_param_and_attr(const ccl_coll_param& param,
 
     if (coll_param.ctype == ccl_coll_allgatherv) {
         coll_param.recv_counts = param.recv_counts;
-        CCL_THROW_IF_NOT(coll_param_copy.ag_recv_counts.size() == coll_param.comm->size());
+        CCL_THROW_IF_NOT((int)coll_param_copy.ag_recv_counts.size() == coll_param.comm->size());
         coll_param_copy.ag_recv_counts.assign((size_t*)param.recv_counts,
                                               (size_t*)param.recv_counts + coll_param.comm->size());
 
         if (coll_attr.vector_buf) {
-            CCL_THROW_IF_NOT(coll_param_copy.ag_recv_bufs.size() == coll_param.comm->size());
+            CCL_THROW_IF_NOT((int)coll_param_copy.ag_recv_bufs.size() == coll_param.comm->size());
             coll_param_copy.ag_recv_bufs.assign((void**)param.recv_buf,
                                                 (void**)param.recv_buf + coll_param.comm->size());
         }
@@ -63,8 +63,8 @@ void ccl_sched_base::update_coll_param_and_attr(const ccl_coll_param& param,
         coll_param.send_counts = param.send_counts;
         coll_param.recv_counts = param.recv_counts;
 
-        CCL_THROW_IF_NOT(coll_param_copy.a2av_send_counts.size() == coll_param.comm->size());
-        CCL_THROW_IF_NOT(coll_param_copy.a2av_recv_counts.size() == coll_param.comm->size());
+        CCL_THROW_IF_NOT((int)coll_param_copy.a2av_send_counts.size() == coll_param.comm->size());
+        CCL_THROW_IF_NOT((int)coll_param_copy.a2av_recv_counts.size() == coll_param.comm->size());
 
         coll_param_copy.a2av_send_counts.assign(
             (size_t*)param.send_counts, (size_t*)param.send_counts + coll_param.comm->size());
@@ -226,9 +226,13 @@ void ccl_sched_base::alloc_buffers_for_sycl_copy() {
         return;
 
     LOG_DEBUG("alloc tmp buffers for D2H and H2D copies, coll_type ",
-              ccl_coll_type_to_str(param.ctype), ", dtype_size ", param.dtype.size(),
-              ", comm_size ", param.comm->size(),
-              ", count ", param.count);
+              ccl_coll_type_to_str(param.ctype),
+              ", dtype_size ",
+              param.dtype.size(),
+              ", comm_size ",
+              param.comm->size(),
+              ", count ",
+              param.count);
 
     size_t idx, send_count = 0, recv_count = 0;
 
@@ -283,7 +287,8 @@ void ccl_sched_base::alloc_buffers_for_sycl_copy() {
         case ccl_coll_reduce_scatter:
             param.sycl_send_buf = static_cast<ccl_sycl_buffer_t*>((void*)param.send_buf);
             param.sycl_recv_buf = static_cast<ccl_sycl_buffer_t*>(param.recv_buf);
-            param.send_buf = alloc_buffer(param.count * param.comm->size() * param.dtype.size()).get_ptr();
+            param.send_buf =
+                alloc_buffer(param.count * param.comm->size() * param.dtype.size()).get_ptr();
             param.recv_buf = alloc_buffer(param.count * param.dtype.size()).get_ptr();
             break;
         case ccl_coll_sparse_allreduce:
diff --git a/src/sched/sched_base.hpp b/src/sched/sched_base.hpp
index 6751b9375..076ff4382 100644
--- a/src/sched/sched_base.hpp
+++ b/src/sched/sched_base.hpp
@@ -70,7 +70,8 @@ struct ccl_sched_base {
 
     void set_coll_attr(const struct ccl_coll_attr& attr);
 
-    void update_coll_param_and_attr(const struct ccl_coll_param& param, const struct ccl_coll_attr& attr);
+    void update_coll_param_and_attr(const struct ccl_coll_param& param,
+                                    const struct ccl_coll_attr& attr);
 
     size_t get_priority() const;
 
diff --git a/src/stream_impl.hpp b/src/stream_impl.hpp
index 2c8f528ff..fa08ad528 100644
--- a/src/stream_impl.hpp
+++ b/src/stream_impl.hpp
@@ -14,30 +14,28 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_aliases.hpp"
-
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_types_policy.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids.hpp"
-#include "oneapi/ccl/ccl_stream_attr_ids_traits.hpp"
-#include "oneapi/ccl/ccl_stream.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/aliases.hpp"
+
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/types_policy.hpp"
+#include "oneapi/ccl/stream_attr_ids.hpp"
+#include "oneapi/ccl/stream_attr_ids_traits.hpp"
+#include "oneapi/ccl/stream.hpp"
 #include "common/stream/stream.hpp"
+#include "common/utils/version.hpp"
 
 namespace ccl {
-/* TODO temporary function for UT compilation: would be part of ccl::environment in final*/
+
+namespace v1 {
+
+/* TODO temporary function for UT compilation: would be part of ccl::detail::environment in final*/
 template <class... attr_value_pair_t>
 stream stream::create_stream_from_attr(typename unified_device_type::ccl_native_t device,
                                        attr_value_pair_t&&... avps) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    stream str{ stream_provider_dispatcher::create(device, ret) };
+    auto version = utils::get_library_version();
+
+    stream str{ stream_provider_dispatcher::create(device, version) };
     int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
     (void)expander;
     str.build_from_params();
@@ -46,17 +44,11 @@ stream stream::create_stream_from_attr(typename unified_device_type::ccl_native_
 
 template <class... attr_value_pair_t>
 stream stream::create_stream_from_attr(typename unified_device_type::ccl_native_t device,
-                                       typename unified_device_context_type::ccl_native_t context,
+                                       typename unified_context_type::ccl_native_t context,
                                        attr_value_pair_t&&... avps) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-
-    stream str{ stream_provider_dispatcher::create(device, context, ret) };
+    auto version = utils::get_library_version();
+
+    stream str{ stream_provider_dispatcher::create(device, context, version) };
     int expander[]{ (str.template set<attr_value_pair_t::idx()>(avps.val()), 0)... };
     (void)expander;
     str.build_from_params();
@@ -65,49 +57,40 @@ stream stream::create_stream_from_attr(typename unified_device_type::ccl_native_
 
 template <class native_stream_type, typename T>
 stream stream::create_stream(native_stream_type& native_stream) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-    return { stream_provider_dispatcher::create(native_stream, ret) };
+    auto version = utils::get_library_version();
+    return { stream_provider_dispatcher::create(native_stream, version) };
 }
 
 template <class device_type, class native_context_type, typename T>
 stream stream::create_stream(device_type& device, native_context_type& native_ctx) {
-    ccl::library_version ret{};
-    ret.major = CCL_MAJOR_VERSION;
-    ret.minor = CCL_MINOR_VERSION;
-    ret.update = CCL_UPDATE_VERSION;
-    ret.product_status = CCL_PRODUCT_STATUS;
-    ret.build_date = CCL_PRODUCT_BUILD_DATE;
-    ret.full = CCL_PRODUCT_FULL;
-    return { stream_provider_dispatcher::create(device, native_ctx, ret) };
+    auto version = utils::get_library_version();
+    return { stream_provider_dispatcher::create(device, native_ctx, version) };
 }
 
 template <stream_attr_id attrId>
-CCL_API const typename details::ccl_api_type_attr_traits<stream_attr_id, attrId>::return_type&
+CCL_API const typename detail::ccl_api_type_attr_traits<stream_attr_id, attrId>::return_type&
 stream::get() const {
     return get_impl()->get_attribute_value(
-        details::ccl_api_type_attr_traits<stream_attr_id, attrId>{});
+        detail::ccl_api_type_attr_traits<stream_attr_id, attrId>{});
 }
 
 template<stream_attr_id attrId,
              class Value/*,
              typename T*/>
-CCL_API typename details::ccl_api_type_attr_traits<stream_attr_id, attrId>::return_type stream::set(const Value& v)
+CCL_API typename detail::ccl_api_type_attr_traits<stream_attr_id, attrId>::return_type stream::set(const Value& v)
 {
     return get_impl()->set_attribute_value(
-        v, details::ccl_api_type_attr_traits<stream_attr_id, attrId>{});
+        v, detail::ccl_api_type_attr_traits<stream_attr_id, attrId>{});
 }
 
 /*
-stream::stream(const typename details::ccl_api_type_attr_traits<stream_attr_id, stream_attr_id::version>::type& version) :
+stream::stream(const typename detail::ccl_api_type_attr_traits<stream_attr_id, stream_attr_id::version>::type& version) :
         base_t(stream_provider_dispatcher::create(version))
 {
 }*/
+
+} // namespace v1
+
 } // namespace ccl
 
 /***************************TypeGenerations*********************************************************/
@@ -119,14 +102,14 @@ stream::stream(const typename details::ccl_api_type_attr_traits<stream_attr_id,
                                                             native_context_type& native_ctx);
 
 #define API_STREAM_FORCE_INSTANTIATION_SET(IN_attrId, IN_Value) \
-    template CCL_API typename ccl::details::ccl_api_type_attr_traits<ccl::stream_attr_id, \
-                                                                     IN_attrId>::return_type \
+    template CCL_API typename ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, \
+                                                                    IN_attrId>::return_type \
     ccl::stream::set<IN_attrId, IN_Value>(const IN_Value& v);
 
 #define API_STREAM_FORCE_INSTANTIATION_GET(IN_attrId) \
-    template CCL_API const typename ccl::details:: \
-        ccl_api_type_attr_traits<ccl::stream_attr_id, IN_attrId>::return_type& \
-        ccl::stream::get<IN_attrId>() const;
+    template CCL_API const typename ccl::detail::ccl_api_type_attr_traits<ccl::stream_attr_id, \
+                                                                          IN_attrId>::return_type& \
+    ccl::stream::get<IN_attrId>() const;
 
 #define API_STREAM_FORCE_INSTANTIATION(IN_attrId, IN_Value) \
     API_STREAM_FORCE_INSTANTIATION_SET(IN_attrId, IN_Value) \
diff --git a/src/supported_topologies.hpp b/src/supported_topologies.hpp
index 4696185cd..0244b550c 100644
--- a/src/supported_topologies.hpp
+++ b/src/supported_topologies.hpp
@@ -14,25 +14,23 @@
  limitations under the License.
 */
 #pragma once
-#include "oneapi/ccl/ccl_type_traits.hpp"
-#include "oneapi/ccl/ccl_comm_split_attr_ids.hpp"
+
+#include "oneapi/ccl/type_traits.hpp"
+#include "oneapi/ccl/comm_split_attr_ids.hpp"
 #include "common/utils/enums.hpp"
+#include "internal_types.hpp"
 
 namespace ccl {
 
-//TODO I do not want to rename 90% f code at now
-using group_split_type = group_split_type;
-
 #define SUPPORTED_HW_TOPOLOGIES_DECL_LIST \
-    ccl::group_split_type::thread, ccl::group_split_type::process, \
-        ccl::group_split_type::cluster
+    ccl::group_split_type::thread, ccl::group_split_type::process, ccl::group_split_type::cluster
 
 #define SUPPORTED_TOPOLOGY_CLASSES_DECL_LIST \
     ccl::device_topology_type::ring, ccl::device_topology_type::a2a
 } // namespace ccl
 
-using device_group_split_type_names = utils::enum_to_str<
-    static_cast<typename std::underlying_type<ccl::group_split_type>::type>(
+using device_group_split_type_names =
+    utils::enum_to_str<static_cast<typename std::underlying_type<ccl::group_split_type>::type>(
         ccl::group_split_type::last_value)>;
 inline std::string to_string(ccl::group_split_type type) {
     return device_group_split_type_names({
diff --git a/src/types_generator_defines.hpp b/src/types_generator_defines.hpp
index 23c051187..918320d72 100644
--- a/src/types_generator_defines.hpp
+++ b/src/types_generator_defines.hpp
@@ -20,216 +20,216 @@
 /**
  * Core types generators
  */
-#define DEVICE_COMM_INTERFACE_COLL_DECLARATION__VOID \
+#define COMM_INTERFACE_COLL_DECLARATION__VOID \
 \
     virtual ccl::event allgatherv(const void* send_buf, \
-                                    size_t send_count, \
-                                    void* recv_buf, \
-                                    const ccl::vector_class<size_t>& recv_counts, \
-                                    ccl::datatype dtype, \
-                                    const ccl::stream::impl_value_t& stream, \
-                                    const ccl::allgatherv_attr& attr, \
-                                    const ccl::vector_class<ccl::event>& deps = {}) = 0; \
-\
-    virtual ccl::event allgatherv(const void* send_buf, \
-                                    size_t send_count, \
-                                    const ccl::vector_class<void*>& recv_bufs, \
-                                    const ccl::vector_class<size_t>& recv_counts, \
-                                    ccl::datatype dtype, \
-                                    const ccl::stream::impl_value_t& stream, \
-                                    const ccl::allgatherv_attr& attr, \
-                                    const ccl::vector_class<ccl::event>& deps = {}) = 0; \
-\
-    virtual ccl::event allreduce(const void* send_buf, \
-                                   void* recv_buf, \
-                                   size_t count, \
-                                   ccl::datatype dtype, \
-                                   ccl::reduction reduction, \
-                                   const ccl::stream::impl_value_t& stream, \
-                                   const ccl::allreduce_attr& attr, \
-                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
-\
-    virtual ccl::event alltoall(const void* send_buf, \
+                                  size_t send_count, \
                                   void* recv_buf, \
-                                  size_t count, \
+                                  const ccl::vector_class<size_t>& recv_counts, \
                                   ccl::datatype dtype, \
                                   const ccl::stream::impl_value_t& stream, \
-                                  const ccl::alltoall_attr& attr, \
+                                  const ccl::allgatherv_attr& attr, \
                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
-    virtual ccl::event alltoall(const ccl::vector_class<void*>& send_buf, \
-                                  const ccl::vector_class<void*>& recv_buf, \
-                                  size_t count, \
+\
+    virtual ccl::event allgatherv(const void* send_buf, \
+                                  size_t send_count, \
+                                  const ccl::vector_class<void*>& recv_bufs, \
+                                  const ccl::vector_class<size_t>& recv_counts, \
                                   ccl::datatype dtype, \
                                   const ccl::stream::impl_value_t& stream, \
-                                  const ccl::alltoall_attr& attr, \
+                                  const ccl::allgatherv_attr& attr, \
                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+\
+    virtual ccl::event allreduce(const void* send_buf, \
+                                 void* recv_buf, \
+                                 size_t count, \
+                                 ccl::datatype dtype, \
+                                 ccl::reduction reduction, \
+                                 const ccl::stream::impl_value_t& stream, \
+                                 const ccl::allreduce_attr& attr, \
+                                 const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+\
+    virtual ccl::event alltoall(const void* send_buf, \
+                                void* recv_buf, \
+                                size_t count, \
+                                ccl::datatype dtype, \
+                                const ccl::stream::impl_value_t& stream, \
+                                const ccl::alltoall_attr& attr, \
+                                const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+    virtual ccl::event alltoall(const ccl::vector_class<void*>& send_buf, \
+                                const ccl::vector_class<void*>& recv_buf, \
+                                size_t count, \
+                                ccl::datatype dtype, \
+                                const ccl::stream::impl_value_t& stream, \
+                                const ccl::alltoall_attr& attr, \
+                                const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoallv(const void* send_buf, \
-                                   const ccl::vector_class<size_t>& send_counts, \
-                                   void* recv_buf, \
-                                   const ccl::vector_class<size_t>& recv_counts, \
-                                   ccl::datatype dtype, \
-                                   const ccl::stream::impl_value_t& stream, \
-                                   const ccl::alltoallv_attr& attr, \
-                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                 const ccl::vector_class<size_t>& send_counts, \
+                                 void* recv_buf, \
+                                 const ccl::vector_class<size_t>& recv_counts, \
+                                 ccl::datatype dtype, \
+                                 const ccl::stream::impl_value_t& stream, \
+                                 const ccl::alltoallv_attr& attr, \
+                                 const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoallv(const ccl::vector_class<void*>& send_bufs, \
-                                   const ccl::vector_class<size_t>& send_counts, \
-                                   const ccl::vector_class<void*>& recv_bufs, \
-                                   const ccl::vector_class<size_t>& recv_counts, \
-                                   ccl::datatype dtype, \
-                                   const ccl::stream::impl_value_t& stream, \
-                                   const ccl::alltoallv_attr& attr, \
-                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                 const ccl::vector_class<size_t>& send_counts, \
+                                 const ccl::vector_class<void*>& recv_bufs, \
+                                 const ccl::vector_class<size_t>& recv_counts, \
+                                 ccl::datatype dtype, \
+                                 const ccl::stream::impl_value_t& stream, \
+                                 const ccl::alltoallv_attr& attr, \
+                                 const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event bcast(void* buf, \
-                               size_t count, \
-                               ccl::datatype dtype, \
-                               size_t root, \
-                               const ccl::stream::impl_value_t& stream, \
-                               const ccl::broadcast_attr& attr, \
-                               const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                             size_t count, \
+                             ccl::datatype dtype, \
+                             int root, \
+                             const ccl::stream::impl_value_t& stream, \
+                             const ccl::broadcast_attr& attr, \
+                             const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event reduce(const void* send_buf, \
-                                void* recv_buf, \
-                                size_t count, \
-                                ccl::datatype dtype, \
-                                ccl::reduction reduction, \
-                                size_t root, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::reduce_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                              void* recv_buf, \
+                              size_t count, \
+                              ccl::datatype dtype, \
+                              ccl::reduction reduction, \
+                              int root, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::reduce_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event reduce_scatter(const void* send_buf, \
-                                        void* recv_buf, \
-                                        size_t recv_count, \
-                                        ccl::datatype dtype, \
-                                        ccl::reduction reduction, \
-                                        const ccl::stream::impl_value_t& stream, \
-                                        const reduce_scatter_attr& attr, \
-                                        const ccl::vector_class<ccl::event>& deps = {}) = 0;
+                                      void* recv_buf, \
+                                      size_t recv_count, \
+                                      ccl::datatype dtype, \
+                                      ccl::reduction reduction, \
+                                      const ccl::stream::impl_value_t& stream, \
+                                      const ccl::reduce_scatter_attr& attr, \
+                                      const ccl::vector_class<ccl::event>& deps = {}) = 0;
 
-#define DEVICE_COMM_INTERFACE_SPARSE_DECLARATION__VOID \
+#define COMM_INTERFACE_SPARSE_DECLARATION__VOID \
 \
     virtual ccl::event sparse_allreduce(const void* send_ind_buf, \
-                                          size_t send_ind_count, \
-                                          const void* send_val_buf, \
-                                          size_t send_val_count, \
-                                          void* recv_ind_buf, \
-                                          size_t recv_ind_count, \
-                                          void* recv_val_buf, \
-                                          size_t recv_val_count, \
-                                          ccl::datatype index_dtype, \
-                                          ccl::datatype value_dtype, \
-                                          ccl::reduction reduction, \
-                                          const ccl::stream::impl_value_t& stream, \
-                                          const ccl::sparse_allreduce_attr& attr, \
-                                          const ccl::vector_class<ccl::event>& deps = {}) = 0;
+                                        size_t send_ind_count, \
+                                        const void* send_val_buf, \
+                                        size_t send_val_count, \
+                                        void* recv_ind_buf, \
+                                        size_t recv_ind_count, \
+                                        void* recv_val_buf, \
+                                        size_t recv_val_count, \
+                                        ccl::datatype index_dtype, \
+                                        ccl::datatype value_dtype, \
+                                        ccl::reduction reduction, \
+                                        const ccl::stream::impl_value_t& stream, \
+                                        const ccl::sparse_allreduce_attr& attr, \
+                                        const ccl::vector_class<ccl::event>& deps = {}) = 0;
 
-#define DEVICE_COMM_INTERFACE_COLL_DECLARATION(type) \
+#define COMM_INTERFACE_COLL_DECLARATION(type) \
 \
     virtual ccl::event allgatherv(const type* send_buf, \
-                                    size_t send_count, \
-                                    type* recv_buf, \
-                                    const ccl::vector_class<size_t>& recv_counts, \
-                                    const ccl::stream::impl_value_t& stream, \
-                                    const ccl::allgatherv_attr& attr, \
-                                    const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                  size_t send_count, \
+                                  type* recv_buf, \
+                                  const ccl::vector_class<size_t>& recv_counts, \
+                                  const ccl::stream::impl_value_t& stream, \
+                                  const ccl::allgatherv_attr& attr, \
+                                  const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event allgatherv(const type* send_buf, \
-                                    size_t send_count, \
-                                    ccl::vector_class<type*>& recv_bufs, \
-                                    const ccl::vector_class<size_t>& recv_counts, \
-                                    const ccl::stream::impl_value_t& stream, \
-                                    const ccl::allgatherv_attr& attr, \
-                                    const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                  size_t send_count, \
+                                  ccl::vector_class<type*>& recv_bufs, \
+                                  const ccl::vector_class<size_t>& recv_counts, \
+                                  const ccl::stream::impl_value_t& stream, \
+                                  const ccl::allgatherv_attr& attr, \
+                                  const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event allreduce(const type* send_buf, \
-                                   type* recv_buf, \
-                                   size_t count, \
-                                   ccl::reduction reduction, \
-                                   const ccl::stream::impl_value_t& stream, \
-                                   const ccl::allreduce_attr& attr, \
-                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                 type* recv_buf, \
+                                 size_t count, \
+                                 ccl::reduction reduction, \
+                                 const ccl::stream::impl_value_t& stream, \
+                                 const ccl::allreduce_attr& attr, \
+                                 const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoall(const type* send_buf, \
-                                  type* recv_buf, \
-                                  size_t count, \
-                                  const ccl::stream::impl_value_t& stream, \
-                                  const ccl::alltoall_attr& attr, \
-                                  const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                type* recv_buf, \
+                                size_t count, \
+                                const ccl::stream::impl_value_t& stream, \
+                                const ccl::alltoall_attr& attr, \
+                                const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoall(const ccl::vector_class<type*>& send_buf, \
-                                  const ccl::vector_class<type*>& recv_buf, \
-                                  size_t count, \
-                                  const ccl::stream::impl_value_t& stream, \
-                                  const ccl::alltoall_attr& attr, \
-                                  const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                const ccl::vector_class<type*>& recv_buf, \
+                                size_t count, \
+                                const ccl::stream::impl_value_t& stream, \
+                                const ccl::alltoall_attr& attr, \
+                                const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoallv(const type* send_buf, \
-                                   const ccl::vector_class<size_t>& send_counts, \
-                                   type* recv_buf, \
-                                   const ccl::vector_class<size_t>& recv_counts, \
-                                   const ccl::stream::impl_value_t& stream, \
-                                   const ccl::alltoallv_attr& attr, \
-                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                 const ccl::vector_class<size_t>& send_counts, \
+                                 type* recv_buf, \
+                                 const ccl::vector_class<size_t>& recv_counts, \
+                                 const ccl::stream::impl_value_t& stream, \
+                                 const ccl::alltoallv_attr& attr, \
+                                 const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoallv(const ccl::vector_class<type*>& send_bufs, \
-                                   const ccl::vector_class<size_t>& send_counts, \
-                                   const ccl::vector_class<type*>& recv_bufs, \
-                                   const ccl::vector_class<size_t>& recv_counts, \
-                                   const ccl::stream::impl_value_t& stream, \
-                                   const ccl::alltoallv_attr& attr, \
-                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                 const ccl::vector_class<size_t>& send_counts, \
+                                 const ccl::vector_class<type*>& recv_bufs, \
+                                 const ccl::vector_class<size_t>& recv_counts, \
+                                 const ccl::stream::impl_value_t& stream, \
+                                 const ccl::alltoallv_attr& attr, \
+                                 const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event bcast(type* buf, \
-                               size_t count, \
-                               size_t root, \
-                               const ccl::stream::impl_value_t& stream, \
-                               const ccl::broadcast_attr& attr, \
-                               const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                             size_t count, \
+                             int root, \
+                             const ccl::stream::impl_value_t& stream, \
+                             const ccl::broadcast_attr& attr, \
+                             const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event reduce(const type* send_buf, \
-                                type* recv_buf, \
-                                size_t count, \
-                                ccl::reduction reduction, \
-                                size_t root, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::reduce_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                              type* recv_buf, \
+                              size_t count, \
+                              ccl::reduction reduction, \
+                              int root, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::reduce_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event reduce_scatter(const type* send_buf, \
-                                        type* recv_buf, \
-                                        size_t recv_count, \
-                                        ccl::reduction reduction, \
-                                        const ccl::stream::impl_value_t& stream, \
-                                        const ccl::reduce_scatter_attr& attr, \
-                                        const ccl::vector_class<ccl::event>& deps) = 0;
+                                      type* recv_buf, \
+                                      size_t recv_count, \
+                                      ccl::reduction reduction, \
+                                      const ccl::stream::impl_value_t& stream, \
+                                      const ccl::reduce_scatter_attr& attr, \
+                                      const ccl::vector_class<ccl::event>& deps) = 0;
 
-#define DEVICE_COMM_INTERFACE_SPARSE_DECLARATION(index_type, value_type) \
+#define COMM_INTERFACE_SPARSE_DECLARATION(index_type, value_type) \
 \
     virtual ccl::event sparse_allreduce(const index_type* send_ind_buf, \
-                                          size_t send_ind_count, \
-                                          const value_type* send_val_buf, \
-                                          size_t send_val_count, \
-                                          index_type* recv_ind_buf, \
-                                          size_t recv_ind_count, \
-                                          value_type* recv_val_buf, \
-                                          size_t recv_val_count, \
-                                          ccl::reduction reduction, \
-                                          const ccl::stream::impl_value_t& stream, \
-                                          const ccl::sparse_allreduce_attr& attr, \
-                                          const ccl::vector_class<ccl::event>& deps = {}) = 0;
+                                        size_t send_ind_count, \
+                                        const value_type* send_val_buf, \
+                                        size_t send_val_count, \
+                                        index_type* recv_ind_buf, \
+                                        size_t recv_ind_count, \
+                                        value_type* recv_val_buf, \
+                                        size_t recv_val_count, \
+                                        ccl::reduction reduction, \
+                                        const ccl::stream::impl_value_t& stream, \
+                                        const ccl::sparse_allreduce_attr& attr, \
+                                        const ccl::vector_class<ccl::event>& deps = {}) = 0;
 
-#define DEVICE_COMM_INTERFACE_COLL_CLASS_DECLARATION(type) \
+#define COMM_INTERFACE_COLL_CLASS_DECLARATION(type) \
 \
     virtual ccl::event allgatherv(const type& send_buf, \
-                                    size_t send_count, \
-                                    type& recv_buf, \
-                                    const ccl::vector_class<size_t>& recv_counts, \
-                                    const ccl::stream::impl_value_t& stream, \
-                                    const ccl::allgatherv_attr& attr, \
-                                    const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                  size_t send_count, \
+                                  type& recv_buf, \
+                                  const ccl::vector_class<size_t>& recv_counts, \
+                                  const ccl::stream::impl_value_t& stream, \
+                                  const ccl::allgatherv_attr& attr, \
+                                  const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event allgatherv( \
         const type& send_buf, \
@@ -241,19 +241,19 @@
         const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event allreduce(const type& send_buf, \
-                                   type& recv_buf, \
-                                   size_t count, \
-                                   ccl::reduction reduction, \
-                                   const ccl::stream::impl_value_t& stream, \
-                                   const ccl::allreduce_attr& attr, \
-                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                 type& recv_buf, \
+                                 size_t count, \
+                                 ccl::reduction reduction, \
+                                 const ccl::stream::impl_value_t& stream, \
+                                 const ccl::allreduce_attr& attr, \
+                                 const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoall(const type& send_buf, \
-                                  type& recv_buf, \
-                                  size_t count, \
-                                  const ccl::stream::impl_value_t& stream, \
-                                  const ccl::alltoall_attr& attr, \
-                                  const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                type& recv_buf, \
+                                size_t count, \
+                                const ccl::stream::impl_value_t& stream, \
+                                const ccl::alltoall_attr& attr, \
+                                const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoall( \
         const ccl::vector_class<ccl::reference_wrapper_class<type>>& send_buf, \
@@ -264,12 +264,12 @@
         const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoallv(const type& send_buf, \
-                                   const ccl::vector_class<size_t>& send_counts, \
-                                   type& recv_buf, \
-                                   const ccl::vector_class<size_t>& recv_counts, \
-                                   const ccl::stream::impl_value_t& stream, \
-                                   const ccl::alltoallv_attr& attr, \
-                                   const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                                 const ccl::vector_class<size_t>& send_counts, \
+                                 type& recv_buf, \
+                                 const ccl::vector_class<size_t>& recv_counts, \
+                                 const ccl::stream::impl_value_t& stream, \
+                                 const ccl::alltoallv_attr& attr, \
+                                 const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event alltoallv( \
         const ccl::vector_class<ccl::reference_wrapper_class<type>>& send_bufs, \
@@ -281,178 +281,178 @@
         const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event bcast(type& buf, \
-                               size_t count, \
-                               size_t root, \
-                               const ccl::stream::impl_value_t& stream, \
-                               const ccl::broadcast_attr& attr, \
-                               const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                             size_t count, \
+                             int root, \
+                             const ccl::stream::impl_value_t& stream, \
+                             const ccl::broadcast_attr& attr, \
+                             const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event reduce(const type& send_buf, \
-                                type& recv_buf, \
-                                size_t count, \
-                                ccl::reduction reduction, \
-                                size_t root, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::reduce_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps = {}) = 0; \
+                              type& recv_buf, \
+                              size_t count, \
+                              ccl::reduction reduction, \
+                              int root, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::reduce_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps = {}) = 0; \
 \
     virtual ccl::event reduce_scatter(const type& send_buf, \
-                                        type& recv_buf, \
-                                        size_t recv_count, \
-                                        ccl::reduction reduction, \
-                                        const ccl::stream::impl_value_t& stream, \
-                                        const ccl::reduce_scatter_attr& attr, \
-                                        const ccl::vector_class<ccl::event>& deps = {}) = 0;
+                                      type& recv_buf, \
+                                      size_t recv_count, \
+                                      ccl::reduction reduction, \
+                                      const ccl::stream::impl_value_t& stream, \
+                                      const ccl::reduce_scatter_attr& attr, \
+                                      const ccl::vector_class<ccl::event>& deps = {}) = 0;
 
-#define DEVICE_COMM_INTERFACE_SPARSE_CLASS_DECLARATION(index_type, value_type) \
+#define COMM_INTERFACE_SPARSE_CLASS_DECLARATION(index_type, value_type) \
 \
     virtual ccl::event sparse_allreduce(const index_type& send_ind_buf, \
-                                          size_t send_ind_count, \
-                                          const value_type& send_val_buf, \
-                                          size_t send_val_count, \
-                                          index_type& recv_ind_buf, \
-                                          size_t recv_ind_count, \
-                                          value_type& recv_val_buf, \
-                                          size_t recv_val_count, \
-                                          ccl::reduction reduction, \
-                                          const ccl::stream::impl_value_t& stream, \
-                                          const ccl::sparse_allreduce_attr& attr, \
-                                          const ccl::vector_class<ccl::event>& deps = {}) = 0;
+                                        size_t send_ind_count, \
+                                        const value_type& send_val_buf, \
+                                        size_t send_val_count, \
+                                        index_type& recv_ind_buf, \
+                                        size_t recv_ind_count, \
+                                        value_type& recv_val_buf, \
+                                        size_t recv_val_count, \
+                                        ccl::reduction reduction, \
+                                        const ccl::stream::impl_value_t& stream, \
+                                        const ccl::sparse_allreduce_attr& attr, \
+                                        const ccl::vector_class<ccl::event>& deps = {}) = 0;
 
 /**
  * Specific coll instantiation
  */
-#define DEVICE_COMM_INTERFACE_COLL_DEFINITION__VOID \
+#define COMM_INTERFACE_COLL_DEFINITION__VOID \
 \
     ccl::event allgatherv(const void* send_buf, \
-                            size_t send_count, \
-                            void* recv_buf, \
-                            const ccl::vector_class<size_t>& recv_counts, \
-                            ccl::datatype dtype, \
-                            const ccl::stream::impl_value_t& stream, \
-                            const ccl::allgatherv_attr& attr, \
-                            const ccl::vector_class<ccl::event>& deps = {}) override { \
+                          size_t send_count, \
+                          void* recv_buf, \
+                          const ccl::vector_class<size_t>& recv_counts, \
+                          ccl::datatype dtype, \
+                          const ccl::stream::impl_value_t& stream, \
+                          const ccl::allgatherv_attr& attr, \
+                          const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allgatherv_impl( \
             send_buf, send_count, recv_buf, recv_counts, dtype, stream, attr, deps); \
     } \
 \
     ccl::event allgatherv(const void* send_buf, \
-                            size_t send_count, \
-                            const ccl::vector_class<void*>& recv_bufs, \
-                            const ccl::vector_class<size_t>& recv_counts, \
-                            ccl::datatype dtype, \
-                            const ccl::stream::impl_value_t& stream, \
-                            const ccl::allgatherv_attr& attr, \
-                            const ccl::vector_class<ccl::event>& deps = {}) override { \
+                          size_t send_count, \
+                          const ccl::vector_class<void*>& recv_bufs, \
+                          const ccl::vector_class<size_t>& recv_counts, \
+                          ccl::datatype dtype, \
+                          const ccl::stream::impl_value_t& stream, \
+                          const ccl::allgatherv_attr& attr, \
+                          const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allgatherv_impl( \
             send_buf, send_count, recv_bufs, recv_counts, dtype, stream, attr, deps); \
     } \
 \
     ccl::event allreduce(const void* send_buf, \
-                           void* recv_buf, \
-                           size_t count, \
-                           ccl::datatype dtype, \
-                           ccl::reduction reduction, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::allreduce_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps = {}) override { \
+                         void* recv_buf, \
+                         size_t count, \
+                         ccl::datatype dtype, \
+                         ccl::reduction reduction, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::allreduce_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allreduce_impl( \
             send_buf, recv_buf, count, dtype, reduction, stream, attr, deps); \
     } \
 \
     ccl::event alltoall(const void* send_buf, \
-                          void* recv_buf, \
-                          size_t count, \
-                          ccl::datatype dtype, \
-                          const ccl::stream::impl_value_t& stream, \
-                          const ccl::alltoall_attr& attr, \
-                          const ccl::vector_class<ccl::event>& deps = {}) override { \
+                        void* recv_buf, \
+                        size_t count, \
+                        ccl::datatype dtype, \
+                        const ccl::stream::impl_value_t& stream, \
+                        const ccl::alltoall_attr& attr, \
+                        const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->alltoall_impl(send_buf, recv_buf, count, dtype, stream, attr, deps); \
     } \
     ccl::event alltoall(const ccl::vector_class<void*>& send_buf, \
-                          const ccl::vector_class<void*>& recv_buf, \
-                          size_t count, \
-                          ccl::datatype dtype, \
-                          const ccl::stream::impl_value_t& stream, \
-                          const ccl::alltoall_attr& attr, \
-                          const ccl::vector_class<ccl::event>& deps = {}) override { \
+                        const ccl::vector_class<void*>& recv_buf, \
+                        size_t count, \
+                        ccl::datatype dtype, \
+                        const ccl::stream::impl_value_t& stream, \
+                        const ccl::alltoall_attr& attr, \
+                        const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->alltoall_impl(send_buf, recv_buf, count, dtype, stream, attr, deps); \
     } \
 \
     ccl::event alltoallv(const void* send_buf, \
-                           const ccl::vector_class<size_t>& send_counts, \
-                           void* recv_buf, \
-                           const ccl::vector_class<size_t>& recv_counts, \
-                           ccl::datatype dtype, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::alltoallv_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps = {}) override { \
+                         const ccl::vector_class<size_t>& send_counts, \
+                         void* recv_buf, \
+                         const ccl::vector_class<size_t>& recv_counts, \
+                         ccl::datatype dtype, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::alltoallv_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->alltoallv_impl( \
             send_buf, send_counts, recv_buf, recv_counts, dtype, stream, attr, deps); \
     } \
     ccl::event alltoallv(const ccl::vector_class<void*>& send_bufs, \
-                           const ccl::vector_class<size_t>& send_counts, \
-                           const ccl::vector_class<void*>& recv_bufs, \
-                           const ccl::vector_class<size_t>& recv_counts, \
-                           ccl::datatype dtype, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::alltoallv_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps) override { \
+                         const ccl::vector_class<size_t>& send_counts, \
+                         const ccl::vector_class<void*>& recv_bufs, \
+                         const ccl::vector_class<size_t>& recv_counts, \
+                         ccl::datatype dtype, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::alltoallv_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->alltoallv_impl( \
             send_bufs, send_counts, recv_bufs, recv_counts, dtype, stream, attr, deps); \
     } \
 \
     ccl::event bcast(void* buf, \
-                       size_t count, \
-                       ccl::datatype dtype, \
-                       size_t root, \
-                       const ccl::stream::impl_value_t& stream, \
-                       const ccl::broadcast_attr& attr, \
-                       const ccl::vector_class<ccl::event>& deps = {}) override { \
+                     size_t count, \
+                     ccl::datatype dtype, \
+                     int root, \
+                     const ccl::stream::impl_value_t& stream, \
+                     const ccl::broadcast_attr& attr, \
+                     const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->broadcast_impl(buf, count, dtype, root, stream, attr, deps); \
     } \
 \
     ccl::event reduce(const void* send_buf, \
-                        void* recv_buf, \
-                        size_t count, \
-                        ccl::datatype dtype, \
-                        ccl::reduction reduction, \
-                        size_t root, \
-                        const ccl::stream::impl_value_t& stream, \
-                        const ccl::reduce_attr& attr, \
-                        const ccl::vector_class<ccl::event>& deps = {}) override { \
+                      void* recv_buf, \
+                      size_t count, \
+                      ccl::datatype dtype, \
+                      ccl::reduction reduction, \
+                      int root, \
+                      const ccl::stream::impl_value_t& stream, \
+                      const ccl::reduce_attr& attr, \
+                      const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->reduce_impl( \
             send_buf, recv_buf, count, dtype, reduction, root, stream, attr, deps); \
     } \
 \
     ccl::event reduce_scatter(const void* send_buf, \
-                                void* recv_buf, \
-                                size_t recv_count, \
-                                ccl::datatype dtype, \
-                                ccl::reduction reduction, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::reduce_scatter_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps) override { \
+                              void* recv_buf, \
+                              size_t recv_count, \
+                              ccl::datatype dtype, \
+                              ccl::reduction reduction, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::reduce_scatter_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->reduce_scatter_impl( \
             send_buf, recv_buf, recv_count, dtype, reduction, stream, attr, deps); \
     }
 
-#define DEVICE_COMM_INTERFACE_SPARSE_DEFINITION__VOID \
+#define COMM_INTERFACE_SPARSE_DEFINITION__VOID \
 \
     ccl::event sparse_allreduce(const void* send_ind_buf, \
-                                  size_t send_ind_count, \
-                                  const void* send_val_buf, \
-                                  size_t send_val_count, \
-                                  void* recv_ind_buf, \
-                                  size_t recv_ind_count, \
-                                  void* recv_val_buf, \
-                                  size_t recv_val_count, \
-                                  ccl::datatype index_dtype, \
-                                  ccl::datatype value_dtype, \
-                                  ccl::reduction reduction, \
-                                  const ccl::stream::impl_value_t& stream, \
-                                  const ccl::sparse_allreduce_attr& attr, \
-                                  const ccl::vector_class<ccl::event>& deps = {}) override { \
+                                size_t send_ind_count, \
+                                const void* send_val_buf, \
+                                size_t send_val_count, \
+                                void* recv_ind_buf, \
+                                size_t recv_ind_count, \
+                                void* recv_val_buf, \
+                                size_t recv_val_count, \
+                                ccl::datatype index_dtype, \
+                                ccl::datatype value_dtype, \
+                                ccl::reduction reduction, \
+                                const ccl::stream::impl_value_t& stream, \
+                                const ccl::sparse_allreduce_attr& attr, \
+                                const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->sparse_allreduce_impl(send_ind_buf, \
                                                  send_ind_count, \
                                                  send_val_buf, \
@@ -469,126 +469,126 @@
                                                  deps); \
     }
 
-#define DEVICE_COMM_INTERFACE_COLL_DEFINITION(type) \
+#define COMM_INTERFACE_COLL_DEFINITION(type) \
 \
     ccl::event allgatherv(const type* send_buf, \
-                            size_t send_count, \
-                            type* recv_buf, \
-                            const ccl::vector_class<size_t>& recv_counts, \
-                            const ccl::stream::impl_value_t& stream, \
-                            const ccl::allgatherv_attr& attr, \
-                            const ccl::vector_class<ccl::event>& deps = {}) override { \
+                          size_t send_count, \
+                          type* recv_buf, \
+                          const ccl::vector_class<size_t>& recv_counts, \
+                          const ccl::stream::impl_value_t& stream, \
+                          const ccl::allgatherv_attr& attr, \
+                          const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allgatherv_impl( \
             send_buf, send_count, recv_buf, recv_counts, stream, attr, deps); \
     } \
 \
     ccl::event allgatherv(const type* send_buf, \
-                            size_t send_count, \
-                            ccl::vector_class<type*>& recv_buf, \
-                            const ccl::vector_class<size_t>& recv_counts, \
-                            const ccl::stream::impl_value_t& stream, \
-                            const ccl::allgatherv_attr& attr, \
-                            const ccl::vector_class<ccl::event>& deps = {}) override { \
+                          size_t send_count, \
+                          ccl::vector_class<type*>& recv_buf, \
+                          const ccl::vector_class<size_t>& recv_counts, \
+                          const ccl::stream::impl_value_t& stream, \
+                          const ccl::allgatherv_attr& attr, \
+                          const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allgatherv_impl( \
             send_buf, send_count, recv_buf, recv_counts, stream, attr, deps); \
     } \
 \
     ccl::event allreduce(const type* send_buf, \
-                           type* recv_buf, \
-                           size_t count, \
-                           ccl::reduction reduction, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::allreduce_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps = {}) override { \
+                         type* recv_buf, \
+                         size_t count, \
+                         ccl::reduction reduction, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::allreduce_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allreduce_impl( \
             send_buf, recv_buf, count, reduction, stream, attr, deps); \
     } \
 \
     ccl::event alltoall(const type* send_buf, \
-                          type* recv_buf, \
-                          size_t count, \
-                          const ccl::stream::impl_value_t& stream, \
-                          const ccl::alltoall_attr& attr, \
-                          const ccl::vector_class<ccl::event>& deps) override { \
+                        type* recv_buf, \
+                        size_t count, \
+                        const ccl::stream::impl_value_t& stream, \
+                        const ccl::alltoall_attr& attr, \
+                        const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->alltoall_impl(send_buf, recv_buf, count, stream, attr, deps); \
     } \
     ccl::event alltoall(const ccl::vector_class<type*>& send_buf, \
-                          const ccl::vector_class<type*>& recv_buf, \
-                          size_t count, \
-                          const ccl::stream::impl_value_t& stream, \
-                          const ccl::alltoall_attr& attr, \
-                          const ccl::vector_class<ccl::event>& deps) override { \
+                        const ccl::vector_class<type*>& recv_buf, \
+                        size_t count, \
+                        const ccl::stream::impl_value_t& stream, \
+                        const ccl::alltoall_attr& attr, \
+                        const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->alltoall_impl(send_buf, recv_buf, count, stream, attr, deps); \
     } \
 \
     ccl::event alltoallv(const type* send_buf, \
-                           const ccl::vector_class<size_t>& send_counts, \
-                           type* recv_buf, \
-                           const ccl::vector_class<size_t>& recv_counts, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::alltoallv_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps) override { \
+                         const ccl::vector_class<size_t>& send_counts, \
+                         type* recv_buf, \
+                         const ccl::vector_class<size_t>& recv_counts, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::alltoallv_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->alltoallv_impl( \
             send_buf, send_counts, recv_buf, recv_counts, stream, attr, deps); \
     } \
 \
     ccl::event alltoallv(const ccl::vector_class<type*>& send_bufs, \
-                           const ccl::vector_class<size_t>& send_counts, \
-                           const ccl::vector_class<type*>& recv_bufs, \
-                           const ccl::vector_class<size_t>& recv_counts, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::alltoallv_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps) override { \
+                         const ccl::vector_class<size_t>& send_counts, \
+                         const ccl::vector_class<type*>& recv_bufs, \
+                         const ccl::vector_class<size_t>& recv_counts, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::alltoallv_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->alltoallv_impl( \
             send_bufs, send_counts, recv_bufs, recv_counts, stream, attr, deps); \
     } \
 \
     ccl::event bcast(type* buf, \
-                       size_t count, \
-                       size_t root, \
-                       const ccl::stream::impl_value_t& stream, \
-                       const ccl::broadcast_attr& attr, \
-                       const ccl::vector_class<ccl::event>& deps) override { \
+                     size_t count, \
+                     int root, \
+                     const ccl::stream::impl_value_t& stream, \
+                     const ccl::broadcast_attr& attr, \
+                     const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->broadcast_impl(buf, count, root, stream, attr, deps); \
     } \
 \
     ccl::event reduce(const type* send_buf, \
-                        type* recv_buf, \
-                        size_t count, \
-                        ccl::reduction reduction, \
-                        size_t root, \
-                        const ccl::stream::impl_value_t& stream, \
-                        const ccl::reduce_attr& attr, \
-                        const ccl::vector_class<ccl::event>& deps) override { \
+                      type* recv_buf, \
+                      size_t count, \
+                      ccl::reduction reduction, \
+                      int root, \
+                      const ccl::stream::impl_value_t& stream, \
+                      const ccl::reduce_attr& attr, \
+                      const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->reduce_impl( \
             send_buf, recv_buf, count, reduction, root, stream, attr, deps); \
     } \
 \
     ccl::event reduce_scatter(const type* send_buf, \
-                                type* recv_buf, \
-                                size_t recv_count, \
-                                ccl::reduction reduction, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::reduce_scatter_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps) override { \
+                              type* recv_buf, \
+                              size_t recv_count, \
+                              ccl::reduction reduction, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::reduce_scatter_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->reduce_scatter_impl( \
             send_buf, recv_buf, recv_count, reduction, stream, attr, deps); \
     }
 
-#define DEVICE_COMM_INTERFACE_SPARSE_DEFINITION(index_type, value_type) \
+#define COMM_INTERFACE_SPARSE_DEFINITION(index_type, value_type) \
 \
     ccl::event sparse_allreduce(const index_type* send_ind_buf, \
-                                  size_t send_ind_count, \
-                                  const value_type* send_val_buf, \
-                                  size_t send_val_count, \
-                                  index_type* recv_ind_buf, \
-                                  size_t recv_ind_count, \
-                                  value_type* recv_val_buf, \
-                                  size_t recv_val_count, \
-                                  ccl::reduction reduction, \
-                                  const ccl::stream::impl_value_t& stream, \
-                                  const ccl::sparse_allreduce_attr& attr, \
-                                  const ccl::vector_class<ccl::event>& deps = {}) override { \
+                                size_t send_ind_count, \
+                                const value_type* send_val_buf, \
+                                size_t send_val_count, \
+                                index_type* recv_ind_buf, \
+                                size_t recv_ind_count, \
+                                value_type* recv_val_buf, \
+                                size_t recv_val_count, \
+                                ccl::reduction reduction, \
+                                const ccl::stream::impl_value_t& stream, \
+                                const ccl::sparse_allreduce_attr& attr, \
+                                const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->sparse_allreduce_impl(send_ind_buf, \
                                                  send_ind_count, \
                                                  send_val_buf, \
@@ -603,126 +603,126 @@
                                                  deps); \
     }
 
-#define DEVICE_COMM_INTERFACE_COLL_CLASS_DEFINITION(type) \
+#define COMM_INTERFACE_COLL_CLASS_DEFINITION(type) \
 \
     ccl::event allgatherv(const type& send_buf, \
-                            size_t send_count, \
-                            type& recv_buf, \
-                            const ccl::vector_class<size_t>& recv_counts, \
-                            const ccl::stream::impl_value_t& stream, \
-                            const ccl::allgatherv_attr& attr, \
-                            const ccl::vector_class<ccl::event>& deps = {}) override { \
+                          size_t send_count, \
+                          type& recv_buf, \
+                          const ccl::vector_class<size_t>& recv_counts, \
+                          const ccl::stream::impl_value_t& stream, \
+                          const ccl::allgatherv_attr& attr, \
+                          const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allgatherv_impl( \
             send_buf, send_count, recv_buf, recv_counts, stream, attr, deps); \
     } \
 \
     ccl::event allgatherv(const type& send_buf, \
-                            size_t send_count, \
-                            ccl::vector_class<ccl::reference_wrapper_class<type>>& recv_buf, \
-                            const ccl::vector_class<size_t>& recv_counts, \
-                            const ccl::stream::impl_value_t& stream, \
-                            const ccl::allgatherv_attr& attr, \
-                            const ccl::vector_class<ccl::event>& deps = {}) override { \
+                          size_t send_count, \
+                          ccl::vector_class<ccl::reference_wrapper_class<type>>& recv_buf, \
+                          const ccl::vector_class<size_t>& recv_counts, \
+                          const ccl::stream::impl_value_t& stream, \
+                          const ccl::allgatherv_attr& attr, \
+                          const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allgatherv_impl( \
             send_buf, send_count, recv_buf, recv_counts, stream, attr, deps); \
     } \
 \
     ccl::event allreduce(const type& send_buf, \
-                           type& recv_buf, \
-                           size_t count, \
-                           ccl::reduction reduction, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::allreduce_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps = {}) override { \
+                         type& recv_buf, \
+                         size_t count, \
+                         ccl::reduction reduction, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::allreduce_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->allreduce_impl( \
             send_buf, recv_buf, count, reduction, stream, attr, deps); \
     } \
 \
     ccl::event alltoall(const type& send_buf, \
-                          type& recv_buf, \
-                          size_t count, \
-                          const ccl::stream::impl_value_t& stream, \
-                          const ccl::alltoall_attr& attr, \
-                          const ccl::vector_class<ccl::event>& deps = {}) override { \
+                        type& recv_buf, \
+                        size_t count, \
+                        const ccl::stream::impl_value_t& stream, \
+                        const ccl::alltoall_attr& attr, \
+                        const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->alltoall_impl(send_buf, recv_buf, count, stream, attr, deps); \
     } \
     ccl::event alltoall(const ccl::vector_class<ccl::reference_wrapper_class<type>>& send_buf, \
-                          const ccl::vector_class<ccl::reference_wrapper_class<type>>& recv_buf, \
-                          size_t count, \
-                          const ccl::stream::impl_value_t& stream, \
-                          const ccl::alltoall_attr& attr, \
-                          const ccl::vector_class<ccl::event>& deps = {}) override { \
+                        const ccl::vector_class<ccl::reference_wrapper_class<type>>& recv_buf, \
+                        size_t count, \
+                        const ccl::stream::impl_value_t& stream, \
+                        const ccl::alltoall_attr& attr, \
+                        const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->alltoall_impl(send_buf, recv_buf, count, stream, attr, deps); \
     } \
 \
     ccl::event alltoallv(const type& send_buf, \
-                           const ccl::vector_class<size_t>& send_counts, \
-                           type& recv_buf, \
-                           const ccl::vector_class<size_t>& recv_counts, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::alltoallv_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps = {}) override { \
+                         const ccl::vector_class<size_t>& send_counts, \
+                         type& recv_buf, \
+                         const ccl::vector_class<size_t>& recv_counts, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::alltoallv_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->alltoallv_impl( \
             send_buf, send_counts, recv_buf, recv_counts, stream, attr, deps); \
     } \
 \
     ccl::event alltoallv(const ccl::vector_class<ccl::reference_wrapper_class<type>>& send_buf, \
-                           const ccl::vector_class<size_t>& send_counts, \
-                           const ccl::vector_class<ccl::reference_wrapper_class<type>>& recv_buf, \
-                           const ccl::vector_class<size_t>& recv_counts, \
-                           const ccl::stream::impl_value_t& stream, \
-                           const ccl::alltoallv_attr& attr, \
-                           const ccl::vector_class<ccl::event>& deps) override { \
+                         const ccl::vector_class<size_t>& send_counts, \
+                         const ccl::vector_class<ccl::reference_wrapper_class<type>>& recv_buf, \
+                         const ccl::vector_class<size_t>& recv_counts, \
+                         const ccl::stream::impl_value_t& stream, \
+                         const ccl::alltoallv_attr& attr, \
+                         const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->alltoallv_impl( \
             send_buf, send_counts, recv_buf, recv_counts, stream, attr, deps); \
     } \
 \
     ccl::event bcast(type& buf, \
-                       size_t count, \
-                       size_t root, \
-                       const ccl::stream::impl_value_t& stream, \
-                       const ccl::broadcast_attr& attr, \
-                       const ccl::vector_class<ccl::event>& deps = {}) override { \
+                     size_t count, \
+                     int root, \
+                     const ccl::stream::impl_value_t& stream, \
+                     const ccl::broadcast_attr& attr, \
+                     const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->broadcast_impl(buf, count, root, stream, attr, deps); \
     } \
 \
     ccl::event reduce(const type& send_buf, \
-                        type& recv_buf, \
-                        size_t count, \
-                        ccl::reduction reduction, \
-                        size_t root, \
-                        const ccl::stream::impl_value_t& stream, \
-                        const ccl::reduce_attr& attr, \
-                        const ccl::vector_class<ccl::event>& deps = {}) override { \
+                      type& recv_buf, \
+                      size_t count, \
+                      ccl::reduction reduction, \
+                      int root, \
+                      const ccl::stream::impl_value_t& stream, \
+                      const ccl::reduce_attr& attr, \
+                      const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->reduce_impl( \
             send_buf, recv_buf, count, reduction, root, stream, attr, deps); \
     } \
 \
     ccl::event reduce_scatter(const type& send_buf, \
-                                type& recv_buf, \
-                                size_t recv_count, \
-                                ccl::reduction reduction, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::reduce_scatter_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps) override { \
+                              type& recv_buf, \
+                              size_t recv_count, \
+                              ccl::reduction reduction, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::reduce_scatter_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps) override { \
         return get_impl()->reduce_scatter_impl( \
             send_buf, recv_buf, recv_count, reduction, stream, attr, deps); \
     }
 
-#define DEVICE_COMM_INTERFACE_SPARSE_CLASS_DEFINITION(index_type, value_type) \
+#define COMM_INTERFACE_SPARSE_CLASS_DEFINITION(index_type, value_type) \
 \
     ccl::event sparse_allreduce(const index_type& send_ind_buf, \
-                                  size_t send_ind_count, \
-                                  const value_type& send_val_buf, \
-                                  size_t send_val_count, \
-                                  index_type& recv_ind_buf, \
-                                  size_t recv_ind_count, \
-                                  value_type& recv_val_buf, \
-                                  size_t recv_val_count, \
-                                  ccl::reduction reduction, \
-                                  const ccl::stream::impl_value_t& stream, \
-                                  const ccl::sparse_allreduce_attr& attr, \
-                                  const ccl::vector_class<ccl::event>& deps = {}) override { \
+                                size_t send_ind_count, \
+                                const value_type& send_val_buf, \
+                                size_t send_val_count, \
+                                index_type& recv_ind_buf, \
+                                size_t recv_ind_count, \
+                                value_type& recv_val_buf, \
+                                size_t recv_val_count, \
+                                ccl::reduction reduction, \
+                                const ccl::stream::impl_value_t& stream, \
+                                const ccl::sparse_allreduce_attr& attr, \
+                                const ccl::vector_class<ccl::event>& deps = {}) override { \
         return get_impl()->sparse_allreduce_impl(send_ind_buf, \
                                                  send_ind_count, \
                                                  send_val_buf, \
@@ -740,229 +740,229 @@
 /**
  * Coll implementations
  */
-#define DEVICE_COMM_IMPL_DECLARATION \
+#define COMM_IMPL_DECLARATION \
     ccl::event allgatherv_base_impl(const void* send_buf, \
-                                 size_t send_count, \
-                                 void* recv_buf, \
-                                 const ccl::vector_class<size_t>& recv_counts, \
-                                 ccl::datatype dtype, \
-                                 const ccl::stream::impl_value_t& stream, \
-                                 const ccl_coll_attr& attr, \
-                                 const ccl::vector_class<ccl::event>& deps); \
+                                    size_t send_count, \
+                                    void* recv_buf, \
+                                    const ccl::vector_class<size_t>& recv_counts, \
+                                    ccl::datatype dtype, \
+                                    const ccl::stream::impl_value_t& stream, \
+                                    const ccl_coll_attr& attr, \
+                                    const ccl::vector_class<ccl::event>& deps); \
     ccl::event allgatherv_impl(const void* send_buf, \
-                                 size_t send_count, \
-                                 void* recv_buf, \
-                                 const ccl::vector_class<size_t>& recv_counts, \
-                                 ccl::datatype dtype, \
-                                 const ccl::stream::impl_value_t& stream, \
-                                 const ccl::allgatherv_attr& attr, \
-                                 const ccl::vector_class<ccl::event>& deps); \
+                               size_t send_count, \
+                               void* recv_buf, \
+                               const ccl::vector_class<size_t>& recv_counts, \
+                               ccl::datatype dtype, \
+                               const ccl::stream::impl_value_t& stream, \
+                               const ccl::allgatherv_attr& attr, \
+                               const ccl::vector_class<ccl::event>& deps); \
     ccl::event allgatherv_impl(const void* send_buf, \
-                                 size_t send_count, \
-                                 const ccl::vector_class<void*>& recv_bufs, \
-                                 const ccl::vector_class<size_t>& recv_counts, \
-                                 ccl::datatype dtype, \
-                                 const ccl::stream::impl_value_t& stream, \
-                                 const ccl::allgatherv_attr& attr, \
-                                 const ccl::vector_class<ccl::event>& deps); \
+                               size_t send_count, \
+                               const ccl::vector_class<void*>& recv_bufs, \
+                               const ccl::vector_class<size_t>& recv_counts, \
+                               ccl::datatype dtype, \
+                               const ccl::stream::impl_value_t& stream, \
+                               const ccl::allgatherv_attr& attr, \
+                               const ccl::vector_class<ccl::event>& deps); \
 \
     template <class buffer_type> \
     ccl::event allgatherv_base_impl(const buffer_type* send_buf, \
-                                 size_t send_count, \
-                                 buffer_type* recv_buf, \
-                                 const ccl::vector_class<size_t>& recv_counts, \
-                                 const ccl::stream::impl_value_t& stream, \
-                                 const ccl_coll_attr& attr, \
-                                 const ccl::vector_class<ccl::event>& deps); \
+                                    size_t send_count, \
+                                    buffer_type* recv_buf, \
+                                    const ccl::vector_class<size_t>& recv_counts, \
+                                    const ccl::stream::impl_value_t& stream, \
+                                    const ccl_coll_attr& attr, \
+                                    const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event allgatherv_impl(const buffer_type* send_buf, \
-                                 size_t send_count, \
-                                 buffer_type* recv_buf, \
-                                 const ccl::vector_class<size_t>& recv_counts, \
-                                 const ccl::stream::impl_value_t& stream, \
-                                 const ccl::allgatherv_attr& attr, \
-                                 const ccl::vector_class<ccl::event>& deps); \
+                               size_t send_count, \
+                               buffer_type* recv_buf, \
+                               const ccl::vector_class<size_t>& recv_counts, \
+                               const ccl::stream::impl_value_t& stream, \
+                               const ccl::allgatherv_attr& attr, \
+                               const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event allgatherv_impl(const buffer_type* send_buf, \
-                                 size_t send_count, \
-                                 ccl::vector_class<buffer_type*>& recv_buf, \
-                                 const ccl::vector_class<size_t>& recv_counts, \
-                                 const ccl::stream::impl_value_t& stream, \
-                                 const ccl::allgatherv_attr& attr, \
-                                 const ccl::vector_class<ccl::event>& deps); \
+                               size_t send_count, \
+                               ccl::vector_class<buffer_type*>& recv_buf, \
+                               const ccl::vector_class<size_t>& recv_counts, \
+                               const ccl::stream::impl_value_t& stream, \
+                               const ccl::allgatherv_attr& attr, \
+                               const ccl::vector_class<ccl::event>& deps); \
 \
     ccl::event allreduce_impl(const void* send_buf, \
-                                void* recv_buf, \
-                                size_t count, \
-                                ccl::datatype dtype, \
-                                ccl::reduction reduction, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::allreduce_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              void* recv_buf, \
+                              size_t count, \
+                              ccl::datatype dtype, \
+                              ccl::reduction reduction, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::allreduce_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
 \
     template <class buffer_type> \
     ccl::event allreduce_impl(const buffer_type* send_buf, \
-                                buffer_type* recv_buf, \
-                                size_t count, \
-                                ccl::reduction reduction, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::allreduce_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              buffer_type* recv_buf, \
+                              size_t count, \
+                              ccl::reduction reduction, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::allreduce_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
 \
     ccl::event alltoall_impl(const void* send_buf, \
-                               void* recv_buf, \
-                               size_t count, \
-                               ccl::datatype dtype, \
-                               const ccl::stream::impl_value_t& stream, \
-                               const ccl::alltoall_attr& attr, \
-                               const ccl::vector_class<ccl::event>& deps); \
+                             void* recv_buf, \
+                             size_t count, \
+                             ccl::datatype dtype, \
+                             const ccl::stream::impl_value_t& stream, \
+                             const ccl::alltoall_attr& attr, \
+                             const ccl::vector_class<ccl::event>& deps); \
     ccl::event alltoall_impl(const ccl::vector_class<void*>& send_buf, \
-                               const ccl::vector_class<void*>& recv_buf, \
-                               size_t count, \
-                               ccl::datatype dtype, \
-                               const ccl::stream::impl_value_t& stream, \
-                               const ccl::alltoall_attr& attr, \
-                               const ccl::vector_class<ccl::event>& deps); \
+                             const ccl::vector_class<void*>& recv_buf, \
+                             size_t count, \
+                             ccl::datatype dtype, \
+                             const ccl::stream::impl_value_t& stream, \
+                             const ccl::alltoall_attr& attr, \
+                             const ccl::vector_class<ccl::event>& deps); \
 \
     template <class buffer_type> \
     ccl::event alltoall_impl(const buffer_type* send_buf, \
-                               buffer_type* recv_buf, \
-                               size_t count, \
-                               const ccl::stream::impl_value_t& stream, \
-                               const ccl::alltoall_attr& attr, \
-                               const ccl::vector_class<ccl::event>& deps); \
+                             buffer_type* recv_buf, \
+                             size_t count, \
+                             const ccl::stream::impl_value_t& stream, \
+                             const ccl::alltoall_attr& attr, \
+                             const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event alltoall_impl(const ccl::vector_class<buffer_type*>& send_buf, \
-                               const ccl::vector_class<buffer_type*>& recv_buf, \
-                               size_t count, \
-                               const ccl::stream::impl_value_t& stream, \
-                               const ccl::alltoall_attr& attr, \
-                               const ccl::vector_class<ccl::event>& deps); \
+                             const ccl::vector_class<buffer_type*>& recv_buf, \
+                             size_t count, \
+                             const ccl::stream::impl_value_t& stream, \
+                             const ccl::alltoall_attr& attr, \
+                             const ccl::vector_class<ccl::event>& deps); \
 \
     ccl::event alltoallv_impl(const void* send_buf, \
-                                const ccl::vector_class<size_t>& send_counts, \
-                                void* recv_buf, \
-                                const ccl::vector_class<size_t>& recv_counts, \
-                                ccl::datatype dtype, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::alltoallv_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              const ccl::vector_class<size_t>& send_counts, \
+                              void* recv_buf, \
+                              const ccl::vector_class<size_t>& recv_counts, \
+                              ccl::datatype dtype, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::alltoallv_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
     ccl::event alltoallv_impl(const ccl::vector_class<void*>& send_buf, \
-                                const ccl::vector_class<size_t>& send_counts, \
-                                ccl::vector_class<void*> recv_buf, \
-                                const ccl::vector_class<size_t>& recv_counts, \
-                                ccl::datatype dtype, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::alltoallv_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              const ccl::vector_class<size_t>& send_counts, \
+                              ccl::vector_class<void*> recv_buf, \
+                              const ccl::vector_class<size_t>& recv_counts, \
+                              ccl::datatype dtype, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::alltoallv_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
 \
     template <class buffer_type> \
     ccl::event alltoallv_impl(const ccl::vector_class<buffer_type*>& send_buf, \
-                                const ccl::vector_class<size_t>& send_counts, \
-                                const ccl::vector_class<buffer_type*>& recv_buf, \
-                                const ccl::vector_class<size_t>& recv_counts, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::alltoallv_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              const ccl::vector_class<size_t>& send_counts, \
+                              const ccl::vector_class<buffer_type*>& recv_buf, \
+                              const ccl::vector_class<size_t>& recv_counts, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::alltoallv_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
 \
     template <class buffer_type> \
     ccl::event alltoallv_impl(const buffer_type* send_buf, \
-                                const ccl::vector_class<size_t>& send_counts, \
-                                buffer_type* recv_buf, \
-                                const ccl::vector_class<size_t>& recv_counts, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::alltoallv_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              const ccl::vector_class<size_t>& send_counts, \
+                              buffer_type* recv_buf, \
+                              const ccl::vector_class<size_t>& recv_counts, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::alltoallv_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
 \
     ccl::event broadcast_impl(void* buf, \
-                                size_t count, \
-                                ccl::datatype dtype, \
-                                size_t root, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::broadcast_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              size_t count, \
+                              ccl::datatype dtype, \
+                              int root, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::broadcast_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event broadcast_impl(buffer_type* buf, \
-                                size_t count, \
-                                size_t root, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::broadcast_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              size_t count, \
+                              int root, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::broadcast_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
 \
     ccl::event reduce_impl(const void* send_buf, \
-                             void* recv_buf, \
-                             size_t count, \
-                             ccl::datatype dtype, \
-                             ccl::reduction reduction, \
-                             size_t root, \
-                             const ccl::stream::impl_value_t& stream, \
-                             const ccl::reduce_attr& attr, \
-                             const ccl::vector_class<ccl::event>& deps); \
+                           void* recv_buf, \
+                           size_t count, \
+                           ccl::datatype dtype, \
+                           ccl::reduction reduction, \
+                           int root, \
+                           const ccl::stream::impl_value_t& stream, \
+                           const ccl::reduce_attr& attr, \
+                           const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event reduce_impl(const buffer_type* send_buf, \
-                             buffer_type* recv_buf, \
-                             size_t count, \
-                             ccl::reduction reduction, \
-                             size_t root, \
-                             const ccl::stream::impl_value_t& stream, \
-                             const ccl::reduce_attr& attr, \
-                             const ccl::vector_class<ccl::event>& deps); \
+                           buffer_type* recv_buf, \
+                           size_t count, \
+                           ccl::reduction reduction, \
+                           int root, \
+                           const ccl::stream::impl_value_t& stream, \
+                           const ccl::reduce_attr& attr, \
+                           const ccl::vector_class<ccl::event>& deps); \
 \
     ccl::event reduce_scatter_impl(const void* send_buf, \
-                                     void* recv_buf, \
-                                     size_t recv_count, \
-                                     ccl::datatype dtype, \
-                                     ccl::reduction reduction, \
-                                     const ccl::stream::impl_value_t& stream, \
-                                     const ccl::reduce_scatter_attr& attr, \
-                                     const ccl::vector_class<ccl::event>& deps); \
+                                   void* recv_buf, \
+                                   size_t recv_count, \
+                                   ccl::datatype dtype, \
+                                   ccl::reduction reduction, \
+                                   const ccl::stream::impl_value_t& stream, \
+                                   const ccl::reduce_scatter_attr& attr, \
+                                   const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event reduce_scatter_impl(const buffer_type* send_buf, \
-                                     buffer_type* recv_buf, \
-                                     size_t recv_count, \
-                                     ccl::reduction reduction, \
-                                     const ccl::stream::impl_value_t& stream, \
-                                     const ccl::reduce_scatter_attr& attr, \
-                                     const ccl::vector_class<ccl::event>& deps);
+                                   buffer_type* recv_buf, \
+                                   size_t recv_count, \
+                                   ccl::reduction reduction, \
+                                   const ccl::stream::impl_value_t& stream, \
+                                   const ccl::reduce_scatter_attr& attr, \
+                                   const ccl::vector_class<ccl::event>& deps);
 
-#define DEVICE_COMM_IMPL_SPARSE_DECLARATION \
+#define COMM_IMPL_SPARSE_DECLARATION \
     ccl::event sparse_allreduce_impl(const void* send_ind_buf, \
-                                       size_t send_ind_count, \
-                                       const void* send_val_buf, \
-                                       size_t send_val_count, \
-                                       void* recv_ind_buf, \
-                                       size_t recv_ind_count, \
-                                       void* recv_val_buf, \
-                                       size_t recv_val_count, \
-                                       ccl::datatype index_dtype, \
-                                       ccl::datatype value_dtype, \
-                                       ccl::reduction reduction, \
-                                       const ccl::stream::impl_value_t& stream, \
-                                       const ccl::sparse_allreduce_attr& attr, \
-                                       const ccl::vector_class<ccl::event>& deps); \
+                                     size_t send_ind_count, \
+                                     const void* send_val_buf, \
+                                     size_t send_val_count, \
+                                     void* recv_ind_buf, \
+                                     size_t recv_ind_count, \
+                                     void* recv_val_buf, \
+                                     size_t recv_val_count, \
+                                     ccl::datatype index_dtype, \
+                                     ccl::datatype value_dtype, \
+                                     ccl::reduction reduction, \
+                                     const ccl::stream::impl_value_t& stream, \
+                                     const ccl::sparse_allreduce_attr& attr, \
+                                     const ccl::vector_class<ccl::event>& deps); \
     template <class index_type, class value_type> \
     ccl::event sparse_allreduce_impl(const index_type* send_ind_buf, \
-                                       size_t send_ind_count, \
-                                       const value_type* send_val_buf, \
-                                       size_t send_val_count, \
-                                       index_type* recv_ind_buf, \
-                                       size_t recv_ind_count, \
-                                       value_type* recv_val_buf, \
-                                       size_t recv_val_count, \
-                                       ccl::reduction reduction, \
-                                       const ccl::stream::impl_value_t& stream, \
-                                       const ccl::sparse_allreduce_attr& attr, \
-                                       const ccl::vector_class<ccl::event>& deps);
+                                     size_t send_ind_count, \
+                                     const value_type* send_val_buf, \
+                                     size_t send_val_count, \
+                                     index_type* recv_ind_buf, \
+                                     size_t recv_ind_count, \
+                                     value_type* recv_val_buf, \
+                                     size_t recv_val_count, \
+                                     ccl::reduction reduction, \
+                                     const ccl::stream::impl_value_t& stream, \
+                                     const ccl::sparse_allreduce_attr& attr, \
+                                     const ccl::vector_class<ccl::event>& deps);
 
-#define DEVICE_COMM_IMPL_CLASS_DECLARATION \
+#define COMM_IMPL_CLASS_DECLARATION \
     template <class buffer_type> \
     ccl::event allgatherv_impl(const buffer_type& send_buf, \
-                                 size_t send_count, \
-                                 buffer_type& recv_buf, \
-                                 const ccl::vector_class<size_t>& recv_counts, \
-                                 const ccl::stream::impl_value_t& stream, \
-                                 const ccl::allgatherv_attr& attr, \
-                                 const ccl::vector_class<ccl::event>& deps); \
+                               size_t send_count, \
+                               buffer_type& recv_buf, \
+                               const ccl::vector_class<size_t>& recv_counts, \
+                               const ccl::stream::impl_value_t& stream, \
+                               const ccl::allgatherv_attr& attr, \
+                               const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event allgatherv_impl( \
         const buffer_type& send_buf, \
@@ -975,19 +975,19 @@
 \
     template <class buffer_type> \
     ccl::event allreduce_impl(const buffer_type& send_buf, \
-                                buffer_type& recv_buf, \
-                                size_t count, \
-                                ccl::reduction reduction, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::allreduce_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              buffer_type& recv_buf, \
+                              size_t count, \
+                              ccl::reduction reduction, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::allreduce_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event alltoall_impl(const buffer_type& send_buf, \
-                               buffer_type& recv_buf, \
-                               size_t count, \
-                               const ccl::stream::impl_value_t& stream, \
-                               const ccl::alltoall_attr& attr, \
-                               const ccl::vector_class<ccl::event>& deps); \
+                             buffer_type& recv_buf, \
+                             size_t count, \
+                             const ccl::stream::impl_value_t& stream, \
+                             const ccl::alltoall_attr& attr, \
+                             const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event alltoall_impl( \
         const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf, \
@@ -999,12 +999,12 @@
 \
     template <class buffer_type> \
     ccl::event alltoallv_impl(const buffer_type& send_buf, \
-                                const ccl::vector_class<size_t>& send_counts, \
-                                buffer_type& recv_buf, \
-                                const ccl::vector_class<size_t>& recv_counts, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::alltoallv_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              const ccl::vector_class<size_t>& send_counts, \
+                              buffer_type& recv_buf, \
+                              const ccl::vector_class<size_t>& recv_counts, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::alltoallv_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event alltoallv_impl( \
         const ccl::vector_class<ccl::reference_wrapper_class<buffer_type>>& send_buf, \
@@ -1017,72 +1017,71 @@
 \
     template <class buffer_type> \
     ccl::event broadcast_impl(buffer_type& buf, \
-                                size_t count, \
-                                size_t root, \
-                                const ccl::stream::impl_value_t& stream, \
-                                const ccl::broadcast_attr& attr, \
-                                const ccl::vector_class<ccl::event>& deps); \
+                              size_t count, \
+                              int root, \
+                              const ccl::stream::impl_value_t& stream, \
+                              const ccl::broadcast_attr& attr, \
+                              const ccl::vector_class<ccl::event>& deps); \
     template <class buffer_type> \
     ccl::event reduce_impl(const buffer_type& send_buf, \
-                             buffer_type& recv_buf, \
-                             size_t count, \
-                             ccl::reduction reduction, \
-                             size_t root, \
-                             const ccl::stream::impl_value_t& stream, \
-                             const ccl::reduce_attr& attr, \
-                             const ccl::vector_class<ccl::event>& deps); \
+                           buffer_type& recv_buf, \
+                           size_t count, \
+                           ccl::reduction reduction, \
+                           int root, \
+                           const ccl::stream::impl_value_t& stream, \
+                           const ccl::reduce_attr& attr, \
+                           const ccl::vector_class<ccl::event>& deps); \
 \
     template <class buffer_type> \
     ccl::event reduce_scatter_impl(const buffer_type& send_buf, \
-                                     buffer_type& recv_buf, \
-                                     size_t recv_count, \
-                                     ccl::reduction reduction, \
-                                     const ccl::stream::impl_value_t& stream, \
-                                     const ccl::reduce_scatter_attr& attr, \
-                                     const ccl::vector_class<ccl::event>& deps);
+                                   buffer_type& recv_buf, \
+                                   size_t recv_count, \
+                                   ccl::reduction reduction, \
+                                   const ccl::stream::impl_value_t& stream, \
+                                   const ccl::reduce_scatter_attr& attr, \
+                                   const ccl::vector_class<ccl::event>& deps);
 
-#define DEVICE_COMM_IMPL_SPARSE_CLASS_DECLARATION \
+#define COMM_IMPL_SPARSE_CLASS_DECLARATION \
     template <class index_type, class value_type> \
     ccl::event sparse_allreduce_impl(const index_type& send_ind_buf, \
-                                       size_t send_ind_count, \
-                                       const value_type& send_val_buf, \
-                                       size_t send_val_count, \
-                                       index_type& recv_ind_buf, \
-                                       size_t recv_ind_count, \
-                                       value_type& recv_val_buf, \
-                                       size_t recv_val_count, \
-                                       ccl::reduction reduction, \
-                                       const ccl::stream::impl_value_t& stream, \
-                                       const ccl::sparse_allreduce_attr& attr, \
-                                       const ccl::vector_class<ccl::event>& deps);
+                                     size_t send_ind_count, \
+                                     const value_type& send_val_buf, \
+                                     size_t send_val_count, \
+                                     index_type& recv_ind_buf, \
+                                     size_t recv_ind_count, \
+                                     value_type& recv_val_buf, \
+                                     size_t recv_val_count, \
+                                     ccl::reduction reduction, \
+                                     const ccl::stream::impl_value_t& stream, \
+                                     const ccl::sparse_allreduce_attr& attr, \
+                                     const ccl::vector_class<ccl::event>& deps);
 
 /**
  * Force intantiations
  */
-#define DEVICE_COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(comm_class, type) \
-    template ccl::event comm_class::allgatherv_impl( \
-        const type& send_buf, \
-        size_t send_count, \
-        type& recv_buf, \
-        const ccl::vector_class<size_t>& recv_counts, \
-        const ccl::stream::impl_value_t& stream, \
-        const ccl::allgatherv_attr& attr, \
-        const ccl::vector_class<ccl::event>& deps); \
+#define COMM_INTERFACE_COLL_CLASS_INSTANTIATIONS(comm_class, type) \
+    template ccl::event comm_class::allgatherv_impl(const type& send_buf, \
+                                                    size_t send_count, \
+                                                    type& recv_buf, \
+                                                    const ccl::vector_class<size_t>& recv_counts, \
+                                                    const ccl::stream::impl_value_t& stream, \
+                                                    const ccl::allgatherv_attr& attr, \
+                                                    const ccl::vector_class<ccl::event>& deps); \
 \
     template ccl::event comm_class::allreduce_impl(const type& send_buf, \
-                                                     type& recv_buf, \
-                                                     size_t count, \
-                                                     ccl::reduction reduction, \
-                                                     const ccl::stream::impl_value_t& stream, \
-                                                     const ccl::allreduce_attr& attr, \
-                                                     const ccl::vector_class<ccl::event>& deps); \
+                                                   type& recv_buf, \
+                                                   size_t count, \
+                                                   ccl::reduction reduction, \
+                                                   const ccl::stream::impl_value_t& stream, \
+                                                   const ccl::allreduce_attr& attr, \
+                                                   const ccl::vector_class<ccl::event>& deps); \
 \
     template ccl::event comm_class::alltoall_impl(const type& send_buf, \
-                                                    type& recv_buf, \
-                                                    size_t count, \
-                                                    const ccl::stream::impl_value_t& stream, \
-                                                    const ccl::alltoall_attr& attr, \
-                                                    const ccl::vector_class<ccl::event>& deps); \
+                                                  type& recv_buf, \
+                                                  size_t count, \
+                                                  const ccl::stream::impl_value_t& stream, \
+                                                  const ccl::alltoall_attr& attr, \
+                                                  const ccl::vector_class<ccl::event>& deps); \
 \
     ccl::event alltoall_impl( \
         const ccl::vector_class<ccl::reference_wrapper_class<type>>& send_buf, \
@@ -1110,60 +1109,58 @@
         const ccl::vector_class<ccl::event>& deps); \
 \
     template ccl::event comm_class::broadcast_impl(type& buf, \
-                                                     size_t count, \
-                                                     size_t root, \
-                                                     const ccl::stream::impl_value_t& stream, \
-                                                     const ccl::broadcast_attr& attr, \
-                                                     const ccl::vector_class<ccl::event>& deps); \
+                                                   size_t count, \
+                                                   int root, \
+                                                   const ccl::stream::impl_value_t& stream, \
+                                                   const ccl::broadcast_attr& attr, \
+                                                   const ccl::vector_class<ccl::event>& deps); \
 \
     template ccl::event comm_class::reduce_impl(const type& send_buf, \
-                                                  type& recv_buf, \
-                                                  size_t count, \
-                                                  ccl::reduction reduction, \
-                                                  size_t root, \
-                                                  const ccl::stream::impl_value_t& stream, \
-                                                  const ccl::reduce_attr& attr, \
-                                                  const ccl::vector_class<ccl::event>& deps);
+                                                type& recv_buf, \
+                                                size_t count, \
+                                                ccl::reduction reduction, \
+                                                int root, \
+                                                const ccl::stream::impl_value_t& stream, \
+                                                const ccl::reduce_attr& attr, \
+                                                const ccl::vector_class<ccl::event>& deps);
 
-#define DEVICE_COMM_INTERFACE_COLL_INSTANTIATIONS(comm_class, type) \
+#define COMM_INTERFACE_COLL_INSTANTIATIONS(comm_class, type) \
 \
-    template ccl::event comm_class::allgatherv_impl( \
-        const type* send_buf, \
-        size_t send_count, \
-        type* recv_buf, \
-        const ccl::vector_class<size_t>& recv_counts, \
-        const ccl::stream::impl_value_t& stream, \
-        const ccl::allgatherv_attr& attr, \
-        const ccl::vector_class<ccl::event>& deps); \
-    template ccl::event comm_class::allgatherv_impl( \
-        const type* send_buf, \
-        size_t send_count, \
-        ccl::vector_class<type*>& recv_buf, \
-        const ccl::vector_class<size_t>& recv_counts, \
-        const ccl::stream::impl_value_t& stream, \
-        const ccl::allgatherv_attr& attr, \
-        const ccl::vector_class<ccl::event>& deps); \
-\
-    template ccl::event comm_class::allreduce_impl(const type* send_buf, \
-                                                     type* recv_buf, \
-                                                     size_t count, \
-                                                     ccl::reduction reduction, \
-                                                     const ccl::stream::impl_value_t& stream, \
-                                                     const ccl::allreduce_attr& attr, \
-                                                     const ccl::vector_class<ccl::event>& deps); \
-\
-    template ccl::event comm_class::alltoall_impl(const type* send_buf, \
+    template ccl::event comm_class::allgatherv_impl(const type* send_buf, \
+                                                    size_t send_count, \
                                                     type* recv_buf, \
-                                                    size_t count, \
+                                                    const ccl::vector_class<size_t>& recv_counts, \
                                                     const ccl::stream::impl_value_t& stream, \
-                                                    const ccl::alltoall_attr& attr, \
+                                                    const ccl::allgatherv_attr& attr, \
                                                     const ccl::vector_class<ccl::event>& deps); \
-    template ccl::event comm_class::alltoall_impl(const ccl::vector_class<type*>& send_buf, \
-                                                    const ccl::vector_class<type*>& recv_buf, \
-                                                    size_t count, \
+    template ccl::event comm_class::allgatherv_impl(const type* send_buf, \
+                                                    size_t send_count, \
+                                                    ccl::vector_class<type*>& recv_buf, \
+                                                    const ccl::vector_class<size_t>& recv_counts, \
                                                     const ccl::stream::impl_value_t& stream, \
-                                                    const ccl::alltoall_attr& attr, \
+                                                    const ccl::allgatherv_attr& attr, \
                                                     const ccl::vector_class<ccl::event>& deps); \
+\
+    template ccl::event comm_class::allreduce_impl(const type* send_buf, \
+                                                   type* recv_buf, \
+                                                   size_t count, \
+                                                   ccl::reduction reduction, \
+                                                   const ccl::stream::impl_value_t& stream, \
+                                                   const ccl::allreduce_attr& attr, \
+                                                   const ccl::vector_class<ccl::event>& deps); \
+\
+    template ccl::event comm_class::alltoall_impl(const type* send_buf, \
+                                                  type* recv_buf, \
+                                                  size_t count, \
+                                                  const ccl::stream::impl_value_t& stream, \
+                                                  const ccl::alltoall_attr& attr, \
+                                                  const ccl::vector_class<ccl::event>& deps); \
+    template ccl::event comm_class::alltoall_impl(const ccl::vector_class<type*>& send_buf, \
+                                                  const ccl::vector_class<type*>& recv_buf, \
+                                                  size_t count, \
+                                                  const ccl::stream::impl_value_t& stream, \
+                                                  const ccl::alltoall_attr& attr, \
+                                                  const ccl::vector_class<ccl::event>& deps); \
 \
     template ccl::event comm_class::alltoallv_impl<type>( \
         const type* send_buf, \
@@ -1183,20 +1180,20 @@
         const ccl::vector_class<ccl::event>& deps); \
 \
     template ccl::event comm_class::broadcast_impl(type* buf, \
-                                                     size_t count, \
-                                                     size_t root, \
-                                                     const ccl::stream::impl_value_t& stream, \
-                                                     const ccl::broadcast_attr& attr, \
-                                                     const ccl::vector_class<ccl::event>& deps); \
+                                                   size_t count, \
+                                                   int root, \
+                                                   const ccl::stream::impl_value_t& stream, \
+                                                   const ccl::broadcast_attr& attr, \
+                                                   const ccl::vector_class<ccl::event>& deps); \
 \
     template ccl::event comm_class::reduce_impl(const type* send_buf, \
-                                                  type* recv_buf, \
-                                                  size_t count, \
-                                                  ccl::reduction reduction, \
-                                                  size_t root, \
-                                                  const ccl::stream::impl_value_t& stream, \
-                                                  const ccl::reduce_attr& attr, \
-                                                  const ccl::vector_class<ccl::event>& deps); \
+                                                type* recv_buf, \
+                                                size_t count, \
+                                                ccl::reduction reduction, \
+                                                int root, \
+                                                const ccl::stream::impl_value_t& stream, \
+                                                const ccl::reduce_attr& attr, \
+                                                const ccl::vector_class<ccl::event>& deps); \
     template ccl::event comm_class::reduce_scatter_impl( \
         const type* send_buf, \
         type* recv_buf, \
@@ -1206,8 +1203,7 @@
         const ccl::reduce_scatter_attr& attr, \
         const ccl::vector_class<ccl::event>& deps);
 
-#define DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION( \
-    comm_class, index_type, value_type) \
+#define COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_INSTANTIATION(comm_class, index_type, value_type) \
     template ccl::event comm_class::sparse_allreduce_impl( \
         const index_type* send_ind_buf, \
         size_t send_ind_count, \
@@ -1222,7 +1218,7 @@
         const ccl::sparse_allreduce_attr& attr, \
         const ccl::vector_class<ccl::event>& deps);
 
-#define DEVICE_COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION( \
+#define COMM_INTERFACE_SPARSE_ALLREDUCE_EXPLICIT_CLASS_INSTANTIATION( \
     comm_class, index_type, value_type) \
     template ccl::event comm_class::sparse_allreduce_impl( \
         const index_type& send_ind_buf, \
diff --git a/src/unified_context_impl.hpp b/src/unified_context_impl.hpp
index 326c7c69d..7c979376b 100644
--- a/src/unified_context_impl.hpp
+++ b/src/unified_context_impl.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/log/log.hpp"
 
 namespace ccl {
@@ -27,7 +27,6 @@ namespace ccl {
 #else
 #ifdef MULTI_GPU_SUPPORT
 
-
 #else //MULTI_GPU_SUPPORT
 
 #endif
diff --git a/src/unified_device_impl.hpp b/src/unified_device_impl.hpp
index 3954dcbf4..eb666c811 100644
--- a/src/unified_device_impl.hpp
+++ b/src/unified_device_impl.hpp
@@ -15,8 +15,8 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_types.hpp"
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/types.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/log/log.hpp"
 #include "native_device_api/compiler_ccl_wrappers_dispatcher.hpp"
 
diff --git a/src/unified_event_type.hpp b/src/unified_event_type.hpp
index b1824b0e0..4693682cb 100644
--- a/src/unified_event_type.hpp
+++ b/src/unified_event_type.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/log/log.hpp"
 
 namespace ccl {
diff --git a/src/unified_platform_impl.hpp b/src/unified_platform_impl.hpp
index c8ea3fc0a..1cef1cc3e 100644
--- a/src/unified_platform_impl.hpp
+++ b/src/unified_platform_impl.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/log/log.hpp"
 
 namespace ccl {
diff --git a/src/unified_stream_type.hpp b/src/unified_stream_type.hpp
index 6b903e19e..157fb5155 100644
--- a/src/unified_stream_type.hpp
+++ b/src/unified_stream_type.hpp
@@ -15,7 +15,7 @@
 */
 #pragma once
 
-#include "oneapi/ccl/ccl_type_traits.hpp"
+#include "oneapi/ccl/type_traits.hpp"
 #include "common/log/log.hpp"
 
 namespace ccl {
@@ -23,8 +23,7 @@ namespace ccl {
 #else
 #ifdef MULTI_GPU_SUPPORT
 
-
-#else  //MULTI_GPU_SUPPORT
+#else //MULTI_GPU_SUPPORT
 
 #endif
 #endif
diff --git a/src/unordered_coll/unordered_coll.cpp b/src/unordered_coll/unordered_coll.cpp
index 13608c928..d225f3aba 100644
--- a/src/unordered_coll/unordered_coll.cpp
+++ b/src/unordered_coll/unordered_coll.cpp
@@ -29,18 +29,17 @@ struct ccl_unordered_coll_ctx {
 };
 
 ccl_unordered_coll_manager::ccl_unordered_coll_manager(ccl_comm& parent_comm) {
+    coordination_comm = std::unique_ptr<ccl_comm>(
+        new ccl_comm(parent_comm.rank(),
+                     parent_comm.size(),
+                     ccl::global_data::get().comm_ids->acquire(true /*internal_id_space*/),
+                     parent_comm.atl,
+                     true /*share_resources*/));
 
-       coordination_comm =
-           std::unique_ptr<ccl_comm>(new ccl_comm(parent_comm.rank(),
-                                                  parent_comm.size(),
-                                                  ccl::global_data::get().comm_ids->acquire(true/*internal_id_space*/),
-                                                  parent_comm.atl,
-                                                  true/*share_resources*/));
+    CCL_ASSERT(coordination_comm.get(), "coordination_comm is null");
 
-       CCL_ASSERT(coordination_comm.get(), "coordination_comm is null");
-
-       if (parent_comm.rank() == 0)
-           LOG_INFO("created unordered collectives manager");
+    if (parent_comm.rank() == 0)
+        LOG_INFO("created unordered collectives manager");
 }
 
 ccl_unordered_coll_manager::~ccl_unordered_coll_manager() {
@@ -159,7 +158,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
 
     ccl_coll_param coll_param{};
     coll_param.ctype = ccl_coll_internal;
-    coll_param.dtype = ccl_datatype_char;
+    coll_param.dtype = ccl_datatype_int8;
     coll_param.comm = coordination_comm.get();
 
     std::unique_ptr<ccl_extra_sched> service_sched(
@@ -200,7 +199,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     match_id_size_param.ctype = ccl_coll_bcast;
     match_id_size_param.buf = ccl_buffer(&ctx->match_id_size, sizeof(size_t));
     match_id_size_param.count = sizeof(size_t);
-    match_id_size_param.dtype = ccl_datatype_char;
+    match_id_size_param.dtype = ccl_datatype_int8;
     match_id_size_param.root = CCL_UNORDERED_COLL_COORDINATOR;
     match_id_size_param.comm = coll_param.comm;
     entry_factory::make_entry<coll_entry>(service_sched.get(), match_id_size_param);
@@ -212,7 +211,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     match_id_val_param.ctype = ccl_coll_bcast;
     match_id_val_param.buf = ccl_buffer();
     match_id_val_param.count = 0;
-    match_id_val_param.dtype = ccl_datatype_char;
+    match_id_val_param.dtype = ccl_datatype_int8;
     match_id_val_param.root = CCL_UNORDERED_COLL_COORDINATOR;
     match_id_val_param.comm = coll_param.comm;
     auto entry = entry_factory::make_entry<coll_entry>(service_sched.get(), match_id_val_param);
@@ -227,16 +226,16 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
             }
             ccl_buffer* buf_ptr = (ccl_buffer*)field_ptr;
             buf_ptr->set(ctx->match_id_value, ctx->match_id_size);
-            return ccl_status_success;
+            return ccl::status::success;
         },
         ctx);
 
     entry->set_field_fn<ccl_sched_entry_field_cnt>(
-        [](const void* fn_ctx, void* field_ptr) -> ccl_status_t {
+        [](const void* fn_ctx, void* field_ptr) -> ccl::status {
             auto ctx = static_cast<ccl_unordered_coll_ctx*>(const_cast<void*>(fn_ctx));
             auto count_ptr = static_cast<size_t*>(field_ptr);
             *count_ptr = ctx->match_id_size;
-            return ccl_status_success;
+            return ccl::status::success;
         },
         ctx);
 
@@ -247,7 +246,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     reserved_comm_id_param.ctype = ccl_coll_bcast;
     reserved_comm_id_param.buf = ccl_buffer(&ctx->reserved_comm_id, sizeof(ccl_comm_id_t));
     reserved_comm_id_param.count = sizeof(ccl_comm_id_t);
-    reserved_comm_id_param.dtype = ccl_datatype_char;
+    reserved_comm_id_param.dtype = ccl_datatype_int8;
     reserved_comm_id_param.root = CCL_UNORDERED_COLL_COORDINATOR;
     reserved_comm_id_param.comm = coll_param.comm;
     entry_factory::make_entry<coll_entry>(service_sched.get(), reserved_comm_id_param);
@@ -257,10 +256,10 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     /* 4. start post actions (create communicator and start postponed schedules) */
     entry_factory::make_entry<function_entry>(
         service_sched.get(),
-        [](const void* func_ctx) -> ccl_status_t {
+        [](const void* func_ctx) -> ccl::status {
             auto ctx = static_cast<ccl_unordered_coll_ctx*>(const_cast<void*>(func_ctx));
             ctx->manager->start_post_coordination_actions(ctx);
-            return ccl_status_success;
+            return ccl::status::success;
         },
         ctx);
 
diff --git a/tests/functional/CMakeLists.txt b/tests/functional/CMakeLists.txt
index 08b5be681..f7249e3e7 100644
--- a/tests/functional/CMakeLists.txt
+++ b/tests/functional/CMakeLists.txt
@@ -79,7 +79,7 @@ foreach(src ${sources})
     target_link_libraries(${executable} PUBLIC dl)
     target_link_libraries(${executable} PRIVATE m)
     target_link_libraries(${executable} PUBLIC mpi) 
-    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_TESTS})
+    install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_TESTS} OPTIONAL)
     # FIXME: enable allreduce_custom for non-direct case only
     if(NOT ${executable} STREQUAL "allreduce_custom_test")
         add_test (NAME ${executable} CONFIGURATIONS Default COMMAND mpiexec.hydra -l -n 2 -ppn 1 ${CCL_INSTALL_TESTS}/${executable} --gtest_output=xml:${CCL_INSTALL_TESTS}/${executable}_default_report.junit.xml)
diff --git a/tests/functional/allgatherv_test.cpp b/tests/functional/allgatherv_test.cpp
index 6ebb4e0b3..d560b6360 100644
--- a/tests/functional/allgatherv_test.cpp
+++ b/tests/functional/allgatherv_test.cpp
@@ -52,7 +52,7 @@ class allgatherv_test : public base_test<T> {
             for (size_t elem_idx = 0; elem_idx < param.buffer_count; elem_idx++) {
                 /* each buffer is different size */
                 param.recv_buf[elem_idx].resize(param.elem_count * param.process_count);
-                if (param.test_conf.datatype == DT_BF16) {
+                if (param.test_conf.datatype == DT_BFLOAT16) {
                     param.recv_buf_bf16[elem_idx].resize(param.elem_count * param.process_count);
                 }
             }
@@ -75,7 +75,7 @@ class allgatherv_test : public base_test<T> {
                     if (param.test_conf.place_type == PT_OOP) {
                         param.recv_buf[buf_idx][offsets[elem_idx] + recv_count_idx] =
                             static_cast<T>(SOME_VALUE);
-                        if (param.test_conf.datatype == DT_BF16) {
+                        if (param.test_conf.datatype == DT_BFLOAT16) {
                             param.recv_buf_bf16[buf_idx][offsets[elem_idx] + recv_count_idx] =
                                 static_cast<short>(SOME_VALUE);
                         }
@@ -121,14 +121,14 @@ class allgatherv_test : public base_test<T> {
             send_buf = param.get_send_buf(new_idx);
             recv_buf = param.get_recv_buf(new_idx);
 
-            param.reqs[buf_idx] = ccl::allgatherv(
-                (test_conf.place_type == PT_IN) ? recv_buf : send_buf,
-                count,
-                recv_buf,
-                recv_counts,
-                datatype,
-                GlobalData::instance().comms[0],
-                attr);
+            param.reqs[buf_idx] =
+                ccl::allgatherv((test_conf.place_type == PT_IN) ? recv_buf : send_buf,
+                                count,
+                                recv_buf,
+                                recv_counts,
+                                datatype,
+                                GlobalData::instance().comms[0],
+                                attr);
         }
     }
 };
diff --git a/tests/functional/allreduce_custom_test.cpp b/tests/functional/allreduce_custom_test.cpp
index 22e2c989c..d6798e1fb 100644
--- a/tests/functional/allreduce_custom_test.cpp
+++ b/tests/functional/allreduce_custom_test.cpp
@@ -50,7 +50,7 @@ size_t get_dtype_size(ccl_datatype_t dtype)
 }
 
 template <typename T>
-ccl_status_t do_prologue_T_2x(const void* in_buf, size_t in_count, ccl_datatype_t in_dtype,
+ccl::status do_prologue_T_2x(const void* in_buf, size_t in_count, ccl_datatype_t in_dtype,
                               void** out_buf, size_t* out_count, ccl_datatype_t* out_dtype,
                               const ccl::fn_context* ctx)
 {
@@ -75,11 +75,11 @@ ccl_status_t do_prologue_T_2x(const void* in_buf, size_t in_count, ccl_datatype_
     {
         ((T*)(*out_buf))[buf_idx] = ((T*)in_buf)[buf_idx] * 2;
     }
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename T>
-ccl_status_t do_epilogue_T_2x(const void* in_buf, size_t in_count, ccl_datatype_t in_dtype,
+ccl::status do_epilogue_T_2x(const void* in_buf, size_t in_count, ccl_datatype_t in_dtype,
                               void* out_buf, size_t* out_count, ccl_datatype_t out_dtype,
                               const ccl::fn_context* ctx)
 {
@@ -100,11 +100,11 @@ ccl_status_t do_epilogue_T_2x(const void* in_buf, size_t in_count, ccl_datatype_
         ((T*)out_buf)[buf_idx] = ((T*)in_buf)[buf_idx] * 2;
 
     }
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename T>
-ccl_status_t do_prologue_T_to_char(const void* in_buf, size_t in_count, ccl_datatype_t in_dtype,
+ccl::status do_prologue_T_to_char(const void* in_buf, size_t in_count, ccl_datatype_t in_dtype,
                                    void** out_buf, size_t* out_count, ccl_datatype_t* out_dtype,
                                    const ccl::fn_context* ctx)
 {
@@ -131,11 +131,11 @@ ccl_status_t do_prologue_T_to_char(const void* in_buf, size_t in_count, ccl_data
         int ival = (int)fval;
         ((char*)(*out_buf))[buf_idx] = (char)(ival % 256);
     }
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename T>
-ccl_status_t do_epilogue_char_to_T(const void* in_buf, size_t in_count, ccl_datatype_t in_dtype,
+ccl::status do_epilogue_char_to_T(const void* in_buf, size_t in_count, ccl_datatype_t in_dtype,
                                    void* out_buf, size_t* out_count, ccl_datatype_t out_dtype,
                                    const ccl::fn_context* ctx)
 {
@@ -161,11 +161,11 @@ ccl_status_t do_epilogue_char_to_T(const void* in_buf, size_t in_count, ccl_data
     {
         free((void*)in_buf);
     }
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename T>
-ccl_status_t do_reduction_null(const void* in_buf, size_t in_count, void* inout_buf,
+ccl::status do_reduction_null(const void* in_buf, size_t in_count, void* inout_buf,
                                size_t* out_count, ccl_datatype_t dtype, const ccl::fn_context* ctx)
 {
     size_t buf_idx;
@@ -211,11 +211,11 @@ ccl_status_t do_reduction_null(const void* in_buf, size_t in_count, void* inout_
             ASSERT(0, "unexpected dtype %d", dtype);
             break;
     }
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename T>
-ccl_status_t do_reduction_custom(const void* in_buf, size_t in_count, void* inout_buf,
+ccl::status do_reduction_custom(const void* in_buf, size_t in_count, void* inout_buf,
                                  size_t* out_count, ccl_datatype_t dtype, const ccl::fn_context* ctx)
 {
     size_t buf_idx;
@@ -261,7 +261,7 @@ ccl_status_t do_reduction_custom(const void* in_buf, size_t in_count, void* inou
             ASSERT(0, "unexpected dtype %d", dtype);
             break;
     }
-    return ccl_status_success;
+    return ccl::status::success;
 }
 
 template <typename T>
@@ -527,7 +527,7 @@ template <typename T> class allreduce_custom_test : public base_test <T>
         const ccl_test_conf& test_conf = param.get_conf();
         glob_match_id.resize(param.buffer_count);
 
-        if (test_conf.datatype == DT_BF16)
+        if (test_conf.datatype == DT_BFLOAT16)
         {
             printf("WARNING! BF16 is not supported for custom reduction, test skipped");
             return result;
diff --git a/tests/functional/allreduce_test.cpp b/tests/functional/allreduce_test.cpp
index 1a8c9d82e..daebcb2f3 100644
--- a/tests/functional/allreduce_test.cpp
+++ b/tests/functional/allreduce_test.cpp
@@ -80,14 +80,14 @@ class allreduce_test : public base_test<T> {
             send_buf = param.get_send_buf(new_idx);
             recv_buf = param.get_recv_buf(new_idx);
 
-            param.reqs[buf_idx] = ccl::allreduce(
-                (test_conf.place_type == PT_IN) ? recv_buf : send_buf,
-                recv_buf,
-                count,
-                datatype,
-                reduction,
-                GlobalData::instance().comms[0],
-                attr);
+            param.reqs[buf_idx] =
+                ccl::allreduce((test_conf.place_type == PT_IN) ? recv_buf : send_buf,
+                               recv_buf,
+                               count,
+                               datatype,
+                               reduction,
+                               GlobalData::instance().comms[0],
+                               attr);
         }
     }
 };
diff --git a/tests/functional/alltoall_test.cpp b/tests/functional/alltoall_test.cpp
index b72ce1ac0..57c04d112 100644
--- a/tests/functional/alltoall_test.cpp
+++ b/tests/functional/alltoall_test.cpp
@@ -43,7 +43,7 @@ class alltoall_test : public base_test<T> {
                 param.send_buf[buf_idx][proc_idx] = param.process_idx;
                 if (param.test_conf.place_type == PT_OOP) {
                     param.recv_buf[buf_idx][proc_idx] = static_cast<T>(SOME_VALUE);
-                    if (param.test_conf.datatype == DT_BF16) {
+                    if (param.test_conf.datatype == DT_BFLOAT16) {
                         param.recv_buf_bf16[buf_idx][proc_idx] = static_cast<short>(SOME_VALUE);
                     }
                 }
@@ -77,13 +77,13 @@ class alltoall_test : public base_test<T> {
             send_buf = param.get_send_buf(new_idx);
             recv_buf = param.get_recv_buf(new_idx);
 
-            param.reqs[buf_idx] = ccl::alltoall(
-                (test_conf.place_type == PT_IN) ? recv_buf : send_buf,
-                recv_buf,
-                count,
-                datatype,
-                GlobalData::instance().comms[0],
-                attr);
+            param.reqs[buf_idx] =
+                ccl::alltoall((test_conf.place_type == PT_IN) ? recv_buf : send_buf,
+                              recv_buf,
+                              count,
+                              datatype,
+                              GlobalData::instance().comms[0],
+                              attr);
         }
     }
 };
diff --git a/tests/functional/alltoallv_test.cpp b/tests/functional/alltoallv_test.cpp
index 832d2f774..8cf1341a5 100644
--- a/tests/functional/alltoallv_test.cpp
+++ b/tests/functional/alltoallv_test.cpp
@@ -120,14 +120,14 @@ class alltoallv_test : public base_test<T> {
             send_buf = param.get_send_buf(new_idx);
             recv_buf = param.get_recv_buf(new_idx);
 
-            param.reqs[buf_idx] = ccl::alltoallv(
-                (test_conf.place_type == PT_IN) ? recv_buf : send_buf,
-                send_counts,
-                recv_buf,
-                recv_counts,
-                datatype,
-                GlobalData::instance().comms[0],
-                attr);
+            param.reqs[buf_idx] =
+                ccl::alltoallv((test_conf.place_type == PT_IN) ? recv_buf : send_buf,
+                               send_counts,
+                               recv_buf,
+                               recv_counts,
+                               datatype,
+                               GlobalData::instance().comms[0],
+                               attr);
         }
     }
 };
diff --git a/tests/functional/base.hpp b/tests/functional/base.hpp
index 0c65aaa62..f1ee0967d 100644
--- a/tests/functional/base.hpp
+++ b/tests/functional/base.hpp
@@ -61,7 +61,7 @@ struct typed_test_param {
     std::vector<std::vector<short>> recv_buf_bf16;
 
     std::vector<ccl::event> reqs;
-    std::string match_id;
+    ccl::string_class match_id;
 
     typed_test_param(ccl_test_conf tconf)
             : test_conf(tconf),
@@ -89,14 +89,14 @@ struct typed_test_param {
     void print(std::ostream& output);
 
     void* get_send_buf(size_t buf_idx) {
-        if (test_conf.datatype == DT_BF16)
+        if (test_conf.datatype == DT_BFLOAT16)
             return static_cast<void*>(send_buf_bf16[buf_idx].data());
         else
             return static_cast<void*>(send_buf[buf_idx].data());
     }
 
     void* get_recv_buf(size_t buf_idx) {
-        if (test_conf.datatype == DT_BF16)
+        if (test_conf.datatype == DT_BFLOAT16)
             return static_cast<void*>(recv_buf_bf16[buf_idx].data());
         else
             return static_cast<void*>(recv_buf[buf_idx].data());
@@ -138,21 +138,23 @@ class MainTest : public ::testing ::TestWithParam<ccl_test_conf> {
 public:
     int test(ccl_test_conf& param) {
         switch (param.datatype) {
-            case DT_CHAR: return run<char>(param);
-            case DT_INT:
-                return run<int>(param);
-                //TODO: add additional type to testing
+            case DT_INT8: return run<int8_t>(param);
+            /*case DT_UINT8: return run<uint8_t>(param);*/
+            case DT_INT16: return run<int16_t>(param);
+            /*case DT_UINT16: return run<uint16_t>(param);*/
+            case DT_INT32: return run<int32_t>(param);
+            /*case DT_UINT32: return run<uint32_t>(param);
+            case DT_INT64: return run<int64_t>(param);
+            case DT_UINT64: return run<uint64_t>(param);
+            case DT_FLOAT16: return TEST_SUCCESS;*/
+            case DT_FLOAT32:
+                return run<float>(param);
+                /*case DT_FLOAT64: return run<double>(param);*/
 #ifdef CCL_BF16_COMPILER
-            case DT_BF16: return run<float>(param);
+            case DT_BFLOAT16: return run<float>(param);
 #endif
-            case DT_FLOAT: return run<float>(param);
-            case DT_DOUBLE: return run<double>(param);
-            // case DT_INT64:
-            // return TEST_SUCCESS;
-            // case DT_UINT64:
-            // return TEST_SUCCESS;
             default:
-                EXPECT_TRUE(false) << "Unknown data type: " << param.datatype;
+                EXPECT_TRUE(false) << "Unexpected data type: " << param.datatype;
                 return TEST_FAILURE;
         }
     }
diff --git a/tests/functional/base_bf16.hpp b/tests/functional/base_bf16.hpp
index f41f3cd44..11141adc2 100644
--- a/tests/functional/base_bf16.hpp
+++ b/tests/functional/base_bf16.hpp
@@ -23,7 +23,7 @@
 
 #include "base.hpp"
 
-#define FLOATS_IN_M512  16
+#define FLOATS_IN_M512 16
 #define BF16_SHIFT     16
 #define BF16_PRECISION 0.0390625 // 2^-8
 
diff --git a/tests/functional/base_impl.hpp b/tests/functional/base_impl.hpp
index 40489172c..fd6047226 100644
--- a/tests/functional/base_impl.hpp
+++ b/tests/functional/base_impl.hpp
@@ -22,14 +22,16 @@ template <typename T>
 template <class coll_attr_type>
 void typed_test_param<T>::prepare_coll_attr(coll_attr_type& coll_attr, size_t idx) {
     coll_attr.template set<ccl::operation_attr_id::priority>(generate_priority_value(idx));
-    coll_attr.template set<ccl::operation_attr_id::to_cache>(test_conf.cache_type == CT_CACHE_1 ? true : false);
+    coll_attr.template set<ccl::operation_attr_id::to_cache>(
+        test_conf.cache_type == CT_CACHE_1 ? true : false);
 
     char* test_unordered_coll = getenv("CCL_UNORDERED_COLL");
     if (test_unordered_coll && atoi(test_unordered_coll) == 1) {
         coll_attr.template set<ccl::operation_attr_id::synchronous>(false);
     }
     else {
-        coll_attr.template set<ccl::operation_attr_id::synchronous>(test_conf.sync_type == SNCT_SYNC_1 ? true : false);
+        coll_attr.template set<ccl::operation_attr_id::synchronous>(
+            test_conf.sync_type == SNCT_SYNC_1 ? true : false);
     }
 
     match_id = create_match_id(idx);
@@ -171,7 +173,7 @@ int base_test<T>::check_error(typed_test_param<T>& param,
                               size_t elem_idx) {
     double max_error = 0;
 
-    if (param.test_conf.datatype == DT_BF16) {
+    if (param.test_conf.datatype == DT_BFLOAT16) {
         /* TODO: handle float and double */
 
         // sources https://www.mcs.anl.gov/papers/P4093-0713_1.pdf
@@ -212,7 +214,7 @@ void base_test<T>::alloc_buffers(typed_test_param<T>& param) {
         param.recv_buf[buf_idx].resize(param.elem_count * param.process_count);
     }
 
-    if (param.test_conf.datatype == DT_BF16) {
+    if (param.test_conf.datatype == DT_BFLOAT16) {
         param.send_buf_bf16.resize(param.buffer_count);
         param.recv_buf_bf16.resize(param.buffer_count);
 
@@ -251,7 +253,7 @@ int base_test<T>::run(typed_test_param<T>& param) {
             param.swap_buffers(iter);
             param.define_start_order();
 
-            if (param.test_conf.datatype == DT_BF16) {
+            if (param.test_conf.datatype == DT_BFLOAT16) {
 #ifdef CCL_BF16_COMPILER
                 make_bf16_prologue<T>(param, get_recv_buf_size(param));
 #else
@@ -262,7 +264,7 @@ int base_test<T>::run(typed_test_param<T>& param) {
             run_derived(param);
             param.complete();
 
-            if (param.test_conf.datatype == DT_BF16) {
+            if (param.test_conf.datatype == DT_BFLOAT16) {
 #ifdef CCL_BF16_COMPILER
                 make_bf16_epilogue<T>(param, get_recv_buf_size(param));
 #else
diff --git a/tests/functional/bcast_test.cpp b/tests/functional/bcast_test.cpp
index 9225ce030..6ec573c9d 100644
--- a/tests/functional/bcast_test.cpp
+++ b/tests/functional/bcast_test.cpp
@@ -41,7 +41,7 @@ class bcast_test : public base_test<T> {
                 }
                 else {
                     param.recv_buf[buf_idx][elem_idx] = static_cast<T>(SOME_VALUE);
-                    if (param.test_conf.datatype == DT_BF16) {
+                    if (param.test_conf.datatype == DT_BFLOAT16) {
                         param.recv_buf_bf16[buf_idx][elem_idx] = static_cast<short>(SOME_VALUE);
                     }
                 }
diff --git a/tests/functional/ccl_test_conf.hpp b/tests/functional/ccl_test_conf.hpp
index dbd77985b..ea2dd30c3 100644
--- a/tests/functional/ccl_test_conf.hpp
+++ b/tests/functional/ccl_test_conf.hpp
@@ -118,37 +118,52 @@ std::map<int, const char*> ccl_epilog_type_str = { { ETYPE_NULL, "ETYPE_NULL" },
 };
 
 typedef enum {
-    DT_CHAR = 0,
-    DT_INT,
-    // DT_INT64,
-    // DT_UINT64,
-    DT_FLOAT,
-    DT_DOUBLE,
-    DT_BF16,
+    DT_INT8 = 0,
+    /*DT_UINT8,*/
+    DT_INT16,
+    /*DT_UINT16,*/
+    DT_INT32,
+    /*DT_UINT32,
+    DT_INT64,
+    DT_UINT64,
+    DT_FLOAT16,*/
+    DT_FLOAT32,
+    /*DT_FLOAT64,*/
+    DT_BFLOAT16,
 
     DT_LAST
 } ccl_data_type;
-ccl_data_type first_ccl_data_type = DT_CHAR;
+ccl_data_type first_ccl_data_type = DT_INT8;
 ccl_data_type last_ccl_data_type = DT_LAST;
 
 std::map<int, const char*> ccl_data_type_str = {
-    { DT_CHAR, "DT_CHAR" },
-    { DT_INT, "DT_INT" },
-    // { DT_INT64, "INT64" },
-    // { DT_UINT64, "UINT64" }
-    { DT_FLOAT, "DT_FLOAT" },
-    { DT_DOUBLE, "DT_DOUBLE" },
-    { DT_BF16, "DT_BF16" },
+    { DT_INT8, "DT_INT8" },
+    /*{ DT_UINT8, "DT_UINT8" },*/
+    { DT_INT16, "DT_INT16" },
+    /*{ DT_UINT16, "DT_UINT16" },*/
+    { DT_INT32, "DT_INT32" },
+    /*{ DT_UINT32, "DT_UINT32" },
+    { DT_INT64, "DT_INT64" },
+    { DT_UINT64, "DT_UINT64" },
+    { DT_FLOAT16, "DT_FLOAT16" },*/
+    { DT_FLOAT32, "DT_FLOAT32" },
+    /*{ DT_FLOAT64, "DT_FLOAT64" },*/
+    { DT_BFLOAT16, "DT_BFLOAT16" },
 };
 
 std::map<int, ccl::datatype> ccl_datatype_values = {
-    { DT_CHAR, ccl::datatype::int8 },
-    { DT_INT, ccl::datatype::int32 },
-    // { DT_INT64, ccl::datatype::int64 },
-    // { DT_UINT64, ccl::datatype::uint64 },
-    { DT_FLOAT, ccl::datatype::float32 },
-    { DT_DOUBLE, ccl::datatype::float64 },
-    { DT_BF16, ccl::datatype::bfloat16 },
+    { DT_INT8, ccl::datatype::int8 },
+    /*{ DT_UINT8, ccl::datatype::uint8 },*/
+    { DT_INT16, ccl::datatype::int16 },
+    /*{ DT_UINT16, ccl::datatype::uint16 },*/
+    { DT_INT32, ccl::datatype::int32 },
+    /*{ DT_UINT32, ccl::datatype::uint32 },
+    { DT_INT64, ccl::datatype::int64 },
+    { DT_UINT64, ccl::datatype::uint64 },
+    { DT_FLOAT16, ccl::datatype::float16 },*/
+    { DT_FLOAT32, ccl::datatype::float32 },
+    /*{ DT_FLOAT64, ccl::datatype::float64 },*/
+    { DT_BFLOAT16, ccl::datatype::bfloat16 },
 };
 
 typedef enum {
@@ -270,7 +285,9 @@ ccl::reduction get_ccl_lib_reduction(const ccl_test_conf& test_conf) {
     return ccl_reduction_values[test_conf.reduction];
 }
 
-#define max_test_count() (ORDER_LAST * ORDER_LAST * CMPT_LAST * SNCT_LAST * DT_LAST * ST_LAST * RT_LAST * BC_LAST * CT_LAST * PT_LAST * PTYPE_LAST * ETYPE_LAST)
+#define max_test_count() \
+    (ORDER_LAST * ORDER_LAST * CMPT_LAST * SNCT_LAST * DT_LAST * ST_LAST * RT_LAST * BC_LAST * \
+     CT_LAST * PT_LAST * PTYPE_LAST * ETYPE_LAST)
 
 size_t calculate_test_count() {
     size_t test_count = max_test_count();
@@ -290,7 +307,7 @@ size_t calculate_test_count() {
 
     if (test_data_type_enabled && atoi(test_data_type_enabled) == 0) {
         test_count /= last_ccl_data_type;
-        first_ccl_data_type = static_cast<ccl_data_type>(DT_FLOAT);
+        first_ccl_data_type = static_cast<ccl_data_type>(DT_FLOAT32);
         last_ccl_data_type = static_cast<ccl_data_type>(first_ccl_data_type + 1);
     }
 
@@ -377,7 +394,6 @@ int is_bf16_enabled() {
 std::vector<ccl_test_conf> test_params;
 
 void init_test_params() {
-
     test_params.resize(calculate_test_count());
 
     size_t idx = 0;
@@ -403,7 +419,7 @@ void init_test_params() {
                             for (ccl_data_type data_type = first_ccl_data_type;
                                  data_type < last_ccl_data_type;
                                  data_type++) {
-                                if (data_type == DT_BF16 && !is_bf16_enabled())
+                                if (data_type == DT_BFLOAT16 && !is_bf16_enabled())
                                     continue;
 
                                 for (ccl_completion_type completion_type =
@@ -435,8 +451,7 @@ void init_test_params() {
                                                     test_params[idx].sync_type = sync_type;
                                                     test_params[idx].completion_type =
                                                         completion_type;
-                                                    test_params[idx].reduction =
-                                                        reduction_type;
+                                                    test_params[idx].reduction = reduction_type;
                                                     test_params[idx].buffer_count = buffer_count;
                                                     test_params[idx].start_order_type =
                                                         start_order_type;
diff --git a/tests/functional/reduce_scatter_test.cpp b/tests/functional/reduce_scatter_test.cpp
index 9bf22c929..84193fe62 100644
--- a/tests/functional/reduce_scatter_test.cpp
+++ b/tests/functional/reduce_scatter_test.cpp
@@ -23,12 +23,10 @@ template <typename T>
 class reduce_scatter_test : public base_test<T> {
 public:
     int check(typed_test_param<T>& param) {
-
         size_t my_rank = GlobalData::instance().comms[0].rank();
 
         for (size_t buf_idx = 0; buf_idx < param.buffer_count; buf_idx++) {
             for (size_t elem_idx = 0; elem_idx < param.elem_count; elem_idx++) {
-
                 size_t real_elem_idx = my_rank * param.elem_count + elem_idx;
 
                 if (param.test_conf.reduction == RT_SUM) {
@@ -86,14 +84,14 @@ class reduce_scatter_test : public base_test<T> {
             send_buf = param.get_send_buf(new_idx);
             recv_buf = param.get_recv_buf(new_idx);
 
-            param.reqs[buf_idx] = ccl::reduce_scatter(
-                (test_conf.place_type == PT_IN) ? recv_buf : send_buf,
-                recv_buf,
-                count,
-                datatype,
-                reduction,
-                GlobalData::instance().comms[0],
-                attr);
+            param.reqs[buf_idx] =
+                ccl::reduce_scatter((test_conf.place_type == PT_IN) ? recv_buf : send_buf,
+                                    recv_buf,
+                                    count,
+                                    datatype,
+                                    reduction,
+                                    GlobalData::instance().comms[0],
+                                    attr);
         }
     }
 };
diff --git a/tests/functional/reduce_test.cpp b/tests/functional/reduce_test.cpp
index 86c76d36c..cee3118d7 100644
--- a/tests/functional/reduce_test.cpp
+++ b/tests/functional/reduce_test.cpp
@@ -23,7 +23,6 @@ template <typename T>
 class reduce_test : public base_test<T> {
 public:
     int check(typed_test_param<T>& param) {
-
         if (param.process_idx != ROOT_PROCESS_IDX)
             return TEST_SUCCESS;
 
@@ -84,15 +83,14 @@ class reduce_test : public base_test<T> {
             send_buf = param.get_send_buf(new_idx);
             recv_buf = param.get_recv_buf(new_idx);
 
-            param.reqs[buf_idx] = ccl::reduce(
-                (test_conf.place_type == PT_IN) ? recv_buf : send_buf,
-                recv_buf,
-                count,
-                datatype,
-                reduction,
-                ROOT_PROCESS_IDX,
-                GlobalData::instance().comms[0],
-                attr);
+            param.reqs[buf_idx] = ccl::reduce((test_conf.place_type == PT_IN) ? recv_buf : send_buf,
+                                              recv_buf,
+                                              count,
+                                              datatype,
+                                              reduction,
+                                              ROOT_PROCESS_IDX,
+                                              GlobalData::instance().comms[0],
+                                              attr);
         }
     }
 };
diff --git a/tests/functional/utils.hpp b/tests/functional/utils.hpp
index 7a14262d5..09fd9882e 100644
--- a/tests/functional/utils.hpp
+++ b/tests/functional/utils.hpp
@@ -182,7 +182,7 @@
             MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); \
             gd.kvs = ccl::create_kvs(main_addr); \
         } \
-        gd.comms.push_back(ccl::create_communicator(size, rank, gd.kvs)); \
+        gd.comms.emplace_back(ccl::create_communicator(size, rank, gd.kvs)); \
         PATCH_OUTPUT_NAME_ARG(argc, argv); \
         testing::InitGoogleTest(&argc, argv); \
         int res = RUN_ALL_TESTS(); \
@@ -215,7 +215,9 @@ void print_err_message(char* err_message, std::ostream& output) {
     }
 
     char* arrerr_message = new char[full_message_len];
-    ccl::allgatherv(err_message, message_len, arrerr_message, arr_message_len, comm).wait();
+    ccl::allgatherv(
+        err_message, message_len, arrerr_message, arr_message_len, ccl::datatype::int8, comm)
+        .wait();
 
     if (process_idx == 0) {
         output << arrerr_message;
diff --git a/third-party-programs.txt b/third-party-programs.txt
index 95cddd8b4..ca0aaebe8 100644
--- a/third-party-programs.txt
+++ b/third-party-programs.txt
@@ -1,9 +1,9 @@
-Intel(R) oneAPI Collective Communications Library (oneCCL)
-2021.1-beta10 Third Party Programs File
+Intel(R) oneAPI Collective Communications Library (oneCCL) 
+2021.1.0 Third Party Programs File
 
 This file is the "third-party-programs.txt" file specified in the associated 
-Intel end user license agreement for the Intel software you are licensing. The 
-third party programs and their corresponding required notices and/or license 
+Intel end user license agreement for the Intel software you are licensing.
+Third party programs and their corresponding required notices and/or license 
 terms are listed below.
 
 -------------------------------------------------------------------------------