intel · ct-clmsn · Mar 30, 2025 · Apr 1, 2025 · Apr 1, 2025 · Apr 1, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -50,6 +50,12 @@ elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64")
   endif()
   include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/sse2neon)
   set(DISABLE_WDDM_LINUX TRUE)
+elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64")
+  set(NEO_TARGET_PROCESSOR "riscv64")
+  if(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL ${CMAKE_SYSTEM_PROCESSOR})
+    set(NEO_DISABLE_LD_LLD TRUE)
+    set(NEO_DISABLE_LD_GOLD TRUE)
+  endif()
 endif()
 message(STATUS "Host processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
 message(STATUS "Target processor: ${CMAKE_SYSTEM_PROCESSOR}")
@@ -788,65 +794,71 @@ if(NOT NEO_DISABLE_MITIGATIONS)
       message(WARNING "Spectre mitigation is not supported by the compiler")
     endif()
   else()
-    if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
-      check_cxx_compiler_flag(-mretpoline COMPILER_SUPPORTS_RETPOLINE)
-      if(COMPILER_SUPPORTS_RETPOLINE)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mretpoline")
-      else()
-        message(WARNING "Spectre mitigation -mretpoline flag is not supported by the compiler")
-      endif()
-    else()
-      check_cxx_compiler_flag(-mindirect-branch=thunk COMPILER_SUPPORTS_INDIRECT_BRANCH_THUNK)
-      if(COMPILER_SUPPORTS_INDIRECT_BRANCH_THUNK)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mindirect-branch=thunk")
-      else()
-        message(WARNING "Spectre mitigation -mindirect-branch=thunk flag is not supported by the compiler")
-      endif()
-      check_cxx_compiler_flag(-mfunction-return=thunk COMPILER_SUPPORTS_FUNCTION_RETURN_THUNK)
-      if(COMPILER_SUPPORTS_FUNCTION_RETURN_THUNK)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfunction-return=thunk")
-      else()
-        message(WARNING "Spectre mitigation -mfunction-return=thunk flag is not supported by the compiler")
-      endif()
-      check_cxx_compiler_flag(-mindirect-branch-register COMPILER_SUPPORTS_INDIRECT_BRANCH_REGISTER)
-      if(COMPILER_SUPPORTS_INDIRECT_BRANCH_REGISTER)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mindirect-branch-register")
+    if(NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64")
+      if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+        check_cxx_compiler_flag(-mretpoline COMPILER_SUPPORTS_RETPOLINE)
+        if(COMPILER_SUPPORTS_RETPOLINE)
+          set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mretpoline")
+        else()
+          message(WARNING "Spectre mitigation -mretpoline flag is not supported by the compiler")
+        endif()
       else()
-        message(WARNING "Spectre mitigation -mindirect-branch-register flag is not supported by the compiler")
+        check_cxx_compiler_flag(-mindirect-branch=thunk COMPILER_SUPPORTS_INDIRECT_BRANCH_THUNK)
+        if(COMPILER_SUPPORTS_INDIRECT_BRANCH_THUNK)
+          set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mindirect-branch=thunk")
+        else()
+          message(WARNING "Spectre mitigation -mindirect-branch=thunk flag is not supported by the compiler")
+        endif()
+        check_cxx_compiler_flag(-mfunction-return=thunk COMPILER_SUPPORTS_FUNCTION_RETURN_THUNK)
+        if(COMPILER_SUPPORTS_FUNCTION_RETURN_THUNK)
+          set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfunction-return=thunk")
+        else()
+          message(WARNING "Spectre mitigation -mfunction-return=thunk flag is not supported by the compiler")
+        endif()
+        check_cxx_compiler_flag(-mindirect-branch-register COMPILER_SUPPORTS_INDIRECT_BRANCH_REGISTER)
+        if(COMPILER_SUPPORTS_INDIRECT_BRANCH_REGISTER)
+          set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mindirect-branch-register")
+        else()
+          message(WARNING "Spectre mitigation -mindirect-branch-register flag is not supported by the compiler")
+        endif()
       endif()
     endif()
   endif()
 else()
   message(WARNING "Spectre mitigation DISABLED")
 endif()
 
-if(NOT MSVC)
-  check_cxx_compiler_flag(-msse4.2 COMPILER_SUPPORTS_SSE42)
-  check_cxx_compiler_flag(-mavx2 COMPILER_SUPPORTS_AVX2)
-  check_cxx_compiler_flag(-march=armv8-a+simd COMPILER_SUPPORTS_NEON)
+if(NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64")
+  if(NOT MSVC)
+    check_cxx_compiler_flag(-msse4.2 COMPILER_SUPPORTS_SSE42)
+    check_cxx_compiler_flag(-mavx2 COMPILER_SUPPORTS_AVX2)
+    check_cxx_compiler_flag(-march=armv8-a+simd COMPILER_SUPPORTS_NEON)
+  endif()
 endif()
 
 if(NOT MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftemplate-depth=1024")
 endif()
 
 # intrinsics (_mm_clflushopt and waitpkg) support
-if(NOT MSVC)
-  check_cxx_compiler_flag(-mclflushopt SUPPORTS_CLFLUSHOPT)
-  check_cxx_compiler_flag(-mwaitpkg SUPPORTS_WAITPKG)
-  if(SUPPORTS_CLFLUSHOPT)
+if(NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "riscv64")
+  if(NOT MSVC)
+    check_cxx_compiler_flag(-mclflushopt SUPPORTS_CLFLUSHOPT)
+    check_cxx_compiler_flag(-mwaitpkg SUPPORTS_WAITPKG)
+    if(SUPPORTS_CLFLUSHOPT)
+      add_compile_definitions(SUPPORTS_CLFLUSHOPT)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mclflushopt")
+    endif()
+    if(SUPPORTS_WAITPKG)
+      add_compile_definitions(SUPPORTS_WAITPKG)
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mwaitpkg")
+    else()
+      message(WARNING "-mwaitpkg flag is not supported by the compiler")
+    endif()
+  else()
     add_compile_definitions(SUPPORTS_CLFLUSHOPT)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mclflushopt")
-  endif()
-  if(SUPPORTS_WAITPKG)
     add_compile_definitions(SUPPORTS_WAITPKG)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mwaitpkg")
-  else()
-    message(WARNING "-mwaitpkg flag is not supported by the compiler")
   endif()
-else()
-  add_compile_definitions(SUPPORTS_CLFLUSHOPT)
-  add_compile_definitions(SUPPORTS_WAITPKG)
 endif()
 
 # Compiler warning flags

diff --git a/scripts/packaging/build_deb.sh b/scripts/packaging/build_deb.sh
@@ -162,6 +162,10 @@ EOF
         NEO_SKIP_UNIT_TESTS="TRUE"
         export NEO_DISABLE_BUILTINS_COMPILATION="TRUE"
     fi
+    if [ "${TARGET_ARCH}" == "riscv64" ]; then
+        NEO_SKIP_UNIT_TESTS="TRUE"
+        export NEO_DISABLE_BUILTINS_COMPILATION="TRUE"
+    fi
     export NEO_DISABLE_BUILTINS_COMPILATION
     export NEO_SKIP_UNIT_TESTS
 

diff --git a/scripts/packaging/l0_gpu_driver/build_l0_gpu_driver_deb.sh b/scripts/packaging/l0_gpu_driver/build_l0_gpu_driver_deb.sh
@@ -130,6 +130,10 @@ EOF
         NEO_SKIP_UNIT_TESTS="TRUE"
         export NEO_DISABLE_BUILTINS_COMPILATION="TRUE"
     fi
+    if [ "${TARGET_ARCH}" == "riscv64" ]; then
+        NEO_SKIP_UNIT_TESTS="TRUE"
+        export NEO_DISABLE_BUILTINS_COMPILATION="TRUE"
+    fi
     export NEO_DISABLE_BUILTINS_COMPILATION
     export NEO_SKIP_UNIT_TESTS
 

diff --git a/scripts/packaging/opencl/build_opencl_deb.sh b/scripts/packaging/opencl/build_opencl_deb.sh
@@ -129,6 +129,10 @@ EOF
         NEO_SKIP_UNIT_TESTS="TRUE"
         export NEO_DISABLE_BUILTINS_COMPILATION="TRUE"
     fi
+    if [ "${TARGET_ARCH}" == "riscv64" ]; then
+        NEO_SKIP_UNIT_TESTS="TRUE"
+        export NEO_DISABLE_BUILTINS_COMPILATION="TRUE"
+    fi
     export NEO_DISABLE_BUILTINS_COMPILATION
     export NEO_SKIP_UNIT_TESTS
 

diff --git a/shared/source/helpers/local_id_gen.h b/shared/source/helpers/local_id_gen.h
@@ -47,9 +47,11 @@ inline uint32_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t grfSize, uint32
 }
 
 struct LocalIDHelper {
+#if !defined(__riscv)
     static void (*generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
     static void (*generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
     static void (*generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
+#endif
 
     static LocalIDHelper initializer;
 

diff --git a/shared/source/helpers/riscv64/CMakeLists.txt b/shared/source/helpers/riscv64/CMakeLists.txt
@@ -0,0 +1,14 @@
+#
+# Copyright (C) 2019-2022 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+#
+
+if(${NEO_TARGET_PROCESSOR} STREQUAL "riscv64")
+  list(APPEND NEO_CORE_HELPERS
+       ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
+       ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
+  )
+
+  set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
+endif()
diff --git a/shared/source/helpers/riscv64/local_id_gen.cpp b/shared/source/helpers/riscv64/local_id_gen.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2018-2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/helpers/local_id_gen.h"
+
+#include "shared/source/execution_environment/root_device_environment.h"
+#include "shared/source/helpers/aligned_memory.h"
+#include "shared/source/helpers/gfx_core_helper.h"
+#include "shared/source/helpers/local_id_gen_special.inl"
+#include "shared/source/utilities/cpu_info.h"
+
+namespace NEO {
+
+struct uint16x8_t;
+struct uint16x16_t;
+
+// This is the initial value of SIMD for local ID
+// computation.  It correlates to the SIMD lane.
+// Must be 32byte aligned for AVX2 usage
+ALIGNAS(32)
+const uint16_t initialLocalID[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+// Initialize the lookup table based on CPU capabilities
+LocalIDHelper::LocalIDHelper() {
+}
+
+LocalIDHelper LocalIDHelper::initializer;
+
+void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize, uint32_t grfCount, const RootDeviceEnvironment &rootDeviceEnvironment) {
+    generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
+}
+
+} // namespace NEO
diff --git a/shared/source/helpers/uint16_sse4.h b/shared/source/helpers/uint16_sse4.h
@@ -12,8 +12,11 @@
 #include <cstdint>
 #if defined(__ARM_ARCH)
 #include <sse2neon.h>
-#else
+#elif defined(__x86_64__) || defined(_M_X64)
 #include <immintrin.h>
+#elif defined(__riscv)
+#include <cstring>
+typedef std::uint16_t __attribute__((vector_size(8))) __m128i;
 #endif
 
 namespace NEO {
@@ -24,14 +27,28 @@ struct uint16x8_t { // NOLINT(readability-identifier-naming)
     __m128i value;
 
     uint16x8_t() {
+#if defined(__riscv)
+            std::memset(&value, 0, sizeof(std::uint16_t)*8);
+#else
         value = _mm_setzero_si128();
+#endif
     }
 
+#if defined(__riscv)
+    uint16x8_t(__m128i val) {
+            std::memcpy(&value, &val, sizeof(std::uint16_t)*8);
+    }
+#else
     uint16x8_t(__m128i value) : value(value) {
     }
+#endif
 
     uint16x8_t(uint16_t a) {
+#if defined(__riscv)
+        std::memset(&value, a, sizeof(std::uint16_t)*8);
+#else
         value = _mm_set1_epi16(a); // SSE2
+#endif
     }
 
     explicit uint16x8_t(const void *alignedPtr) {
@@ -57,57 +74,117 @@ struct uint16x8_t { // NOLINT(readability-identifier-naming)
 
     inline void load(const void *alignedPtr) {
         DEBUG_BREAK_IF(!isAligned<16>(alignedPtr));
+#if defined(__riscv)
+        std::memcpy(&value, reinterpret_cast<const __m128i *>(alignedPtr), sizeof(std::uint16_t)*8);
+#else
         value = _mm_load_si128(reinterpret_cast<const __m128i *>(alignedPtr)); // SSE2
+#endif
     }
 
     inline void loadUnaligned(const void *ptr) {
+#if defined(__riscv)
+            std::memcpy(&value, reinterpret_cast<const __m128i *>(ptr), sizeof(std::uint16_t)*8);
+#else
         value = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)); // SSE2
+#endif
     }
 
     inline void store(void *alignedPtr) {
         DEBUG_BREAK_IF(!isAligned<16>(alignedPtr));
+#if defined(__riscv)
+        std::memcpy(alignedPtr, &value, sizeof(std::uint16_t)*8);
+#else
         _mm_store_si128(reinterpret_cast<__m128i *>(alignedPtr), value); // SSE2
+#endif
     }
 
     inline void storeUnaligned(void *ptr) {
+#if defined(__riscv)
+            std::memcpy(ptr, &value, sizeof(std::uint16_t)*8);
+#else
         _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), value); // SSE2
+#endif
     }
 
     inline operator bool() const {
+#if defined(__riscv)
+        unsigned int result = 0;
+        uint16x8_t mask_value = mask();
+        for(unsigned int i = 0; i < 8; ++i) {
+           result += value[i] & mask_value.value[i];
+        }
+        return (result < 1);
+#else
         return _mm_test_all_zeros(value, mask().value) ? false : true; // SSE4.1 alternatives?
+#endif
     }
 
     inline uint16x8_t &operator-=(const uint16x8_t &a) {
+#if defined(__riscv)
+        for(unsigned int i = 0; i < 8; ++i) {
+           value[i] = value[i] - a.value[i];
+        }
+#else
         value = _mm_sub_epi16(value, a.value); // SSE2
+#endif
         return *this;
     }
 
     inline uint16x8_t &operator+=(const uint16x8_t &a) {
+#if defined(__riscv)
+        for(unsigned int i = 0; i < 8; ++i) {
+           value[i] = value[i] + a.value[i];
+        }
+#else
         value = _mm_add_epi16(value, a.value); // SSE2
+#endif
         return *this;
     }
 
     inline friend uint16x8_t operator>=(const uint16x8_t &a, const uint16x8_t &b) {
         uint16x8_t result;
+#if defined(__riscv)
+        std::uint16_t mask = 0;
+        for(unsigned int i = 0; i < 8; ++i) {
+           mask = ( a.value[i] < b.value[i] ) ? 1 : 0;
+           result.value[i] = result.value[i] ^ mask;
+        }
+#else
         result.value =
             _mm_xor_si128(mask().value,
                           _mm_cmplt_epi16(a.value, b.value)); // SSE2
+#endif
         return result;
     }
 
     inline friend uint16x8_t operator&&(const uint16x8_t &a, const uint16x8_t &b) {
         uint16x8_t result;
+#if defined(__riscv)
+        for(unsigned int i = 0; i < 8; ++i) {
+           result.value[i] = a.value[i] & b.value[i];
+        }
+#else
         result.value = _mm_and_si128(a.value, b.value); // SSE2
+#endif
         return result;
     }
 
     // NOTE: uint16x8_t::blend behaves like mask ? a : b
     inline friend uint16x8_t blend(const uint16x8_t &a, const uint16x8_t &b, const uint16x8_t &mask) {
         uint16x8_t result;
-
+#if defined(__riscv)
+        for(unsigned int i = 0; i < 8; ++i) {
+           std::uint8_t mask_values[2] = {  static_cast<uint8_t>((mask.value[i] << 8) >> 8), static_cast<uint8_t>(mask.value[i] >> 8) };
+           std::uint8_t a_values[2] = { static_cast<uint8_t>((a.value[i] << 8) >> 8), static_cast<uint8_t>(a.value[i] >> 8) };
+           std::uint8_t b_values[2] = { static_cast<uint8_t>((b.value[i] << 8) >> 8), static_cast<uint8_t>(b.value[i] >> 8) };
+
+           result.value[i] = (( mask_values[1] ? a_values[1] : b_values[0] ) << 8 ) | ( mask_values[0] ? a_values[0] : b_values[0] );
+        }
+#else
         // Have to swap arguments to get intended calling semantics
         result.value =
             _mm_blendv_epi8(b.value, a.value, mask.value); // SSE4.1 alternatives?
+#endif
         return result;
     }
 };