From 9378b051661f1a5016fad6cdf74f7cc9475766b2 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 23 Apr 2025 03:41:53 -0700 Subject: [PATCH 01/65] Initial implementation of CPUArch dispatcher and unified shared library --- bindings/python/CMakeLists.txt | 67 ++++--- bindings/python/setup.py | 28 +-- bindings/python/src/instantiations.cpp | 19 ++ bindings/python/src/python_bindings.cpp | 2 +- cmake/options.cmake | 2 +- include/svs/core/distance/cosine.h | 40 ++-- include/svs/lib/arch.h | 236 ++++++++++++++++++++++ include/svs/lib/cpuid.h | 253 ++++++++++++++++++++++++ tests/svs/core/distances/cosine.cpp | 5 +- 9 files changed, 592 insertions(+), 60 deletions(-) create mode 100644 bindings/python/src/instantiations.cpp create mode 100644 include/svs/lib/arch.h create mode 100644 include/svs/lib/cpuid.h diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index 495eec2c..f5571355 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -97,45 +97,56 @@ set(CPP_FILES src/vamana_common.cpp src/svs_mkl.cpp ) +# C++ files that are used to instantiate the template classes for each microarchitecture. +set(CPUARCH_CPP_FILES + src/instantiations.cpp +) + +find_package(pybind11 REQUIRED) -# Generate a shared library for each target microarchitecture. +# Generate an object file for each target microarchitecture. +set(OBJECT_FILES "") foreach(MICRO OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) - set(LIB_NAME "_svs_${MICRO}") + set(OBJ_NAME "_svs_${MICRO}") + add_library(${OBJ_NAME} OBJECT ${CPUARCH_CPP_FILES}) - pybind11_add_module(${LIB_NAME} MODULE ${CPP_FILES}) - target_link_libraries(${LIB_NAME} PUBLIC svs::svs) - # Dependency "fmt::fmt" obtained from "svs" - target_link_libraries(${LIB_NAME} PRIVATE svs::compile_options fmt::fmt) + target_link_libraries(${OBJ_NAME} PUBLIC svs::svs) + target_link_libraries(${OBJ_NAME} PRIVATE svs::compile_options fmt::fmt) string(REPLACE "," ";" OPT_FLAGS ${OPT_FLAGS}) message("OPT Flags: ${OPT_FLAGS}") - target_compile_options(${LIB_NAME} PRIVATE ${OPT_FLAGS}) + target_compile_options(${OBJ_NAME} PRIVATE ${OPT_FLAGS} -DSVS_TUNE_TARGET=${MICRO} -DSVS_EXTERNAL_CPUARCH_INSTANCE=1 -fPIC) - # Header files. - target_include_directories( - ${LIB_NAME} - PUBLIC $ - ) + list(APPEND OBJECT_FILES $) +endforeach() - # Comunicate to the C++ library the desired name of the library - target_compile_options(${LIB_NAME} PRIVATE "-DSVS_MODULE_NAME=${LIB_NAME}") +set(LIB_NAME "_svs") +add_library(${LIB_NAME} SHARED ${OBJECT_FILES} ${CPP_FILES}) +target_link_libraries(${LIB_NAME} PRIVATE pybind11::module) +target_link_libraries(${LIB_NAME} PUBLIC svs::svs) +# Dependency "fmt::fmt" obtained from "svs" +target_link_libraries(${LIB_NAME} PRIVATE svs::compile_options fmt::fmt) +# TODO: remove manual specification of base arch optimization flags +target_compile_options(${LIB_NAME} PRIVATE -march=x86-64-v3 -mtune=generic -DSVS_TUNE_TARGET=x86_64_v3) +target_include_directories( + ${LIB_NAME} + PUBLIC $ +) - # If scikit build is running the compilation process, - if(DEFINED SKBUILD) - install(TARGETS ${LIB_NAME} DESTINATION .) +if(DEFINED SKBUILD) + install(TARGETS ${LIB_NAME} DESTINATION .) - # The extension module may need to load build or included libraries when loaded. + # The extension module may need to load build or included libraries when loaded. - # Placing build depedencies in the package and using relative RPATHs that - # don't point outside of the package means that the built package is - # relocatable. This allows for safe binary redistribution. - set_target_properties( - ${LIB_NAME} - PROPERTIES - INSTALL_RPATH "$ORIGIN/${CMAKE_INSTALL_LIBDIR}" - ) - endif() -endforeach() + # Placing build depedencies in the package and using relative RPATHs that + # don't point outside of the package means that the built package is + # relocatable. This allows for safe binary redistribution. + set_target_properties( + ${LIB_NAME} + PROPERTIES + INSTALL_RPATH "$ORIGIN/${CMAKE_INSTALL_LIBDIR}" + ) +endif() if(DEFINED SKBUILD) # Install the manifest JSON file. diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 5d310749..83c46e54 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -29,22 +29,22 @@ def target(arch): return cpu.TARGETS[arch] -# N.B.: cibuildwheel must configure the multi-arch environment variable. -# Also, the micro-architectures defined below should be in order of preference. -if os.environ.get("SVS_MULTIARCH", None) is not None: - svs_microarchs = [ +# TODO: Replace with externally-specified list +svs_microarchs = [ + # "x86_64_v3" # This is the default target for base lib compilation + "broadwell", + "skylake", + "skylake_avx512", "cascadelake", - "x86_64_v3", # conservative base CPU for x86 CPUs. + # TODO: Add support for other architectures (archspec does not support them yet) + # "cooperlake", + # "icelake_server", + "sapphirerapids", + # "graniterapids", + # "graniterapids_d", ] - - # Add the current host to the list of micro-architecture if it doesn't already exist. - last_target = target(svs_microarchs[-1]) - host_name = cpu.host().name - if host_name not in svs_microarchs and target(host_name) < last_target: - svs_microarchs.append(host_name) - - cmake_array = ";".join(svs_microarchs) - cmake_args.append(f"-DSVS_MICROARCHS={cmake_array}") +cmake_array = ";".join(svs_microarchs) +cmake_args.append(f"-DSVS_MICROARCHS={cmake_array}") # Determine the root of the repository base_dir = os.path.relpath(os.path.join(os.path.dirname(__file__), '..', '..')) diff --git a/bindings/python/src/instantiations.cpp b/bindings/python/src/instantiations.cpp new file mode 100644 index 00000000..2dfc8738 --- /dev/null +++ b/bindings/python/src/instantiations.cpp @@ -0,0 +1,19 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "svs/core/distance/cosine.h" + +SVS_INSTANTIATE_CLASS_BY_CPUARCH(svs::distance::CosineSimilarity) diff --git a/bindings/python/src/python_bindings.cpp b/bindings/python/src/python_bindings.cpp index e1ac92b6..8419dd86 100644 --- a/bindings/python/src/python_bindings.cpp +++ b/bindings/python/src/python_bindings.cpp @@ -50,7 +50,7 @@ // The variable allows us to customize the name of the python module to support // micro-architecture versioning. #if !defined(SVS_MODULE_NAME) -#define SVS_MODULE_NAME _svs_native +#define SVS_MODULE_NAME _svs #endif namespace py = pybind11; diff --git a/cmake/options.cmake b/cmake/options.cmake index e374a548..f925d114 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -146,7 +146,7 @@ endif() add_library(svs_native_options INTERFACE) add_library(svs::native_options ALIAS svs_native_options) -target_compile_options(svs_native_options INTERFACE -march=native -mtune=native) +target_compile_options(svs_native_options INTERFACE -DSVS_CPUARCH_NATIVE -march=native -mtune=native) # Use an internal INTERFACE target to apply the same build options to both the # unit test and the compiled binaries. diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 9738e881..157155dd 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -19,6 +19,7 @@ // svs #include "svs/core/distance/distance_core.h" #include "svs/core/distance/simd_utils.h" +#include "svs/lib/arch.h" #include "svs/lib/saveload.h" #include "svs/lib/static.h" @@ -32,7 +33,7 @@ namespace svs::distance { // Forward declare implementation to allow entry point to be near the top. -template struct CosineSimilarityImpl; +template struct CosineSimilarityImpl; // Generic Entry Point // Call as one of either: @@ -41,23 +42,26 @@ template struct CosineSimilarityImpl; // (2) CosineSimilarity::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. +template class CosineSimilarity { public: template static constexpr float compute(const Ea* a, const Eb* b, float a_norm, size_t N) { - return CosineSimilarityImpl::compute( + return CosineSimilarityImpl::compute( a, b, a_norm, lib::MaybeStatic(N) ); } template static constexpr float compute(const Ea* a, const Eb* b, float a_norm) { - return CosineSimilarityImpl::compute( + return CosineSimilarityImpl::compute( a, b, a_norm, lib::MaybeStatic() ); } }; +SVS_DECLARE_CLASS_BY_CPUARCH(CosineSimilarity) + /// /// @brief Functor for computing Cosine Similarity. /// @@ -139,9 +143,17 @@ float compute(DistanceCosineSimilarity distance, std::span a, std::span< assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - return CosineSimilarity::compute(a.data(), b.data(), distance.norm_, a.size()); + SVS_DISPATCH_CLASS_BY_CPUARCH( + CosineSimilarity, + compute, + SVS_PACK_ARGS(a.data(), b.data(), distance.norm_, a.size()) + ); } else { - return CosineSimilarity::compute(a.data(), b.data(), distance.norm_); + SVS_DISPATCH_CLASS_BY_CPUARCH( + CosineSimilarity, + compute, + SVS_PACK_ARGS(a.data(), b.data(), distance.norm_) + ); } } @@ -166,7 +178,7 @@ float generic_cosine_similarity( return result / (a_norm * std::sqrt(accum)); }; -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { static float compute( const Ea* a, const Eb* b, @@ -224,7 +236,7 @@ template <> struct CosineFloatOp<16> : public svs::simd::ConvertToFloat<16> { // Small Integers SVS_VALIDATE_BOOL_ENV(SVS_AVX512_VNNI) #if SVS_AVX512_VNNI -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -250,7 +262,7 @@ template struct CosineSimilarityImpl { } }; -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -278,7 +290,7 @@ template struct CosineSimilarityImpl { #endif // Floating and Mixed Types -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -286,7 +298,7 @@ template struct CosineSimilarityImpl { } }; -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -294,7 +306,7 @@ template struct CosineSimilarityImpl { }; }; -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -302,7 +314,7 @@ template struct CosineSimilarityImpl { }; }; -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -310,7 +322,7 @@ template struct CosineSimilarityImpl { } }; -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -318,7 +330,7 @@ template struct CosineSimilarityImpl { } }; -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h new file mode 100644 index 00000000..74e42a9a --- /dev/null +++ b/include/svs/lib/arch.h @@ -0,0 +1,236 @@ +/* + * Copyright 2023 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "svs/lib/cpuid.h" + +// helper for IDE C++ language support +// #define SVS_CPUARCH_NATIVE 1 + +namespace svs::arch { + +enum class CPUArch { +#if defined(SVS_CPUARCH_NATIVE) + native, +#elif defined(__x86_64__) + // Refer to the GCC docs for the list of targeted architectures: + // https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html + x86_64_v3, + // Q: are aliases needed in future? + haswell = x86_64_v3, + core_avx2 = x86_64_v3, + broadwell, + skylake, + skylake_avx512, + cascadelake, + // TODO: Uncomment once supported on python bindings side + // cooperlake, + // icelake_server, + sapphirerapids, + emeraldrapids = sapphirerapids, + // graniterapids, + // graniterapids_d, +#elif defined(__aarch64__) + neoverse_n1, + neoverse_v1, +#endif + baseline = 0, +}; + +inline bool arch_is_supported(CPUArch arch) { + switch (arch) { +#if defined(SVS_CPUARCH_NATIVE) + case CPUArch::native: + return true; +#elif defined(__x86_64__) + case CPUArch::x86_64_v3: + return check_extensions(std::vector{ + ISAExt::MMX, ISAExt::SSE, ISAExt::SSE2, ISAExt::SSE3, ISAExt::SSSE3, + ISAExt::SSE4_1, ISAExt::SSE4_2, ISAExt::POPCNT, ISAExt::CX16, ISAExt::SAHF, + ISAExt::FXSR, ISAExt::AVX, ISAExt::XSAVE, ISAExt::PCLMUL, ISAExt::FSGSBASE, + ISAExt::RDRND, ISAExt::F16C, ISAExt::AVX2, ISAExt::BMI, ISAExt::BMI2, + ISAExt::LZCNT, ISAExt::FMA, ISAExt::MOVBE + }); + case CPUArch::broadwell: + return arch_is_supported(CPUArch::x86_64_v3) && check_extensions(std::vector{ + ISAExt::RDSEED, ISAExt::ADCX, ISAExt::PREFETCHW + }); + case CPUArch::skylake: + return arch_is_supported(CPUArch::broadwell) && check_extensions(std::vector{ + ISAExt::AES, ISAExt::CLFLUSHOPT, ISAExt::XSAVEC, ISAExt::XSAVES, ISAExt::SGX + }); + case CPUArch::skylake_avx512: + return arch_is_supported(CPUArch::skylake) && check_extensions(std::vector{ + ISAExt::AVX512_F, ISAExt::CLWB, ISAExt::AVX512_VL, ISAExt::AVX512_BW, + ISAExt::AVX512_DQ, ISAExt::AVX512_CD + }); + case CPUArch::cascadelake: + return arch_is_supported(CPUArch::skylake_avx512) && check_extensions(std::vector{ + ISAExt::AVX512_VNNI + }); + // case CPUArch::cooperlake: + // return arch_is_supported(CPUArch::cascadelake) && check_extensions(std::vector{ + // ISAExt::AVX512_BF16 + // }); + // case CPUArch::icelake_server: + // return arch_is_supported(CPUArch::cooperlake) && check_extensions(std::vector{ + // ISAExt::PKU, ISAExt::AVX512_VBMI, ISAExt::AVX512_IFMA, ISAExt::SHA, + // ISAExt::GFNI, ISAExt::VAES, ISAExt::AVX512_VBMI2, ISAExt::VPCLMULQDQ, + // ISAExt::AVX512_BITALG, ISAExt::RDPID, ISAExt::AVX512_VPOPCNTDQ, ISAExt::PCONFIG, + // ISAExt::WBNOINVD, ISAExt::CLWB + // }); + case CPUArch::sapphirerapids: + // return arch_is_supported(CPUArch::icelake_server) && check_extensions(std::vector{ + return arch_is_supported(CPUArch::cascadelake) && check_extensions(std::vector{ + ISAExt::MOVDIRI, ISAExt::MOVDIR64B, ISAExt::ENQCMD, ISAExt::CLDEMOTE, + ISAExt::PTWRITE, ISAExt::WAITPKG, ISAExt::SERIALIZE, ISAExt::TSXLDTRK, + ISAExt::UINTR, ISAExt::AMX_BF16, ISAExt::AMX_TILE, ISAExt::AMX_INT8, + ISAExt::AVX_VNNI, ISAExt::AVX512_FP16, ISAExt::AVX512_BF16 + }); + // case CPUArch::graniterapids: + // return arch_is_supported(CPUArch::sapphirerapids) && check_extensions(std::vector{ + // ISAExt::AMX_FP16, ISAExt::PREFETCHI + // }); + // case CPUArch::graniterapids_d: + // return arch_is_supported(CPUArch::graniterapids) && check_extensions(std::vector{ + // ISAExt::AMX_COMPLEX + // }); +#elif defined(__aarch64__) + // TODO: complete lists of supported extensions + case CPUArch::neoverse_n1: + return check_extension(ISAExt::SVE); + case CPUArch::neoverse_v1: + return arch_is_supported(CPUArch::neoverse_n1) && check_extensions(std::vector{ + ISAExt::SVE2 + }); +#endif + default: + return false; + } +} + +class CPUArchEnvironment { +public: + static CPUArchEnvironment& get_instance() { + // TODO: ensure thread safety + static CPUArchEnvironment instance; + return instance; + } + CPUArch get_cpu_arch() const { + return max_arch_; + } +private: + CPUArchEnvironment() { + const std::vector compiled_archs = { +#if defined(SVS_CPUARCH_NATIVE) + CPUArch::native, +#elif defined(__x86_64__) + // TODO: add support for dynamic list of compiled archs + CPUArch::x86_64_v3, + CPUArch::broadwell, + CPUArch::skylake, + CPUArch::skylake_avx512, + CPUArch::cascadelake, + CPUArch::sapphirerapids, +#endif + }; + compiled_archs_ = compiled_archs; + max_arch_ = CPUArch::baseline; + for (const auto& arch : compiled_archs_) { + if (arch_is_supported(arch)) { + supported_archs_.push_back(arch); + if (static_cast(arch) > static_cast(max_arch_)) { + max_arch_ = arch; + } + } + } + } + + std::vector compiled_archs_; + std::vector supported_archs_; + CPUArch max_arch_; +}; + +#define SVS_PACK_ARGS(...) __VA_ARGS__ +#define SVS_CLASS_METHOD_CPUARCH_CASE(cpuarch, cls, method, args) \ + case svs::arch::CPUArch::cpuarch: \ + return cls::method(args); \ + break; +#if defined(SVS_CPUARCH_NATIVE) + #define SVS_TARGET_CPUARCH svs::arch::CPUArch::native + + #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) template class cls; + + #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ + return cls::method(args); +#elif defined(__x86_64__) + #define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET + + #if defined(SVS_EXTERNAL_CPUARCH_INSTANCE) + #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) + #else + #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) \ + template class cls; \ + extern template class cls; \ + extern template class cls; \ + extern template class cls; \ + extern template class cls; \ + extern template class cls; \ + extern template class cls; + #endif + + #define SVS_INSTANTIATE_CLASS_BY_CPUARCH(cls) template class cls; + + #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ + svs::arch::CPUArch cpu_arch = svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_CPUARCH_CASE(x86_64_v3, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ + } +#elif defined(__aarch64__) + #define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET + + #if defined(SVS_EXTERNAL_CPUARCH_INSTANCE) + #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) + #else + #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) \ + template class cls; \ + extern template class cls; \ + extern template class cls; + #endif + + #define SVS_INSTANTIATE_CLASS_BY_CPUARCH(cls) template class cls; + + #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ + svs::arch::CPUArch cpu_arch = svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_n1, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ + } +#endif + +} // namespace svs::arch diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h new file mode 100644 index 00000000..0621248c --- /dev/null +++ b/include/svs/lib/cpuid.h @@ -0,0 +1,253 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace svs::arch { + +#if defined(__x86_64__) + +enum class ISAExt { + // Common extensions + MMX, SSE, SSE2, SSE3, SSSE3, SSE4_1, SSE4_2, POPCNT, CX16, SAHF, FXSR, + AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, + MOVBE, RDSEED, ADCX, PREFETCHW, AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, + CLWB, PKU, SHA, GFNI, VAES, VPCLMULQDQ, RDPID, PCONFIG, WBNOINVD, MOVDIRI, MOVDIR64B, + ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK, UINTR, PREFETCHI, + + // AVX family + AVX_VNNI, + + // AVX512_ family + AVX512_F, AVX512_VL, AVX512_BW, AVX512_DQ, AVX512_CD, AVX512_VBMI, AVX512_IFMA, AVX512_VNNI, + AVX512_VBMI2, AVX512_BITALG, AVX512_VPOPCNTDQ, AVX512_BF16, AVX512_FP16, + + // AMX family + AMX_BF16, AMX_TILE, AMX_INT8, AMX_FP16, AMX_COMPLEX +}; + +struct CPUIDFlag { + const uint32_t function; // EAX input for CPUID + const uint32_t subfunction; // ECX input for CPUID + const uint32_t reg; // Register index (0=EAX, 1=EBX, 2=ECX, 3=EDX) + const uint32_t bit; // Bit position in the register + const char* name; + + bool get_value() const { + std::array regs{}; + __cpuid_count(function, subfunction, regs[0], regs[1], regs[2], regs[3]); + return (regs[reg] & (1 << bit)) != 0; + } +}; + +inline const std::unordered_map ISAExtInfo = { + // flags are sorted by function, subfunction, register and bit + {ISAExt::MMX, {1, 0, 3, 23, "MMX"}}, + {ISAExt::FXSR, {1, 0, 3, 24, "FXSR"}}, + {ISAExt::SSE, {1, 0, 3, 25, "SSE"}}, + {ISAExt::SSE2, {1, 0, 3, 26, "SSE2"}}, + {ISAExt::SSE3, {1, 0, 2, 0, "SSE3"}}, + {ISAExt::PCLMUL, {1, 0, 2, 1, "PCLMUL"}}, + {ISAExt::SSSE3, {1, 0, 2, 9, "SSSE3"}}, + {ISAExt::FMA, {1, 0, 2, 12, "FMA"}}, + {ISAExt::CX16, {1, 0, 2, 13, "CX16"}}, + {ISAExt::SSE4_1, {1, 0, 2, 19, "SSE4_1"}}, + {ISAExt::SSE4_2, {1, 0, 2, 20, "SSE4_2"}}, + {ISAExt::MOVBE, {1, 0, 2, 22, "MOVBE"}}, + {ISAExt::POPCNT, {1, 0, 2, 23, "POPCNT"}}, + {ISAExt::AES, {1, 0, 2, 25, "AES"}}, + {ISAExt::XSAVE, {1, 0, 2, 26, "XSAVE"}}, + {ISAExt::AVX, {1, 0, 2, 28, "AVX"}}, + {ISAExt::F16C, {1, 0, 2, 29, "F16C"}}, + {ISAExt::RDRND, {1, 0, 2, 30, "RDRND"}}, + {ISAExt::FSGSBASE, {7, 0, 1, 0, "FSGSBASE"}}, + {ISAExt::SGX, {7, 0, 1, 2, "SGX"}}, + {ISAExt::BMI, {7, 0, 1, 3, "BMI"}}, + {ISAExt::AVX2, {7, 0, 1, 5, "AVX2"}}, + {ISAExt::BMI2, {7, 0, 1, 8, "BMI2"}}, + {ISAExt::AVX512_F, {7, 0, 1, 16, "AVX512_F"}}, + {ISAExt::AVX512_DQ, {7, 0, 1, 17, "AVX512_DQ"}}, + {ISAExt::RDSEED, {7, 0, 1, 18, "RDSEED"}}, + {ISAExt::ADCX, {7, 0, 1, 19, "ADCX"}}, + {ISAExt::AVX512_IFMA, {7, 0, 1, 21, "AVX512_IFMA"}}, + {ISAExt::CLFLUSHOPT, {7, 0, 1, 23, "CLFLUSHOPT"}}, + {ISAExt::CLWB, {7, 0, 1, 24, "CLWB"}}, + {ISAExt::AVX512_CD, {7, 0, 1, 28, "AVX512_CD"}}, + {ISAExt::SHA, {7, 0, 1, 29, "SHA"}}, + {ISAExt::AVX512_BW, {7, 0, 1, 30, "AVX512_BW"}}, + {ISAExt::AVX512_VL, {7, 0, 1, 31, "AVX512_VL"}}, + {ISAExt::AVX512_VBMI, {7, 0, 2, 1, "AVX512_VBMI"}}, + {ISAExt::PKU, {7, 0, 2, 3, "PKU"}}, + {ISAExt::WAITPKG, {7, 0, 2, 5, "WAITPKG"}}, + {ISAExt::AVX512_VBMI2, {7, 0, 2, 6, "AVX512_VBMI2"}}, + {ISAExt::GFNI, {7, 0, 2, 8, "GFNI"}}, + {ISAExt::VAES, {7, 0, 2, 9, "VAES"}}, + {ISAExt::VPCLMULQDQ, {7, 0, 2, 10, "VPCLMULQDQ"}}, + {ISAExt::AVX512_VNNI, {7, 0, 2, 11, "AVX512_VNNI"}}, + {ISAExt::AVX512_BITALG, {7, 0, 2, 12, "AVX512_BITALG"}}, + {ISAExt::AVX512_VPOPCNTDQ, {7, 0, 2, 14, "AVX512_VPOPCNTDQ"}}, + {ISAExt::RDPID, {7, 0, 2, 22, "RDPID"}}, + {ISAExt::CLDEMOTE, {7, 0, 2, 25, "CLDEMOTE"}}, + {ISAExt::MOVDIRI, {7, 0, 2, 27, "MOVDIRI"}}, + {ISAExt::MOVDIR64B, {7, 0, 2, 28, "MOVDIR64B"}}, + {ISAExt::ENQCMD, {7, 0, 2, 29, "ENQCMD"}}, + {ISAExt::UINTR, {7, 0, 3, 5, "UINTR"}}, + {ISAExt::SERIALIZE, {7, 0, 3, 14, "SERIALIZE"}}, + {ISAExt::TSXLDTRK, {7, 0, 3, 16, "TSXLDTRK"}}, + {ISAExt::PCONFIG, {7, 0, 3, 18, "PCONFIG"}}, + {ISAExt::AMX_BF16, {7, 0, 3, 22, "AMX_BF16"}}, + {ISAExt::AVX512_FP16, {7, 0, 3, 23, "AVX512_FP16"}}, + {ISAExt::AMX_TILE, {7, 0, 3, 24, "AMX_TILE"}}, + {ISAExt::AMX_INT8, {7, 0, 3, 25, "AMX_INT8"}}, + {ISAExt::AVX_VNNI, {7, 1, 0, 4, "AVX_VNNI"}}, + {ISAExt::AVX512_BF16, {7, 1, 0, 5, "AVX512_BF16"}}, + {ISAExt::AMX_FP16, {7, 1, 0, 21, "AMX_FP16"}}, + {ISAExt::AMX_COMPLEX, {7, 1, 3, 8, "AMX_COMPLEX"}}, + {ISAExt::PREFETCHI, {7, 1, 3, 14, "PREFETCHI"}}, + {ISAExt::XSAVEC, {0xD, 1, 0, 1, "XSAVEC"}}, + {ISAExt::XSAVES, {0xD, 1, 0, 3, "XSAVES"}}, + {ISAExt::PTWRITE, {0x14, 0, 1, 4, "PTWRITE"}}, + {ISAExt::WBNOINVD, {0x80000008, 0, 1, 9, "WBNOINVD"}}, + {ISAExt::SAHF, {0x80000001, 0, 2, 0, "SAHF"}}, + {ISAExt::LZCNT, {0x80000001, 0, 2, 5, "LZCNT"}}, + {ISAExt::PREFETCHW, {0x80000001, 0, 2, 8, "PREFETCHW"}}, +}; + +// if defined(__x86_64__) +#elif defined(__aarch64__) + +// TODO: complete support of __aarch64__ +enum class ISAExt { + SVE, SVE2 +}; + +// Define register ID values for ARM features detection +#define ID_AA64PFR0_EL1 0 +#define ID_AA64PFR1_EL1 1 +#define ID_AA64ISAR0_EL1 2 +#define ID_AA64ISAR1_EL1 3 +#define ID_AA64MMFR0_EL1 4 +#define ID_AA64MMFR1_EL1 5 +#define ID_AA64MMFR2_EL1 6 +#define ID_AA64DFR0_EL1 7 +#define ID_AA64DFR1_EL1 8 +#define ID_AA64ZFR0_EL1 9 + +// Helper template to read system registers with mrs instruction +template +inline uint64_t read_system_reg() { + uint64_t val; + if constexpr (ID == ID_AA64PFR0_EL1) { + asm("mrs %0, id_aa64pfr0_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64PFR1_EL1) { + asm("mrs %0, id_aa64pfr1_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64ISAR0_EL1) { + asm("mrs %0, id_aa64isar0_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64ISAR1_EL1) { + asm("mrs %0, id_aa64isar1_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64MMFR0_EL1) { + asm("mrs %0, id_aa64mmfr0_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64MMFR1_EL1) { + asm("mrs %0, id_aa64mmfr1_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64MMFR2_EL1) { + asm("mrs %0, id_aa64mmfr2_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64DFR0_EL1) { + asm("mrs %0, id_aa64dfr0_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64DFR1_EL1) { + asm("mrs %0, id_aa64dfr1_el1" : "=r" (val)); + } else if constexpr (ID == ID_AA64ZFR0_EL1) { + asm("mrs %0, id_aa64zfr0_el1" : "=r" (val)); + } else { + val = 0; + } + return val; +} + +// Extract bits from register value +inline uint64_t extract_bits(uint64_t val, int pos, int len) { + return (val >> pos) & ((1ULL << len) - 1); +} + +struct MSRFlag { + unsigned int reg_id; // System register ID + int bit_pos; // Bit position in the register + int bit_len; // Number of bits to check + uint64_t expected_val; // Expected value for feature to be present + const char* name; // Feature name + + bool get_value() const { + uint64_t reg_val = 0; + + try { + switch(reg_id) { + case ID_AA64PFR0_EL1: + reg_val = read_system_reg(); + break; + case ID_AA64PFR1_EL1: + reg_val = read_system_reg(); + break; + case ID_AA64ISAR0_EL1: + reg_val = read_system_reg(); + break; + case ID_AA64ISAR1_EL1: + reg_val = read_system_reg(); + break; + case ID_AA64ZFR0_EL1: + // First check if SVE is supported to avoid + if (extract_bits(read_system_reg(), 32, 4) != 0) { + reg_val = read_system_reg(); + } + break; + default: + return false; + } + + return extract_bits(reg_val, bit_pos, bit_len) == expected_val; + } catch (...) { + // If reading the register fails, the feature is not supported + return false; + } + } +}; + +inline const std::unordered_map ISAExtInfo = { + {ISAExt::SVE, {ID_AA64PFR0_EL1, 32, 4, 1, "sve"}}, + {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, +}; + +#endif // if defined(__aarch64__) + +inline bool check_extension(ISAExt ext) { + return ISAExtInfo.at(ext).get_value(); +} + +inline bool check_extensions(std::vector exts) { + for (const auto& ext : exts) { + if (!check_extension(ext)) { + return false; + } + } + return true; +} + +} // namespace svs::arch diff --git a/tests/svs/core/distances/cosine.cpp b/tests/svs/core/distances/cosine.cpp index 8c68ebc1..eb86f5cf 100644 --- a/tests/svs/core/distances/cosine.cpp +++ b/tests/svs/core/distances/cosine.cpp @@ -85,11 +85,12 @@ void test_types(T lo, T hi, size_t num_tests) { // Statically Sized Computation auto a_norm = svs::distance::norm(std::span{a.data(), a.size()}); CATCH_REQUIRE( - (svs::distance::CosineSimilarity::compute(a.data(), b.data(), a_norm) == + // TODO: replace baseline with something else? + (svs::distance::CosineSimilarity::compute(a.data(), b.data(), a_norm) == expected) ); // Dynamically Sized Computation - auto dist = svs::distance::CosineSimilarity::compute(a.data(), b.data(), a_norm, N); + auto dist = svs::distance::CosineSimilarity::compute(a.data(), b.data(), a_norm, N); CATCH_REQUIRE((dist == expected)); } } From d812859c7845e74d1e7c86282ce194ad27eda6b8 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 23 Apr 2025 03:55:45 -0700 Subject: [PATCH 02/65] Fix cpuid header --- include/svs/lib/cpuid.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h index 0621248c..7dafd019 100644 --- a/include/svs/lib/cpuid.h +++ b/include/svs/lib/cpuid.h @@ -19,10 +19,13 @@ #include #include #include -#include #include #include +#if defined(__x86_64__) +#include +#endif + namespace svs::arch { #if defined(__x86_64__) From 08a272e7caba0c6e3509cc2111a7d95000fe41d6 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 23 Apr 2025 08:49:32 -0700 Subject: [PATCH 03/65] Remove dynamic _svs_* loading --- bindings/python/CMakeLists.txt | 4 +- bindings/python/src/svs/__init__.py | 2 +- bindings/python/src/svs/loader.py | 157 +--------------------------- 3 files changed, 3 insertions(+), 160 deletions(-) diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index f5571355..8dcf2cc8 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -102,8 +102,6 @@ set(CPUARCH_CPP_FILES src/instantiations.cpp ) -find_package(pybind11 REQUIRED) - # Generate an object file for each target microarchitecture. set(OBJECT_FILES "") foreach(MICRO OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) @@ -121,7 +119,7 @@ foreach(MICRO OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) endforeach() set(LIB_NAME "_svs") -add_library(${LIB_NAME} SHARED ${OBJECT_FILES} ${CPP_FILES}) +pybind11_add_module(${LIB_NAME} MODULE ${CPP_FILES} ${OBJECT_FILES}) target_link_libraries(${LIB_NAME} PRIVATE pybind11::module) target_link_libraries(${LIB_NAME} PUBLIC svs::svs) # Dependency "fmt::fmt" obtained from "svs" diff --git a/bindings/python/src/svs/__init__.py b/bindings/python/src/svs/__init__.py index dd9948e7..6379826b 100644 --- a/bindings/python/src/svs/__init__.py +++ b/bindings/python/src/svs/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. # Dynamic loading logic. -from .loader import library, current_backend, available_backends +from .loader import library # Reexport all public functions and structs from the inner module. lib = library() diff --git a/bindings/python/src/svs/loader.py b/bindings/python/src/svs/loader.py index 1390cf79..06d057c6 100644 --- a/bindings/python/src/svs/loader.py +++ b/bindings/python/src/svs/loader.py @@ -12,163 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -# dep pre-coms -import archspec.cpu as cpu - -# standard library -import json import importlib -import os -from pathlib import Path - -# Get environment variables for configuring warnings and overriding backend selection. -def _is_quiet(): - """ - Return whether or not backend loading should be "quiet". - In this context, "quiet" means not warning for older architectures. - """ - return os.environ.get("SVS_QUIET", False) - -def _override_backend(): - """ - Return a manual override for the backend. - If no override is set, return `None`. - """ - return os.environ.get("SVS_OVERRIDE_BACKEND", None) - - -# The name of the manifest file. -FLAGS_MANIFEST = "flags_manifest.json" # Keep in-sync with CMakeLists.txt - -def _library_from_suffix(suffix): - return f"._svs_{suffix}" - -def _message_prehook(spec, host = cpu.host()): - """ - Emit any special messages for the given microarchitecture spec. - """ - if _is_quiet(): - return - - if isinstance(spec, str): - spec = cpu.TARGETS[spec] - - import warnings - if spec <= cpu.TARGETS["skylake_avx512"]: - message = f""" - Loading library for an older CPU architecture ({spec}). - Performance may be degraded. - """ - warnings.warn(message, RuntimeWarning) - - if host < spec: - message = """ - Override backend is target for a newer CPU than the one you're currently using. - Application may crash. - """ - warnings.warn(message, RuntimeWarning) - - -# The backend being used for this session -__CURRENT_BACKEND__ = None -def current_backend(): - """ - Return the name of the current backend. - """ - return __CURRENT_BACKEND__ - -def __set_backend_once__(suffix: str, spec): - global __CURRENT_BACKEND__ - if __CURRENT_BACKEND__ == None: - _message_prehook(spec) - __CURRENT_BACKEND__ = str(suffix) - - return current_backend() -# The dynamically loaded module. -__LIBRARY__ = None - -def _load_manifest(): - """ - Determine which shared library to load to supply the C++ extentions. - """ - json_file = Path(__file__).parent / FLAGS_MANIFEST - json_file_alternate = Path(__file__).parent.parent / FLAGS_MANIFEST - - # Try to give a somewhat helpful error message if the JSON manifest file was not - # generated properly by Scikit-build/CMake - if json_file.exists(): - with open(json_file, "r") as io: - return json.load(io) - elif json_file_alternate.exists(): - with open(json_file_alternate, "r") as io: - return json.load(io) - else: - print(Path(str(json_file).replace("ai.similarity-search.gss/", ""))) - raise RuntimeError(f""" - Expected a file {FLAGS_MANIFEST} to exist in the source directory to describe the - attributes of the libraries bundled with this application. - - No such file was found. - - Please report this to the project maintainer! - """) - -def available_backends(): - """ - Return a list of the available backends that where compiled when this module was built. - - Each backend in the list may be used to initialize ``SVS_OVERRIDE_BACKEND`` - environment variable prior to application start to override the default loading logic. - """ - return list(_load_manifest()["libraries"].keys()) - -def _find_library(): - """ - Find the appropriate library to load for this micro architecture. - """ - - # Get the current CPU and the manifest of compiled libraries that ship with this - # library. - host = cpu.host() - manifest = _load_manifest() - - # Respect override requests. - # Down stream loading will fail if the given option doesn't exist. - # - # However, if an override is explicitly given, then we can assume that the use knows - # what they're doing and can respond to a loading failure correctly. - override = _override_backend() - if override is not None: - spec = cpu.TARGETS[manifest["libraries"][override]] - return __set_backend_once__(override, spec) - - # Assume architectures in the manifest are place in order of preference. - # TODO: Revisit this assumption. - for (suffix, microarch) in manifest["libraries"].items(): - # Are we compatible with this micro architecture? - spec = cpu.TARGETS[microarch] - if spec <= host: - return __set_backend_once__(suffix, spec) - - raise RuntimeError(f""" - Could not find a suitable backend for your machine ({host}). - Please contact the project maintainers! - """) - -def __load_module_once__(): - global __LIBRARY__ - if __LIBRARY__ is None: - library_name = _library_from_suffix(_find_library()) - __LIBRARY__ = importlib.import_module(library_name, package = "svs") def library(): - """ - Return the library backend as a module. Dynamically loads the library when first called. - - Dynamically loading the library may trigger warnings related to correctness or - performance. If you really **really** don't want these warnings, they can be suppressed - by defining the environemtn variable ``SVS_QUIET=YES`` prior to application start. - """ - __load_module_once__() - return __LIBRARY__ + return importlib.import_module("._svs", package = "svs") From c7343aa93a12ca9e3269ef9f9996cbac4b7f4a37 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 23 Apr 2025 08:58:36 -0700 Subject: [PATCH 04/65] Remove tests for _svs_* loader --- bindings/python/tests/test_loader.py | 121 --------------------------- 1 file changed, 121 deletions(-) delete mode 100644 bindings/python/tests/test_loader.py diff --git a/bindings/python/tests/test_loader.py b/bindings/python/tests/test_loader.py deleted file mode 100644 index c9abb886..00000000 --- a/bindings/python/tests/test_loader.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Test the dynamic loading logic -import archspec.cpu as cpu -import unittest -import os -import warnings - -import svs.loader as loader - -def set_quiet(): - os.environ["SVS_QUIET"] = "YES" - -def clear_quiet(): - os.environ.pop("SVS_QUIET", None) - -def set_override(override: str): - os.environ["SVS_OVERRIDE_BACKEND"] = override - -def clear_override(): - os.environ.pop("SVS_OVERRIDE_BACKEND", None) - -class LoadingTester(unittest.TestCase): - def __unset_environment_variables__(self): - clear_quiet() - clear_override() - - def tearDown(self): - self.__unset_environment_variables__() - - def test_environment_variables(self): - # Clear the environment variables in question. - self.__unset_environment_variables__() - - # Make sure "is_quiet" behaves correctly. - self.assertFalse(loader._is_quiet()) - set_quiet() - self.assertTrue(loader._is_quiet()) - self.__unset_environment_variables__() - self.assertFalse(loader._is_quiet()) - - # Now, check that "override_backend" works. - self.assertEqual(loader._override_backend(), None) - set_override("hello") - self.assertEqual(loader._override_backend(), "hello") - set_override("north") - self.assertEqual(loader._override_backend(), "north") - clear_override() - self.assertEqual(loader._override_backend(), None) - self.__unset_environment_variables__() - - def test_suffix(self): - self.assertEqual(loader._library_from_suffix("native"), "._svs_native") - self.assertEqual(loader._library_from_suffix("cascadelake"), "._svs_cascadelake") - - def test_available_backends(self): - self.assertGreaterEqual(len(loader.available_backends()), 1) - - def test_manifest(self): - manifest = loader._load_manifest() - self.assertTrue("toolchain" in manifest) - self.assertTrue("libraries" in manifest) - - toolchain = manifest["toolchain"] - self.assertTrue("compiler" in toolchain) - self.assertTrue("compiler_version" in toolchain) - - libraries = manifest["libraries"] - self.assertGreaterEqual(len(libraries), 1) - - def test_message_prehook(self): - # Cause all warnings to always be triggered. - warnings.simplefilter("always") - - # Refer to - # https://docs.python.org/3/library/warnings.html#testing-warnings - # for how to test warnings. - - # Warning for the host being greater than the spec. - spec = cpu.TARGETS["icelake"] - host = cpu.TARGETS["skylake"] - with warnings.catch_warnings(record = True) as w: - loader._message_prehook(spec, host) - self.assertTrue(len(w) == 1) - self.assertTrue(issubclass(w[-1].category, RuntimeWarning)) - self.assertTrue("Override" in str(w[-1].message)) - - # Running again with "quiet" enabled should suppress the warning - set_quiet() - with warnings.catch_warnings(record = True) as w: - loader._message_prehook(spec, host) - self.assertTrue(len(w) == 0) - - # Warning for using an old architecture. - clear_quiet() - archs = ["haswell", "skylake", "skylake_avx512"] - for arch in archs: - with warnings.catch_warnings(record = True) as w: - loader._message_prehook(arch) - # Number of warnings can exceed 1 if running on an older CPU. - # In this latter case, we get a "newer CPU" warning as well. - self.assertTrue(len(w) >= 1) - self.assertTrue(issubclass(w[0].category, RuntimeWarning)) - self.assertTrue("older CPU" in str(w[0].message)) - - def test_loaded(self): - libraries = loader._load_manifest()["libraries"] - self.assertTrue(loader.current_backend() in libraries) - self.assertNotEqual(loader.library(), None) From 56a84e7de2ff62ee5704e0847da9932ff1bcc76c Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 24 Apr 2025 00:32:36 -0700 Subject: [PATCH 05/65] TEMP: enable cmake verbosity options --- bindings/python/setup.py | 2 ++ cmake/options.cmake | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 83c46e54..02d7481c 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -21,6 +21,8 @@ # # This at least lets us have some kind of compatibility with older CPUs. cmake_args = [ + "-DCMAKE_BUILD_TYPE=Debug", + "-DCMAKE_VERBOSE_MAKEFILE=ON", # Export compile commands to allow us to explore compiler flags as needed. "-DCMAKE_EXPORT_COMPILE_COMMANDS=YES", ] diff --git a/cmake/options.cmake b/cmake/options.cmake index f925d114..a7c1863e 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -19,9 +19,13 @@ set(svs_options_cmake_included true) # Default to Release build if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Release) + set(CMAKE_BUILD_TYPE Debug) endif() +# Enable all possible debugging options +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -O0") + ##### ##### Official Options ##### From 6cecfedfe7f89dcd4750ea532d98064f3b5ae662 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 24 Apr 2025 05:49:47 -0700 Subject: [PATCH 06/65] Correct instantiation macros workflow --- bindings/python/src/instantiations.cpp | 2 +- include/svs/core/distance/cosine.h | 14 +++++++++-- include/svs/lib/arch.h | 34 +++++--------------------- 3 files changed, 19 insertions(+), 31 deletions(-) diff --git a/bindings/python/src/instantiations.cpp b/bindings/python/src/instantiations.cpp index 2dfc8738..a1b3de39 100644 --- a/bindings/python/src/instantiations.cpp +++ b/bindings/python/src/instantiations.cpp @@ -16,4 +16,4 @@ #include "svs/core/distance/cosine.h" -SVS_INSTANTIATE_CLASS_BY_CPUARCH(svs::distance::CosineSimilarity) +SVS_INSTANTIATE_COSINE_DISTANCE_BY_CPUARCH diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 157155dd..4c5bd90d 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -60,8 +60,6 @@ class CosineSimilarity { } }; -SVS_DECLARE_CLASS_BY_CPUARCH(CosineSimilarity) - /// /// @brief Functor for computing Cosine Similarity. /// @@ -339,4 +337,16 @@ template struct CosineSimilarityImpl; - #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ return cls::method(args); #elif defined(__x86_64__) #define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET - #if defined(SVS_EXTERNAL_CPUARCH_INSTANCE) - #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) - #else - #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) \ - template class cls; \ - extern template class cls; \ - extern template class cls; \ - extern template class cls; \ - extern template class cls; \ - extern template class cls; \ - extern template class cls; - #endif - - #define SVS_INSTANTIATE_CLASS_BY_CPUARCH(cls) template class cls; - #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ svs::arch::CPUArch cpu_arch = svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ switch (cpu_arch) { \ @@ -211,17 +194,6 @@ class CPUArchEnvironment { #elif defined(__aarch64__) #define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET - #if defined(SVS_EXTERNAL_CPUARCH_INSTANCE) - #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) - #else - #define SVS_DECLARE_CLASS_BY_CPUARCH(cls) \ - template class cls; \ - extern template class cls; \ - extern template class cls; - #endif - - #define SVS_INSTANTIATE_CLASS_BY_CPUARCH(cls) template class cls; - #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ svs::arch::CPUArch cpu_arch = svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ switch (cpu_arch) { \ @@ -233,4 +205,10 @@ class CPUArchEnvironment { } #endif +#define SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH(return_type, cls, method, template_args, args) \ + template return_type cls::method(args); +// Distance-specific dispatching macros +#define SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(cls, a_type, b_type) \ + SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) + } // namespace svs::arch From edd5c9b886b2532952fbda58c1d522da319f169c Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 24 Apr 2025 06:47:44 -0700 Subject: [PATCH 07/65] Extend support to L2 and IP distances --- bindings/python/src/instantiations.cpp | 4 ++ include/svs/core/distance/cosine.h | 16 ++--- include/svs/core/distance/euclidean.h | 62 +++++++++++++------ include/svs/core/distance/inner_product.h | 62 +++++++++++++------ include/svs/lib/arch.h | 5 +- .../svs/core/distances/distance_euclidean.cpp | 4 +- tests/svs/core/distances/inner_product.cpp | 4 +- 7 files changed, 104 insertions(+), 53 deletions(-) diff --git a/bindings/python/src/instantiations.cpp b/bindings/python/src/instantiations.cpp index a1b3de39..894111c0 100644 --- a/bindings/python/src/instantiations.cpp +++ b/bindings/python/src/instantiations.cpp @@ -15,5 +15,9 @@ */ #include "svs/core/distance/cosine.h" +#include "svs/core/distance/inner_product.h" +#include "svs/core/distance/euclidean.h" SVS_INSTANTIATE_COSINE_DISTANCE_BY_CPUARCH +SVS_INSTANTIATE_L2_DISTANCE_BY_CPUARCH +SVS_INSTANTIATE_IP_DISTANCE_BY_CPUARCH diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 4c5bd90d..30cc5b2c 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -340,13 +340,13 @@ template struct CosineSimilarityImpl struct L2Impl; +template struct L2Impl; // Generic Entry Point // Call as one of either: @@ -80,16 +81,17 @@ template struct L2Impl; // (2) L2::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. +template class L2 { public: template static constexpr float compute(const Ea* a, const Eb* b, size_t N) { - return L2Impl::compute(a, b, lib::MaybeStatic(N)); + return L2Impl::compute(a, b, lib::MaybeStatic(N)); } template static constexpr float compute(const Ea* a, const Eb* b) { - return L2Impl::compute(a, b, lib::MaybeStatic()); + return L2Impl::compute(a, b, lib::MaybeStatic()); } }; @@ -155,9 +157,17 @@ float compute(DistanceL2 /*unused*/, std::span a, std::span b) { assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - return L2::compute(a.data(), b.data(), a.size()); + SVS_DISPATCH_CLASS_BY_CPUARCH( + L2, + compute, + SVS_PACK_ARGS(a.data(), b.data(), a.size()) + ); } else { - return L2::compute(a.data(), b.data()); + SVS_DISPATCH_CLASS_BY_CPUARCH( + L2, + compute, + SVS_PACK_ARGS(a.data(), b.data()) + ); } } @@ -177,7 +187,7 @@ float generic_l2( return result; } -template struct L2Impl { +template struct L2Impl { static constexpr float compute(const Ea* a, const Eb* b, lib::MaybeStatic length = lib::MaybeStatic()) { return generic_l2(a, b, length); @@ -252,14 +262,14 @@ template <> struct L2VNNIOp : public svs::simd::ConvertForVNNI struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2VNNIOp(), a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2VNNIOp(), a, b, length); @@ -269,42 +279,42 @@ template struct L2Impl { #endif // Floating and Mixed Types -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); }; }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); }; }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); @@ -320,7 +330,7 @@ template struct L2Impl { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -340,7 +350,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -362,7 +372,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -383,7 +393,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -407,7 +417,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -434,7 +444,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -462,4 +472,16 @@ template struct L2Impl { }; #endif + +// NOTE: dispatching doesn't work for other L2 instances than the listed below. +#define SVS_INSTANTIATE_L2_DISTANCE_BY_CPUARCH \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, int8_t, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, uint8_t, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, float) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, svs::float16::Float16) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, svs::float16::Float16, float) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, svs::float16::Float16, svs::float16::Float16) + } // namespace svs::distance diff --git a/include/svs/core/distance/inner_product.h b/include/svs/core/distance/inner_product.h index 2ad51e17..873e8f09 100644 --- a/include/svs/core/distance/inner_product.h +++ b/include/svs/core/distance/inner_product.h @@ -19,6 +19,7 @@ // svs #include "svs/core/distance/distance_core.h" #include "svs/core/distance/simd_utils.h" +#include "svs/lib/arch.h" #include "svs/lib/float16.h" #include "svs/lib/preprocessor.h" #include "svs/lib/saveload.h" @@ -32,7 +33,7 @@ namespace svs::distance { // Forward declare implementation to allow entry point to be near the top. -template struct IPImpl; +template struct IPImpl; // Generic Entry Point // Call as one of either: @@ -41,16 +42,17 @@ template struct IPImpl; // (2) IP::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. +template class IP { public: template static constexpr float compute(const Ea* a, const Eb* b, size_t N) { - return IPImpl::compute(a, b, lib::MaybeStatic(N)); + return IPImpl::compute(a, b, lib::MaybeStatic(N)); } template static constexpr float compute(const Ea* a, const Eb* b) { - return IPImpl::compute(a, b, lib::MaybeStatic()); + return IPImpl::compute(a, b, lib::MaybeStatic()); } }; @@ -117,9 +119,17 @@ float compute(DistanceIP /*unused*/, std::span a, std::span b) { assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - return IP::compute(a.data(), b.data(), a.size()); + SVS_DISPATCH_CLASS_BY_CPUARCH( + IP, + compute, + SVS_PACK_ARGS(a.data(), b.data(), a.size()) + ); } else { - return IP::compute(a.data(), b.data()); + SVS_DISPATCH_CLASS_BY_CPUARCH( + IP, + compute, + SVS_PACK_ARGS(a.data(), b.data()) + ); } } @@ -138,7 +148,7 @@ float generic_ip( return result; } -template struct IPImpl { +template struct IPImpl { static float compute(const Ea* a, const Eb* b, lib::MaybeStatic length = lib::MaybeStatic()) { return generic_ip(a, b, length); @@ -207,14 +217,14 @@ template <> struct IPVNNIOp : public svs::simd::ConvertForVNNI struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(IPVNNIOp(), a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(IPVNNIOp(), a, b, length); @@ -224,42 +234,42 @@ template struct IPImpl { #endif // Floating and Mixed Types -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); }; }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); }; }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); @@ -274,7 +284,7 @@ template struct IPImpl { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -293,7 +303,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -314,7 +324,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -334,7 +344,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -357,7 +367,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -383,7 +393,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -410,4 +420,16 @@ template struct IPImpl { }; #endif + +// NOTE: dispatching doesn't work for other IP instances than the listed below. +#define SVS_INSTANTIATE_IP_DISTANCE_BY_CPUARCH \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, int8_t, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, uint8_t, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, float) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, svs::float16::Float16) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, svs::float16::Float16, float) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, svs::float16::Float16, svs::float16::Float16) + } // namespace svs::distance diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 31137725..fd6f0475 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -207,8 +207,11 @@ class CPUArchEnvironment { #define SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH(return_type, cls, method, template_args, args) \ template return_type cls::method(args); -// Distance-specific dispatching macros +// Generic distance dispatching macro #define SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(cls, a_type, b_type) \ + SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) +// Cosine distance dispatching macro +#define SVS_INST_COSINE_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(cls, a_type, b_type) \ SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) } // namespace svs::arch diff --git a/tests/svs/core/distances/distance_euclidean.cpp b/tests/svs/core/distances/distance_euclidean.cpp index 88c23fe4..8d68cb35 100644 --- a/tests/svs/core/distances/distance_euclidean.cpp +++ b/tests/svs/core/distances/distance_euclidean.cpp @@ -68,9 +68,9 @@ void test_types(T lo, T hi, size_t num_tests) { auto expected = Catch::Approx(euclidean_reference(a, b)); // Statically Sized Computation - CATCH_REQUIRE((svs::distance::L2::compute(a.data(), b.data()) == expected)); + CATCH_REQUIRE((svs::distance::L2::compute(a.data(), b.data()) == expected)); // Dynamically Sized Computation - CATCH_REQUIRE((svs::distance::L2::compute(a.data(), b.data(), N) == expected)); + CATCH_REQUIRE((svs::distance::L2::compute(a.data(), b.data(), N) == expected)); } } } // namespace diff --git a/tests/svs/core/distances/inner_product.cpp b/tests/svs/core/distances/inner_product.cpp index a074a058..c3046d1d 100644 --- a/tests/svs/core/distances/inner_product.cpp +++ b/tests/svs/core/distances/inner_product.cpp @@ -76,9 +76,9 @@ void test_types(T lo, T hi, size_t num_tests) { .margin(INNERPRODUCT_MARGIN); // Statically Sized Computation - CATCH_REQUIRE((svs::distance::IP::compute(a.data(), b.data()) == expected)); + CATCH_REQUIRE((svs::distance::IP::compute(a.data(), b.data()) == expected)); // Dynamically Sized Computation - CATCH_REQUIRE((svs::distance::IP::compute(a.data(), b.data(), N) == expected)); + CATCH_REQUIRE((svs::distance::IP::compute(a.data(), b.data(), N) == expected)); } } } // anonymous namespace From 5ae48617590fa76b0baefdf332316a39ef4cd5e5 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 24 Apr 2025 06:47:56 -0700 Subject: [PATCH 08/65] Revert "TEMP: enable cmake verbosity options" This reverts commit 9eb7a4bb38b238da57b214ca14829f8c31003fdb. --- bindings/python/setup.py | 2 -- cmake/options.cmake | 6 +----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 02d7481c..83c46e54 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -21,8 +21,6 @@ # # This at least lets us have some kind of compatibility with older CPUs. cmake_args = [ - "-DCMAKE_BUILD_TYPE=Debug", - "-DCMAKE_VERBOSE_MAKEFILE=ON", # Export compile commands to allow us to explore compiler flags as needed. "-DCMAKE_EXPORT_COMPILE_COMMANDS=YES", ] diff --git a/cmake/options.cmake b/cmake/options.cmake index a7c1863e..f925d114 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -19,13 +19,9 @@ set(svs_options_cmake_included true) # Default to Release build if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Debug) + set(CMAKE_BUILD_TYPE Release) endif() -# Enable all possible debugging options -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -O0") -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -O0") - ##### ##### Official Options ##### From c8c9190057317fd5275d34f554257335c8966237 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Fri, 25 Apr 2025 07:21:16 -0700 Subject: [PATCH 09/65] Extend x86_64 archs check --- include/svs/lib/arch.h | 100 ++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 41 deletions(-) diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index fd6f0475..20322ca8 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -29,21 +29,24 @@ enum class CPUArch { #elif defined(__x86_64__) // Refer to the GCC docs for the list of targeted architectures: // https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html - x86_64_v3, - // Q: are aliases needed in future? - haswell = x86_64_v3, - core_avx2 = x86_64_v3, + nehalem, + x86_64_v2 = nehalem, + westmere, + sandybridge, + ivybridge, + haswell, + x86_64_v3 = haswell, broadwell, skylake, skylake_avx512, + x86_64_v4 = skylake_avx512, cascadelake, - // TODO: Uncomment once supported on python bindings side - // cooperlake, - // icelake_server, + cooperlake, + icelake_server, sapphirerapids, emeraldrapids = sapphirerapids, - // graniterapids, - // graniterapids_d, + graniterapids, + graniterapids_d, #elif defined(__aarch64__) neoverse_n1, neoverse_v1, @@ -57,16 +60,30 @@ inline bool arch_is_supported(CPUArch arch) { case CPUArch::native: return true; #elif defined(__x86_64__) - case CPUArch::x86_64_v3: + case CPUArch::nehalem: return check_extensions(std::vector{ ISAExt::MMX, ISAExt::SSE, ISAExt::SSE2, ISAExt::SSE3, ISAExt::SSSE3, ISAExt::SSE4_1, ISAExt::SSE4_2, ISAExt::POPCNT, ISAExt::CX16, ISAExt::SAHF, - ISAExt::FXSR, ISAExt::AVX, ISAExt::XSAVE, ISAExt::PCLMUL, ISAExt::FSGSBASE, - ISAExt::RDRND, ISAExt::F16C, ISAExt::AVX2, ISAExt::BMI, ISAExt::BMI2, - ISAExt::LZCNT, ISAExt::FMA, ISAExt::MOVBE + ISAExt::FXSR + }); + case CPUArch::westmere: + return arch_is_supported(CPUArch::nehalem) && check_extensions(std::vector{ + ISAExt::PCLMUL + }); + case CPUArch::sandybridge: + return arch_is_supported(CPUArch::westmere) && check_extensions(std::vector{ + ISAExt::AVX, ISAExt::XSAVE + }); + case CPUArch::ivybridge: + return arch_is_supported(CPUArch::sandybridge) && check_extensions(std::vector{ + ISAExt::FSGSBASE, ISAExt::RDRND, ISAExt::F16C + }); + case CPUArch::haswell: + return arch_is_supported(CPUArch::ivybridge) && check_extensions(std::vector{ + ISAExt::AVX2, ISAExt::BMI, ISAExt::BMI2, ISAExt::LZCNT, ISAExt::FMA, ISAExt::MOVBE }); case CPUArch::broadwell: - return arch_is_supported(CPUArch::x86_64_v3) && check_extensions(std::vector{ + return arch_is_supported(CPUArch::haswell) && check_extensions(std::vector{ ISAExt::RDSEED, ISAExt::ADCX, ISAExt::PREFETCHW }); case CPUArch::skylake: @@ -82,41 +99,38 @@ inline bool arch_is_supported(CPUArch arch) { return arch_is_supported(CPUArch::skylake_avx512) && check_extensions(std::vector{ ISAExt::AVX512_VNNI }); - // case CPUArch::cooperlake: - // return arch_is_supported(CPUArch::cascadelake) && check_extensions(std::vector{ - // ISAExt::AVX512_BF16 - // }); - // case CPUArch::icelake_server: - // return arch_is_supported(CPUArch::cooperlake) && check_extensions(std::vector{ - // ISAExt::PKU, ISAExt::AVX512_VBMI, ISAExt::AVX512_IFMA, ISAExt::SHA, - // ISAExt::GFNI, ISAExt::VAES, ISAExt::AVX512_VBMI2, ISAExt::VPCLMULQDQ, - // ISAExt::AVX512_BITALG, ISAExt::RDPID, ISAExt::AVX512_VPOPCNTDQ, ISAExt::PCONFIG, - // ISAExt::WBNOINVD, ISAExt::CLWB - // }); - case CPUArch::sapphirerapids: - // return arch_is_supported(CPUArch::icelake_server) && check_extensions(std::vector{ + case CPUArch::cooperlake: return arch_is_supported(CPUArch::cascadelake) && check_extensions(std::vector{ + ISAExt::AVX512_BF16 + }); + case CPUArch::icelake_server: + return arch_is_supported(CPUArch::cooperlake) && check_extensions(std::vector{ + ISAExt::PKU, ISAExt::AVX512_VBMI, ISAExt::AVX512_IFMA, ISAExt::SHA, + ISAExt::GFNI, ISAExt::VAES, ISAExt::AVX512_VBMI2, ISAExt::VPCLMULQDQ, + ISAExt::AVX512_BITALG, ISAExt::RDPID, ISAExt::AVX512_VPOPCNTDQ, ISAExt::PCONFIG, + ISAExt::WBNOINVD, ISAExt::CLWB + }); + case CPUArch::sapphirerapids: + return arch_is_supported(CPUArch::icelake_server) && check_extensions(std::vector{ ISAExt::MOVDIRI, ISAExt::MOVDIR64B, ISAExt::ENQCMD, ISAExt::CLDEMOTE, ISAExt::PTWRITE, ISAExt::WAITPKG, ISAExt::SERIALIZE, ISAExt::TSXLDTRK, ISAExt::UINTR, ISAExt::AMX_BF16, ISAExt::AMX_TILE, ISAExt::AMX_INT8, ISAExt::AVX_VNNI, ISAExt::AVX512_FP16, ISAExt::AVX512_BF16 }); - // case CPUArch::graniterapids: - // return arch_is_supported(CPUArch::sapphirerapids) && check_extensions(std::vector{ - // ISAExt::AMX_FP16, ISAExt::PREFETCHI - // }); - // case CPUArch::graniterapids_d: - // return arch_is_supported(CPUArch::graniterapids) && check_extensions(std::vector{ - // ISAExt::AMX_COMPLEX - // }); + case CPUArch::graniterapids: + return arch_is_supported(CPUArch::sapphirerapids) && check_extensions(std::vector{ + ISAExt::AMX_FP16, ISAExt::PREFETCHI + }); + case CPUArch::graniterapids_d: + return arch_is_supported(CPUArch::graniterapids) && check_extensions(std::vector{ + ISAExt::AMX_COMPLEX + }); #elif defined(__aarch64__) // TODO: complete lists of supported extensions case CPUArch::neoverse_n1: - return check_extension(ISAExt::SVE); + return check_extensions(std::vector{ISAExt::SVE}) case CPUArch::neoverse_v1: - return arch_is_supported(CPUArch::neoverse_n1) && check_extensions(std::vector{ - ISAExt::SVE2 - }); + return arch_is_supported(CPUArch::neoverse_n1) && check_extensions(std::vector{ISAExt::SVE2}); #endif default: return false; @@ -133,6 +147,7 @@ class CPUArchEnvironment { CPUArch get_cpu_arch() const { return max_arch_; } + private: CPUArchEnvironment() { const std::vector compiled_archs = { @@ -140,12 +155,15 @@ class CPUArchEnvironment { CPUArch::native, #elif defined(__x86_64__) // TODO: add support for dynamic list of compiled archs - CPUArch::x86_64_v3, + CPUArch::haswell, CPUArch::broadwell, CPUArch::skylake, CPUArch::skylake_avx512, CPUArch::cascadelake, CPUArch::sapphirerapids, +#elif defined(__aarch64__) + CPUArch::neoverse_n1, + CPUArch::neoverse_v1, #endif }; compiled_archs_ = compiled_archs; @@ -181,7 +199,7 @@ class CPUArchEnvironment { #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ svs::arch::CPUArch cpu_arch = svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ switch (cpu_arch) { \ - SVS_CLASS_METHOD_CPUARCH_CASE(x86_64_v3, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_CPUARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_CPUARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_CPUARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) \ From 43ee728cb7cabf16376946387f49b26d52ee6a84 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Fri, 25 Apr 2025 07:21:37 -0700 Subject: [PATCH 10/65] Remove pybind11 module name define --- bindings/python/src/python_bindings.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/bindings/python/src/python_bindings.cpp b/bindings/python/src/python_bindings.cpp index 8419dd86..30e03acc 100644 --- a/bindings/python/src/python_bindings.cpp +++ b/bindings/python/src/python_bindings.cpp @@ -44,15 +44,6 @@ #include #include -// Get the expected name of the library -// Make sure CMake stays up to date with defining this parameter. -// -// The variable allows us to customize the name of the python module to support -// micro-architecture versioning. -#if !defined(SVS_MODULE_NAME) -#define SVS_MODULE_NAME _svs -#endif - namespace py = pybind11; namespace { @@ -144,7 +135,7 @@ class ScopedModuleNameOverride { } // namespace -PYBIND11_MODULE(SVS_MODULE_NAME, m) { +PYBIND11_MODULE(_svs, m) { // Internall, the top level `__init__.py` imports everything from the C++ module named // `_svs`. // From f42bece8d809a674d6c05e68eb8a03070a0b5925 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 28 Apr 2025 08:46:07 -0700 Subject: [PATCH 11/65] Linting and other fixes --- bindings/python/src/instantiations.cpp | 2 +- include/svs/core/distance/cosine.h | 46 ++- include/svs/core/distance/euclidean.h | 27 +- include/svs/core/distance/inner_product.h | 27 +- include/svs/lib/arch.h | 235 ++++++++------ include/svs/lib/cpuid.h | 297 +++++++++++------- tests/svs/core/distances/cosine.cpp | 9 +- .../svs/core/distances/distance_euclidean.cpp | 12 +- tests/svs/core/distances/inner_product.cpp | 12 +- 9 files changed, 399 insertions(+), 268 deletions(-) diff --git a/bindings/python/src/instantiations.cpp b/bindings/python/src/instantiations.cpp index 894111c0..e31e0ecc 100644 --- a/bindings/python/src/instantiations.cpp +++ b/bindings/python/src/instantiations.cpp @@ -15,8 +15,8 @@ */ #include "svs/core/distance/cosine.h" -#include "svs/core/distance/inner_product.h" #include "svs/core/distance/euclidean.h" +#include "svs/core/distance/inner_product.h" SVS_INSTANTIATE_COSINE_DISTANCE_BY_CPUARCH SVS_INSTANTIATE_L2_DISTANCE_BY_CPUARCH diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 30cc5b2c..088d18bd 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -33,7 +33,8 @@ namespace svs::distance { // Forward declare implementation to allow entry point to be near the top. -template struct CosineSimilarityImpl; +template +struct CosineSimilarityImpl; // Generic Entry Point // Call as one of either: @@ -42,8 +43,7 @@ template struct Co // (2) CosineSimilarity::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -template -class CosineSimilarity { +template class CosineSimilarity { public: template static constexpr float compute(const Ea* a, const Eb* b, float a_norm, size_t N) { @@ -176,7 +176,8 @@ float generic_cosine_similarity( return result / (a_norm * std::sqrt(accum)); }; -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { static float compute( const Ea* a, const Eb* b, @@ -338,15 +339,32 @@ template struct CosineSimilarityImpl struct L2 // (2) L2::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -template -class L2 { +template class L2 { public: template static constexpr float compute(const Ea* a, const Eb* b, size_t N) { @@ -158,15 +157,11 @@ float compute(DistanceL2 /*unused*/, std::span a, std::span b) { constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { SVS_DISPATCH_CLASS_BY_CPUARCH( - L2, - compute, - SVS_PACK_ARGS(a.data(), b.data(), a.size()) + L2, compute, SVS_PACK_ARGS(a.data(), b.data(), a.size()) ); } else { SVS_DISPATCH_CLASS_BY_CPUARCH( - L2, - compute, - SVS_PACK_ARGS(a.data(), b.data()) + L2, compute, SVS_PACK_ARGS(a.data(), b.data()) ); } } @@ -474,14 +469,16 @@ template struct L2Impl { #endif // NOTE: dispatching doesn't work for other L2 instances than the listed below. -#define SVS_INSTANTIATE_L2_DISTANCE_BY_CPUARCH \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, int8_t, int8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, uint8_t, uint8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, float) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, uint8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, int8_t) \ +#define SVS_INSTANTIATE_L2_DISTANCE_BY_CPUARCH \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, int8_t, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, uint8_t, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, float) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, int8_t) \ SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, svs::float16::Float16) \ SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, svs::float16::Float16, float) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, svs::float16::Float16, svs::float16::Float16) + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES( \ + L2, svs::float16::Float16, svs::float16::Float16 \ + ) } // namespace svs::distance diff --git a/include/svs/core/distance/inner_product.h b/include/svs/core/distance/inner_product.h index 873e8f09..3402c4a6 100644 --- a/include/svs/core/distance/inner_product.h +++ b/include/svs/core/distance/inner_product.h @@ -42,8 +42,7 @@ template struct IP // (2) IP::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -template -class IP { +template class IP { public: template static constexpr float compute(const Ea* a, const Eb* b, size_t N) { @@ -120,15 +119,11 @@ float compute(DistanceIP /*unused*/, std::span a, std::span b) { constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { SVS_DISPATCH_CLASS_BY_CPUARCH( - IP, - compute, - SVS_PACK_ARGS(a.data(), b.data(), a.size()) + IP, compute, SVS_PACK_ARGS(a.data(), b.data(), a.size()) ); } else { SVS_DISPATCH_CLASS_BY_CPUARCH( - IP, - compute, - SVS_PACK_ARGS(a.data(), b.data()) + IP, compute, SVS_PACK_ARGS(a.data(), b.data()) ); } } @@ -422,14 +417,16 @@ template struct IPImpl { #endif // NOTE: dispatching doesn't work for other IP instances than the listed below. -#define SVS_INSTANTIATE_IP_DISTANCE_BY_CPUARCH \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, int8_t, int8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, uint8_t, uint8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, float) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, uint8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, int8_t) \ +#define SVS_INSTANTIATE_IP_DISTANCE_BY_CPUARCH \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, int8_t, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, uint8_t, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, float) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, int8_t) \ SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, svs::float16::Float16) \ SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, svs::float16::Float16, float) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, svs::float16::Float16, svs::float16::Float16) + SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES( \ + IP, svs::float16::Float16, svs::float16::Float16 \ + ) } // namespace svs::distance diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 20322ca8..27e5c122 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -1,5 +1,5 @@ /* - * Copyright 2023 Intel Corporation + * Copyright 2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,9 +18,6 @@ #include "svs/lib/cpuid.h" -// helper for IDE C++ language support -// #define SVS_CPUARCH_NATIVE 1 - namespace svs::arch { enum class CPUArch { @@ -62,75 +59,111 @@ inline bool arch_is_supported(CPUArch arch) { #elif defined(__x86_64__) case CPUArch::nehalem: return check_extensions(std::vector{ - ISAExt::MMX, ISAExt::SSE, ISAExt::SSE2, ISAExt::SSE3, ISAExt::SSSE3, - ISAExt::SSE4_1, ISAExt::SSE4_2, ISAExt::POPCNT, ISAExt::CX16, ISAExt::SAHF, - ISAExt::FXSR - }); + ISAExt::MMX, + ISAExt::SSE, + ISAExt::SSE2, + ISAExt::SSE3, + ISAExt::SSSE3, + ISAExt::SSE4_1, + ISAExt::SSE4_2, + ISAExt::POPCNT, + ISAExt::CX16, + ISAExt::SAHF, + ISAExt::FXSR}); case CPUArch::westmere: - return arch_is_supported(CPUArch::nehalem) && check_extensions(std::vector{ - ISAExt::PCLMUL - }); + return arch_is_supported(CPUArch::nehalem) && + check_extensions(std::vector{ISAExt::PCLMUL}); case CPUArch::sandybridge: - return arch_is_supported(CPUArch::westmere) && check_extensions(std::vector{ - ISAExt::AVX, ISAExt::XSAVE - }); + return arch_is_supported(CPUArch::westmere) && + check_extensions(std::vector{ISAExt::AVX, ISAExt::XSAVE}); case CPUArch::ivybridge: - return arch_is_supported(CPUArch::sandybridge) && check_extensions(std::vector{ - ISAExt::FSGSBASE, ISAExt::RDRND, ISAExt::F16C - }); + return arch_is_supported(CPUArch::sandybridge) && + check_extensions(std::vector{ + ISAExt::FSGSBASE, ISAExt::RDRND, ISAExt::F16C}); case CPUArch::haswell: - return arch_is_supported(CPUArch::ivybridge) && check_extensions(std::vector{ - ISAExt::AVX2, ISAExt::BMI, ISAExt::BMI2, ISAExt::LZCNT, ISAExt::FMA, ISAExt::MOVBE - }); + return arch_is_supported(CPUArch::ivybridge) && + check_extensions(std::vector{ + ISAExt::AVX2, + ISAExt::BMI, + ISAExt::BMI2, + ISAExt::LZCNT, + ISAExt::FMA, + ISAExt::MOVBE}); case CPUArch::broadwell: - return arch_is_supported(CPUArch::haswell) && check_extensions(std::vector{ - ISAExt::RDSEED, ISAExt::ADCX, ISAExt::PREFETCHW - }); + return arch_is_supported(CPUArch::haswell) && + check_extensions(std::vector{ + ISAExt::RDSEED, ISAExt::ADCX, ISAExt::PREFETCHW}); case CPUArch::skylake: - return arch_is_supported(CPUArch::broadwell) && check_extensions(std::vector{ - ISAExt::AES, ISAExt::CLFLUSHOPT, ISAExt::XSAVEC, ISAExt::XSAVES, ISAExt::SGX - }); + return arch_is_supported(CPUArch::broadwell) && + check_extensions(std::vector{ + ISAExt::AES, + ISAExt::CLFLUSHOPT, + ISAExt::XSAVEC, + ISAExt::XSAVES, + ISAExt::SGX}); case CPUArch::skylake_avx512: - return arch_is_supported(CPUArch::skylake) && check_extensions(std::vector{ - ISAExt::AVX512_F, ISAExt::CLWB, ISAExt::AVX512_VL, ISAExt::AVX512_BW, - ISAExt::AVX512_DQ, ISAExt::AVX512_CD - }); + return arch_is_supported(CPUArch::skylake) && + check_extensions(std::vector{ + ISAExt::AVX512_F, + ISAExt::CLWB, + ISAExt::AVX512_VL, + ISAExt::AVX512_BW, + ISAExt::AVX512_DQ, + ISAExt::AVX512_CD}); case CPUArch::cascadelake: - return arch_is_supported(CPUArch::skylake_avx512) && check_extensions(std::vector{ - ISAExt::AVX512_VNNI - }); + return arch_is_supported(CPUArch::skylake_avx512) && + check_extensions(std::vector{ISAExt::AVX512_VNNI}); case CPUArch::cooperlake: - return arch_is_supported(CPUArch::cascadelake) && check_extensions(std::vector{ - ISAExt::AVX512_BF16 - }); + return arch_is_supported(CPUArch::cascadelake) && + check_extensions(std::vector{ISAExt::AVX512_BF16}); case CPUArch::icelake_server: - return arch_is_supported(CPUArch::cooperlake) && check_extensions(std::vector{ - ISAExt::PKU, ISAExt::AVX512_VBMI, ISAExt::AVX512_IFMA, ISAExt::SHA, - ISAExt::GFNI, ISAExt::VAES, ISAExt::AVX512_VBMI2, ISAExt::VPCLMULQDQ, - ISAExt::AVX512_BITALG, ISAExt::RDPID, ISAExt::AVX512_VPOPCNTDQ, ISAExt::PCONFIG, - ISAExt::WBNOINVD, ISAExt::CLWB - }); + return arch_is_supported(CPUArch::cooperlake) && + check_extensions(std::vector{ + ISAExt::PKU, + ISAExt::AVX512_VBMI, + ISAExt::AVX512_IFMA, + ISAExt::SHA, + ISAExt::GFNI, + ISAExt::VAES, + ISAExt::AVX512_VBMI2, + ISAExt::VPCLMULQDQ, + ISAExt::AVX512_BITALG, + ISAExt::RDPID, + ISAExt::AVX512_VPOPCNTDQ, + ISAExt::PCONFIG, + ISAExt::WBNOINVD, + ISAExt::CLWB}); case CPUArch::sapphirerapids: - return arch_is_supported(CPUArch::icelake_server) && check_extensions(std::vector{ - ISAExt::MOVDIRI, ISAExt::MOVDIR64B, ISAExt::ENQCMD, ISAExt::CLDEMOTE, - ISAExt::PTWRITE, ISAExt::WAITPKG, ISAExt::SERIALIZE, ISAExt::TSXLDTRK, - ISAExt::UINTR, ISAExt::AMX_BF16, ISAExt::AMX_TILE, ISAExt::AMX_INT8, - ISAExt::AVX_VNNI, ISAExt::AVX512_FP16, ISAExt::AVX512_BF16 - }); + return arch_is_supported(CPUArch::icelake_server) && + check_extensions(std::vector{ + ISAExt::MOVDIRI, + ISAExt::MOVDIR64B, + ISAExt::ENQCMD, + ISAExt::CLDEMOTE, + ISAExt::PTWRITE, + ISAExt::WAITPKG, + ISAExt::SERIALIZE, + ISAExt::TSXLDTRK, + ISAExt::UINTR, + ISAExt::AMX_BF16, + ISAExt::AMX_TILE, + ISAExt::AMX_INT8, + ISAExt::AVX_VNNI, + ISAExt::AVX512_FP16, + ISAExt::AVX512_BF16}); case CPUArch::graniterapids: - return arch_is_supported(CPUArch::sapphirerapids) && check_extensions(std::vector{ - ISAExt::AMX_FP16, ISAExt::PREFETCHI - }); + return arch_is_supported(CPUArch::sapphirerapids) && + check_extensions(std::vector{ISAExt::AMX_FP16, ISAExt::PREFETCHI} + ); case CPUArch::graniterapids_d: - return arch_is_supported(CPUArch::graniterapids) && check_extensions(std::vector{ - ISAExt::AMX_COMPLEX - }); + return arch_is_supported(CPUArch::graniterapids) && + check_extensions(std::vector{ISAExt::AMX_COMPLEX}); #elif defined(__aarch64__) // TODO: complete lists of supported extensions case CPUArch::neoverse_n1: - return check_extensions(std::vector{ISAExt::SVE}) - case CPUArch::neoverse_v1: - return arch_is_supported(CPUArch::neoverse_n1) && check_extensions(std::vector{ISAExt::SVE2}); + return check_extensions(std::vector{ISAExt::SVE}) case CPUArch::neoverse_v1: + return arch_is_supported(CPUArch::neoverse_n1) && + check_extensions(std::vector{ISAExt::SVE2}); #endif default: return false; @@ -138,17 +171,15 @@ inline bool arch_is_supported(CPUArch arch) { } class CPUArchEnvironment { -public: + public: static CPUArchEnvironment& get_instance() { // TODO: ensure thread safety static CPUArchEnvironment instance; return instance; } - CPUArch get_cpu_arch() const { - return max_arch_; - } + CPUArch get_cpu_arch() const { return max_arch_; } -private: + private: CPUArchEnvironment() { const std::vector compiled_archs = { #if defined(SVS_CPUARCH_NATIVE) @@ -185,51 +216,67 @@ class CPUArchEnvironment { #define SVS_PACK_ARGS(...) __VA_ARGS__ #define SVS_CLASS_METHOD_CPUARCH_CASE(cpuarch, cls, method, args) \ - case svs::arch::CPUArch::cpuarch: \ - return cls::method(args); \ + case svs::arch::CPUArch::cpuarch: \ + return cls::method(args); \ break; #if defined(SVS_CPUARCH_NATIVE) - #define SVS_TARGET_CPUARCH svs::arch::CPUArch::native +#define SVS_TARGET_CPUARCH svs::arch::CPUArch::native - #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ - return cls::method(args); +#define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ + return cls::method(args); #elif defined(__x86_64__) - #define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET +#define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET - #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ - svs::arch::CPUArch cpu_arch = svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ - switch (cpu_arch) { \ - SVS_CLASS_METHOD_CPUARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) \ - default: \ - return cls::method(args); \ - break; \ - } +#define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ + svs::arch::CPUArch cpu_arch = \ + svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_CPUARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ + } #elif defined(__aarch64__) - #define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET +#define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET - #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ - svs::arch::CPUArch cpu_arch = svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ - switch (cpu_arch) { \ - SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_n1, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) \ - default: \ - return cls::method(args); \ - break; \ - } +#define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ + svs::arch::CPUArch cpu_arch = \ + svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_n1, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ + } #endif -#define SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH(return_type, cls, method, template_args, args) \ +#define SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH( \ + return_type, cls, method, template_args, args \ +) \ template return_type cls::method(args); // Generic distance dispatching macro #define SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(cls, a_type, b_type) \ - SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) + SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH( \ + float, \ + svs::distance::cls, \ + compute, \ + SVS_PACK_ARGS(a_type, b_type), \ + SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long) \ + ) // Cosine distance dispatching macro #define SVS_INST_COSINE_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(cls, a_type, b_type) \ - SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) + SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH( \ + float, \ + svs::distance::cls, \ + compute, \ + SVS_PACK_ARGS(a_type, b_type), \ + SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long) \ + ) } // namespace svs::arch diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h index 7dafd019..2509a17a 100644 --- a/include/svs/lib/cpuid.h +++ b/include/svs/lib/cpuid.h @@ -16,10 +16,10 @@ #pragma once -#include +#include #include +#include #include -#include #include #if defined(__x86_64__) @@ -32,28 +32,88 @@ namespace svs::arch { enum class ISAExt { // Common extensions - MMX, SSE, SSE2, SSE3, SSSE3, SSE4_1, SSE4_2, POPCNT, CX16, SAHF, FXSR, - AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, - MOVBE, RDSEED, ADCX, PREFETCHW, AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, - CLWB, PKU, SHA, GFNI, VAES, VPCLMULQDQ, RDPID, PCONFIG, WBNOINVD, MOVDIRI, MOVDIR64B, - ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK, UINTR, PREFETCHI, + MMX, + SSE, + SSE2, + SSE3, + SSSE3, + SSE4_1, + SSE4_2, + POPCNT, + CX16, + SAHF, + FXSR, + AVX, + XSAVE, + PCLMUL, + FSGSBASE, + RDRND, + F16C, + AVX2, + BMI, + BMI2, + LZCNT, + FMA, + MOVBE, + RDSEED, + ADCX, + PREFETCHW, + AES, + CLFLUSHOPT, + XSAVEC, + XSAVES, + SGX, + CLWB, + PKU, + SHA, + GFNI, + VAES, + VPCLMULQDQ, + RDPID, + PCONFIG, + WBNOINVD, + MOVDIRI, + MOVDIR64B, + ENQCMD, + CLDEMOTE, + PTWRITE, + WAITPKG, + SERIALIZE, + TSXLDTRK, + UINTR, + PREFETCHI, // AVX family AVX_VNNI, // AVX512_ family - AVX512_F, AVX512_VL, AVX512_BW, AVX512_DQ, AVX512_CD, AVX512_VBMI, AVX512_IFMA, AVX512_VNNI, - AVX512_VBMI2, AVX512_BITALG, AVX512_VPOPCNTDQ, AVX512_BF16, AVX512_FP16, + AVX512_F, + AVX512_VL, + AVX512_BW, + AVX512_DQ, + AVX512_CD, + AVX512_VBMI, + AVX512_IFMA, + AVX512_VNNI, + AVX512_VBMI2, + AVX512_BITALG, + AVX512_VPOPCNTDQ, + AVX512_BF16, + AVX512_FP16, // AMX family - AMX_BF16, AMX_TILE, AMX_INT8, AMX_FP16, AMX_COMPLEX + AMX_BF16, + AMX_TILE, + AMX_INT8, + AMX_FP16, + AMX_COMPLEX }; struct CPUIDFlag { - const uint32_t function; // EAX input for CPUID - const uint32_t subfunction; // ECX input for CPUID - const uint32_t reg; // Register index (0=EAX, 1=EBX, 2=ECX, 3=EDX) - const uint32_t bit; // Bit position in the register + const uint32_t function; // EAX input for CPUID + const uint32_t subfunction; // ECX input for CPUID + const uint32_t reg; // Register index (0=EAX, 1=EBX, 2=ECX, 3=EDX) + const uint32_t bit; // Bit position in the register const char* name; bool get_value() const { @@ -65,121 +125,116 @@ struct CPUIDFlag { inline const std::unordered_map ISAExtInfo = { // flags are sorted by function, subfunction, register and bit - {ISAExt::MMX, {1, 0, 3, 23, "MMX"}}, - {ISAExt::FXSR, {1, 0, 3, 24, "FXSR"}}, - {ISAExt::SSE, {1, 0, 3, 25, "SSE"}}, - {ISAExt::SSE2, {1, 0, 3, 26, "SSE2"}}, - {ISAExt::SSE3, {1, 0, 2, 0, "SSE3"}}, - {ISAExt::PCLMUL, {1, 0, 2, 1, "PCLMUL"}}, - {ISAExt::SSSE3, {1, 0, 2, 9, "SSSE3"}}, - {ISAExt::FMA, {1, 0, 2, 12, "FMA"}}, - {ISAExt::CX16, {1, 0, 2, 13, "CX16"}}, - {ISAExt::SSE4_1, {1, 0, 2, 19, "SSE4_1"}}, - {ISAExt::SSE4_2, {1, 0, 2, 20, "SSE4_2"}}, - {ISAExt::MOVBE, {1, 0, 2, 22, "MOVBE"}}, - {ISAExt::POPCNT, {1, 0, 2, 23, "POPCNT"}}, - {ISAExt::AES, {1, 0, 2, 25, "AES"}}, - {ISAExt::XSAVE, {1, 0, 2, 26, "XSAVE"}}, - {ISAExt::AVX, {1, 0, 2, 28, "AVX"}}, - {ISAExt::F16C, {1, 0, 2, 29, "F16C"}}, - {ISAExt::RDRND, {1, 0, 2, 30, "RDRND"}}, - {ISAExt::FSGSBASE, {7, 0, 1, 0, "FSGSBASE"}}, - {ISAExt::SGX, {7, 0, 1, 2, "SGX"}}, - {ISAExt::BMI, {7, 0, 1, 3, "BMI"}}, - {ISAExt::AVX2, {7, 0, 1, 5, "AVX2"}}, - {ISAExt::BMI2, {7, 0, 1, 8, "BMI2"}}, - {ISAExt::AVX512_F, {7, 0, 1, 16, "AVX512_F"}}, - {ISAExt::AVX512_DQ, {7, 0, 1, 17, "AVX512_DQ"}}, - {ISAExt::RDSEED, {7, 0, 1, 18, "RDSEED"}}, - {ISAExt::ADCX, {7, 0, 1, 19, "ADCX"}}, - {ISAExt::AVX512_IFMA, {7, 0, 1, 21, "AVX512_IFMA"}}, - {ISAExt::CLFLUSHOPT, {7, 0, 1, 23, "CLFLUSHOPT"}}, - {ISAExt::CLWB, {7, 0, 1, 24, "CLWB"}}, - {ISAExt::AVX512_CD, {7, 0, 1, 28, "AVX512_CD"}}, - {ISAExt::SHA, {7, 0, 1, 29, "SHA"}}, - {ISAExt::AVX512_BW, {7, 0, 1, 30, "AVX512_BW"}}, - {ISAExt::AVX512_VL, {7, 0, 1, 31, "AVX512_VL"}}, - {ISAExt::AVX512_VBMI, {7, 0, 2, 1, "AVX512_VBMI"}}, - {ISAExt::PKU, {7, 0, 2, 3, "PKU"}}, - {ISAExt::WAITPKG, {7, 0, 2, 5, "WAITPKG"}}, - {ISAExt::AVX512_VBMI2, {7, 0, 2, 6, "AVX512_VBMI2"}}, - {ISAExt::GFNI, {7, 0, 2, 8, "GFNI"}}, - {ISAExt::VAES, {7, 0, 2, 9, "VAES"}}, - {ISAExt::VPCLMULQDQ, {7, 0, 2, 10, "VPCLMULQDQ"}}, - {ISAExt::AVX512_VNNI, {7, 0, 2, 11, "AVX512_VNNI"}}, - {ISAExt::AVX512_BITALG, {7, 0, 2, 12, "AVX512_BITALG"}}, - {ISAExt::AVX512_VPOPCNTDQ, {7, 0, 2, 14, "AVX512_VPOPCNTDQ"}}, - {ISAExt::RDPID, {7, 0, 2, 22, "RDPID"}}, - {ISAExt::CLDEMOTE, {7, 0, 2, 25, "CLDEMOTE"}}, - {ISAExt::MOVDIRI, {7, 0, 2, 27, "MOVDIRI"}}, - {ISAExt::MOVDIR64B, {7, 0, 2, 28, "MOVDIR64B"}}, - {ISAExt::ENQCMD, {7, 0, 2, 29, "ENQCMD"}}, - {ISAExt::UINTR, {7, 0, 3, 5, "UINTR"}}, - {ISAExt::SERIALIZE, {7, 0, 3, 14, "SERIALIZE"}}, - {ISAExt::TSXLDTRK, {7, 0, 3, 16, "TSXLDTRK"}}, - {ISAExt::PCONFIG, {7, 0, 3, 18, "PCONFIG"}}, - {ISAExt::AMX_BF16, {7, 0, 3, 22, "AMX_BF16"}}, - {ISAExt::AVX512_FP16, {7, 0, 3, 23, "AVX512_FP16"}}, - {ISAExt::AMX_TILE, {7, 0, 3, 24, "AMX_TILE"}}, - {ISAExt::AMX_INT8, {7, 0, 3, 25, "AMX_INT8"}}, - {ISAExt::AVX_VNNI, {7, 1, 0, 4, "AVX_VNNI"}}, - {ISAExt::AVX512_BF16, {7, 1, 0, 5, "AVX512_BF16"}}, - {ISAExt::AMX_FP16, {7, 1, 0, 21, "AMX_FP16"}}, - {ISAExt::AMX_COMPLEX, {7, 1, 3, 8, "AMX_COMPLEX"}}, - {ISAExt::PREFETCHI, {7, 1, 3, 14, "PREFETCHI"}}, - {ISAExt::XSAVEC, {0xD, 1, 0, 1, "XSAVEC"}}, - {ISAExt::XSAVES, {0xD, 1, 0, 3, "XSAVES"}}, - {ISAExt::PTWRITE, {0x14, 0, 1, 4, "PTWRITE"}}, - {ISAExt::WBNOINVD, {0x80000008, 0, 1, 9, "WBNOINVD"}}, - {ISAExt::SAHF, {0x80000001, 0, 2, 0, "SAHF"}}, - {ISAExt::LZCNT, {0x80000001, 0, 2, 5, "LZCNT"}}, - {ISAExt::PREFETCHW, {0x80000001, 0, 2, 8, "PREFETCHW"}}, + {ISAExt::MMX, {1, 0, 3, 23, "MMX"}}, + {ISAExt::FXSR, {1, 0, 3, 24, "FXSR"}}, + {ISAExt::SSE, {1, 0, 3, 25, "SSE"}}, + {ISAExt::SSE2, {1, 0, 3, 26, "SSE2"}}, + {ISAExt::SSE3, {1, 0, 2, 0, "SSE3"}}, + {ISAExt::PCLMUL, {1, 0, 2, 1, "PCLMUL"}}, + {ISAExt::SSSE3, {1, 0, 2, 9, "SSSE3"}}, + {ISAExt::FMA, {1, 0, 2, 12, "FMA"}}, + {ISAExt::CX16, {1, 0, 2, 13, "CX16"}}, + {ISAExt::SSE4_1, {1, 0, 2, 19, "SSE4_1"}}, + {ISAExt::SSE4_2, {1, 0, 2, 20, "SSE4_2"}}, + {ISAExt::MOVBE, {1, 0, 2, 22, "MOVBE"}}, + {ISAExt::POPCNT, {1, 0, 2, 23, "POPCNT"}}, + {ISAExt::AES, {1, 0, 2, 25, "AES"}}, + {ISAExt::XSAVE, {1, 0, 2, 26, "XSAVE"}}, + {ISAExt::AVX, {1, 0, 2, 28, "AVX"}}, + {ISAExt::F16C, {1, 0, 2, 29, "F16C"}}, + {ISAExt::RDRND, {1, 0, 2, 30, "RDRND"}}, + {ISAExt::FSGSBASE, {7, 0, 1, 0, "FSGSBASE"}}, + {ISAExt::SGX, {7, 0, 1, 2, "SGX"}}, + {ISAExt::BMI, {7, 0, 1, 3, "BMI"}}, + {ISAExt::AVX2, {7, 0, 1, 5, "AVX2"}}, + {ISAExt::BMI2, {7, 0, 1, 8, "BMI2"}}, + {ISAExt::AVX512_F, {7, 0, 1, 16, "AVX512_F"}}, + {ISAExt::AVX512_DQ, {7, 0, 1, 17, "AVX512_DQ"}}, + {ISAExt::RDSEED, {7, 0, 1, 18, "RDSEED"}}, + {ISAExt::ADCX, {7, 0, 1, 19, "ADCX"}}, + {ISAExt::AVX512_IFMA, {7, 0, 1, 21, "AVX512_IFMA"}}, + {ISAExt::CLFLUSHOPT, {7, 0, 1, 23, "CLFLUSHOPT"}}, + {ISAExt::CLWB, {7, 0, 1, 24, "CLWB"}}, + {ISAExt::AVX512_CD, {7, 0, 1, 28, "AVX512_CD"}}, + {ISAExt::SHA, {7, 0, 1, 29, "SHA"}}, + {ISAExt::AVX512_BW, {7, 0, 1, 30, "AVX512_BW"}}, + {ISAExt::AVX512_VL, {7, 0, 1, 31, "AVX512_VL"}}, + {ISAExt::AVX512_VBMI, {7, 0, 2, 1, "AVX512_VBMI"}}, + {ISAExt::PKU, {7, 0, 2, 3, "PKU"}}, + {ISAExt::WAITPKG, {7, 0, 2, 5, "WAITPKG"}}, + {ISAExt::AVX512_VBMI2, {7, 0, 2, 6, "AVX512_VBMI2"}}, + {ISAExt::GFNI, {7, 0, 2, 8, "GFNI"}}, + {ISAExt::VAES, {7, 0, 2, 9, "VAES"}}, + {ISAExt::VPCLMULQDQ, {7, 0, 2, 10, "VPCLMULQDQ"}}, + {ISAExt::AVX512_VNNI, {7, 0, 2, 11, "AVX512_VNNI"}}, + {ISAExt::AVX512_BITALG, {7, 0, 2, 12, "AVX512_BITALG"}}, + {ISAExt::AVX512_VPOPCNTDQ, {7, 0, 2, 14, "AVX512_VPOPCNTDQ"}}, + {ISAExt::RDPID, {7, 0, 2, 22, "RDPID"}}, + {ISAExt::CLDEMOTE, {7, 0, 2, 25, "CLDEMOTE"}}, + {ISAExt::MOVDIRI, {7, 0, 2, 27, "MOVDIRI"}}, + {ISAExt::MOVDIR64B, {7, 0, 2, 28, "MOVDIR64B"}}, + {ISAExt::ENQCMD, {7, 0, 2, 29, "ENQCMD"}}, + {ISAExt::UINTR, {7, 0, 3, 5, "UINTR"}}, + {ISAExt::SERIALIZE, {7, 0, 3, 14, "SERIALIZE"}}, + {ISAExt::TSXLDTRK, {7, 0, 3, 16, "TSXLDTRK"}}, + {ISAExt::PCONFIG, {7, 0, 3, 18, "PCONFIG"}}, + {ISAExt::AMX_BF16, {7, 0, 3, 22, "AMX_BF16"}}, + {ISAExt::AVX512_FP16, {7, 0, 3, 23, "AVX512_FP16"}}, + {ISAExt::AMX_TILE, {7, 0, 3, 24, "AMX_TILE"}}, + {ISAExt::AMX_INT8, {7, 0, 3, 25, "AMX_INT8"}}, + {ISAExt::AVX_VNNI, {7, 1, 0, 4, "AVX_VNNI"}}, + {ISAExt::AVX512_BF16, {7, 1, 0, 5, "AVX512_BF16"}}, + {ISAExt::AMX_FP16, {7, 1, 0, 21, "AMX_FP16"}}, + {ISAExt::AMX_COMPLEX, {7, 1, 3, 8, "AMX_COMPLEX"}}, + {ISAExt::PREFETCHI, {7, 1, 3, 14, "PREFETCHI"}}, + {ISAExt::XSAVEC, {0xD, 1, 0, 1, "XSAVEC"}}, + {ISAExt::XSAVES, {0xD, 1, 0, 3, "XSAVES"}}, + {ISAExt::PTWRITE, {0x14, 0, 1, 4, "PTWRITE"}}, + {ISAExt::WBNOINVD, {0x80000008, 0, 1, 9, "WBNOINVD"}}, + {ISAExt::SAHF, {0x80000001, 0, 2, 0, "SAHF"}}, + {ISAExt::LZCNT, {0x80000001, 0, 2, 5, "LZCNT"}}, + {ISAExt::PREFETCHW, {0x80000001, 0, 2, 8, "PREFETCHW"}}, }; -// if defined(__x86_64__) #elif defined(__aarch64__) -// TODO: complete support of __aarch64__ -enum class ISAExt { - SVE, SVE2 -}; +enum class ISAExt { SVE, SVE2 }; // Define register ID values for ARM features detection -#define ID_AA64PFR0_EL1 0 -#define ID_AA64PFR1_EL1 1 -#define ID_AA64ISAR0_EL1 2 -#define ID_AA64ISAR1_EL1 3 -#define ID_AA64MMFR0_EL1 4 -#define ID_AA64MMFR1_EL1 5 -#define ID_AA64MMFR2_EL1 6 -#define ID_AA64DFR0_EL1 7 -#define ID_AA64DFR1_EL1 8 -#define ID_AA64ZFR0_EL1 9 +#define ID_AA64PFR0_EL1 0 +#define ID_AA64PFR1_EL1 1 +#define ID_AA64ISAR0_EL1 2 +#define ID_AA64ISAR1_EL1 3 +#define ID_AA64MMFR0_EL1 4 +#define ID_AA64MMFR1_EL1 5 +#define ID_AA64MMFR2_EL1 6 +#define ID_AA64DFR0_EL1 7 +#define ID_AA64DFR1_EL1 8 +#define ID_AA64ZFR0_EL1 9 // Helper template to read system registers with mrs instruction -template -inline uint64_t read_system_reg() { +template inline uint64_t read_system_reg() { uint64_t val; if constexpr (ID == ID_AA64PFR0_EL1) { - asm("mrs %0, id_aa64pfr0_el1" : "=r" (val)); + asm("mrs %0, id_aa64pfr0_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64PFR1_EL1) { - asm("mrs %0, id_aa64pfr1_el1" : "=r" (val)); + asm("mrs %0, id_aa64pfr1_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64ISAR0_EL1) { - asm("mrs %0, id_aa64isar0_el1" : "=r" (val)); + asm("mrs %0, id_aa64isar0_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64ISAR1_EL1) { - asm("mrs %0, id_aa64isar1_el1" : "=r" (val)); + asm("mrs %0, id_aa64isar1_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64MMFR0_EL1) { - asm("mrs %0, id_aa64mmfr0_el1" : "=r" (val)); + asm("mrs %0, id_aa64mmfr0_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64MMFR1_EL1) { - asm("mrs %0, id_aa64mmfr1_el1" : "=r" (val)); + asm("mrs %0, id_aa64mmfr1_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64MMFR2_EL1) { - asm("mrs %0, id_aa64mmfr2_el1" : "=r" (val)); + asm("mrs %0, id_aa64mmfr2_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64DFR0_EL1) { - asm("mrs %0, id_aa64dfr0_el1" : "=r" (val)); + asm("mrs %0, id_aa64dfr0_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64DFR1_EL1) { - asm("mrs %0, id_aa64dfr1_el1" : "=r" (val)); + asm("mrs %0, id_aa64dfr1_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64ZFR0_EL1) { - asm("mrs %0, id_aa64zfr0_el1" : "=r" (val)); + asm("mrs %0, id_aa64zfr0_el1" : "=r"(val)); } else { val = 0; } @@ -192,17 +247,17 @@ inline uint64_t extract_bits(uint64_t val, int pos, int len) { } struct MSRFlag { - unsigned int reg_id; // System register ID - int bit_pos; // Bit position in the register - int bit_len; // Number of bits to check - uint64_t expected_val; // Expected value for feature to be present - const char* name; // Feature name + unsigned int reg_id; // System register ID + int bit_pos; // Bit position in the register + int bit_len; // Number of bits to check + uint64_t expected_val; // Expected value for feature to be present + const char* name; // Feature name bool get_value() const { uint64_t reg_val = 0; try { - switch(reg_id) { + switch (reg_id) { case ID_AA64PFR0_EL1: reg_val = read_system_reg(); break; @@ -216,7 +271,7 @@ struct MSRFlag { reg_val = read_system_reg(); break; case ID_AA64ZFR0_EL1: - // First check if SVE is supported to avoid + // First check if SVE is supported to avoid if (extract_bits(read_system_reg(), 32, 4) != 0) { reg_val = read_system_reg(); } @@ -234,15 +289,13 @@ struct MSRFlag { }; inline const std::unordered_map ISAExtInfo = { - {ISAExt::SVE, {ID_AA64PFR0_EL1, 32, 4, 1, "sve"}}, - {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, + {ISAExt::SVE, {ID_AA64PFR0_EL1, 32, 4, 1, "sve"}}, + {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, }; #endif // if defined(__aarch64__) -inline bool check_extension(ISAExt ext) { - return ISAExtInfo.at(ext).get_value(); -} +inline bool check_extension(ISAExt ext) { return ISAExtInfo.at(ext).get_value(); } inline bool check_extensions(std::vector exts) { for (const auto& ext : exts) { diff --git a/tests/svs/core/distances/cosine.cpp b/tests/svs/core/distances/cosine.cpp index eb86f5cf..c18fb9e8 100644 --- a/tests/svs/core/distances/cosine.cpp +++ b/tests/svs/core/distances/cosine.cpp @@ -86,11 +86,14 @@ void test_types(T lo, T hi, size_t num_tests) { auto a_norm = svs::distance::norm(std::span{a.data(), a.size()}); CATCH_REQUIRE( // TODO: replace baseline with something else? - (svs::distance::CosineSimilarity::compute(a.data(), b.data(), a_norm) == - expected) + (svs::distance::CosineSimilarity::compute( + a.data(), b.data(), a_norm + ) == expected) ); // Dynamically Sized Computation - auto dist = svs::distance::CosineSimilarity::compute(a.data(), b.data(), a_norm, N); + auto dist = svs::distance::CosineSimilarity::compute( + a.data(), b.data(), a_norm, N + ); CATCH_REQUIRE((dist == expected)); } } diff --git a/tests/svs/core/distances/distance_euclidean.cpp b/tests/svs/core/distances/distance_euclidean.cpp index 8d68cb35..216d5bc5 100644 --- a/tests/svs/core/distances/distance_euclidean.cpp +++ b/tests/svs/core/distances/distance_euclidean.cpp @@ -68,9 +68,17 @@ void test_types(T lo, T hi, size_t num_tests) { auto expected = Catch::Approx(euclidean_reference(a, b)); // Statically Sized Computation - CATCH_REQUIRE((svs::distance::L2::compute(a.data(), b.data()) == expected)); + CATCH_REQUIRE( + (svs::distance::L2::compute( + a.data(), b.data() + ) == expected) + ); // Dynamically Sized Computation - CATCH_REQUIRE((svs::distance::L2::compute(a.data(), b.data(), N) == expected)); + CATCH_REQUIRE( + (svs::distance::L2::compute( + a.data(), b.data(), N + ) == expected) + ); } } } // namespace diff --git a/tests/svs/core/distances/inner_product.cpp b/tests/svs/core/distances/inner_product.cpp index c3046d1d..5ff2865e 100644 --- a/tests/svs/core/distances/inner_product.cpp +++ b/tests/svs/core/distances/inner_product.cpp @@ -76,9 +76,17 @@ void test_types(T lo, T hi, size_t num_tests) { .margin(INNERPRODUCT_MARGIN); // Statically Sized Computation - CATCH_REQUIRE((svs::distance::IP::compute(a.data(), b.data()) == expected)); + CATCH_REQUIRE( + (svs::distance::IP::compute( + a.data(), b.data() + ) == expected) + ); // Dynamically Sized Computation - CATCH_REQUIRE((svs::distance::IP::compute(a.data(), b.data(), N) == expected)); + CATCH_REQUIRE( + (svs::distance::IP::compute( + a.data(), b.data(), N + ) == expected) + ); } } } // anonymous namespace From 2ebee92150bdad85119a64c55f5a6259088d09ba Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 29 Apr 2025 08:07:32 -0700 Subject: [PATCH 12/65] Enable microarch dispatcher for all build targets --- CMakeLists.txt | 5 +- benchmark/CMakeLists.txt | 4 +- bindings/python/CMakeLists.txt | 90 +---------- cmake/cpuarch.cmake | 120 +++++++++++++++ {bindings/python => cmake}/microarch.py | 13 +- .../microarch_instantiations.cpp | 0 cmake/microarch_targets_aaarch64 | 2 + cmake/microarch_targets_x86_64 | 6 + cmake/options.cmake | 8 - examples/cpp/CMakeLists.txt | 12 +- include/svs/lib/arch.h | 143 +++++++++++------- include/svs/lib/cpuid.h | 4 +- tests/CMakeLists.txt | 4 +- utils/CMakeLists.txt | 4 +- 14 files changed, 241 insertions(+), 174 deletions(-) create mode 100644 cmake/cpuarch.cmake rename {bindings/python => cmake}/microarch.py (89%) rename bindings/python/src/instantiations.cpp => cmake/microarch_instantiations.cpp (100%) create mode 100644 cmake/microarch_targets_aaarch64 create mode 100644 cmake/microarch_targets_x86_64 diff --git a/CMakeLists.txt b/CMakeLists.txt index 314a6b33..6a9b86c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ target_compile_options( include("cmake/options.cmake") +include("cmake/cpuarch.cmake") include("cmake/clang-tidy.cmake") include("cmake/eve.cmake") include("cmake/pthread.cmake") @@ -80,6 +81,8 @@ include("cmake/toml.cmake") ##### Build Objects ##### +create_microarch_instantiations() + if(SVS_BUILD_BINARIES) add_subdirectory(utils) endif() @@ -112,7 +115,7 @@ set(LIB_CONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/svs") # Install headers and target information. install( - TARGETS svs_devel svs_compile_options svs_native_options + TARGETS svs_devel svs_compile_options svs_microarch_options_base EXPORT svs-targets INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 5c042c29..840c9c36 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -48,7 +48,7 @@ set(SHARED_LIBRARY_FILES src/inverted/memory/executables/memory_test.cpp ) -add_library(svs_benchmark_library SHARED ${SHARED_LIBRARY_FILES}) +add_library(svs_benchmark_library SHARED ${SHARED_LIBRARY_FILES} ${MICROARCH_OBJECT_FILES}) target_include_directories(svs_benchmark_library PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include) # Minimal @@ -104,7 +104,7 @@ target_link_libraries( PUBLIC ${SVS_LIB} svs_compile_options - svs_native_options + svs_microarch_options_base fmt::fmt ) diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index 8dcf2cc8..6bc333c6 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -24,68 +24,9 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(pybind11) -# Try to find the Python executable. -# -# If it's given as part of the Cmake arguments given by "scikit build", then use that. -# Otherwise, fall back to using plain old "python". -# If *THAT* doesn't work, give up. -if(DEFINED PYTHON_EXECUTABLE) - set(SVS_PYTHON_EXECUTABLE "${PYTHON_EXECUTABLE}") -else() - set(SVS_PYTHON_EXECUTABLE "python") -endif() - -# The micro architectures to compile for. -if(NOT DEFINED SVS_MICROARCHS) - set(SVS_MICROARCHS native) -endif() - # Include the SVS library directly. add_subdirectory("../.." "${CMAKE_CURRENT_BINARY_DIR}/svs") -# Run the python script to get optimization flags for the desired back-ends. -# -# FLAGS_SCRIPT - Path to the Python script that will take the compiler, compiler version, -# and list of desired microarchitectures and generate optimization flags for each -# microarchitecture. -# -# FLAGS_TEXT_FILE - List of optimization flags for each architecture. -# Expected format: -# -march=arch1,-mtune=arch1 -# -march=arch2,-mtune=arch2 -# ... -# -march=archN,-mtune=archN -# -# The number of lines should be equal to the number of microarchitectures. -# NOTE: The entries within each line are separated by a comma on purpose to allow CMake -# to read the whole file as a List and then use string replacement on the commas to turn -# each line into a list in its own right. -# -# TEMP_JSON - JSON Manifest file describing the generated binaries. This is meant to be -# included in the Python package to allow the Python code to reason about the packaged -# libraries and select the correct one for loading. -# -set(FLAGS_SCRIPT "${CMAKE_CURRENT_LIST_DIR}/microarch.py") -set(FLAGS_TEXT_FILE "${CMAKE_CURRENT_BINARY_DIR}/optimization_flags.txt") -set(FLAGS_MANIFEST_JSON "${CMAKE_CURRENT_BINARY_DIR}/flags_manifest.json") - -execute_process( - COMMAND - ${SVS_PYTHON_EXECUTABLE} - ${FLAGS_SCRIPT} - ${FLAGS_TEXT_FILE} - ${FLAGS_MANIFEST_JSON} - --compiler ${CMAKE_CXX_COMPILER_ID} - --compiler-version ${CMAKE_CXX_COMPILER_VERSION} - --microarchitectures ${SVS_MICROARCHS} - COMMAND_ERROR_IS_FATAL ANY -) - -file(STRINGS "${FLAGS_TEXT_FILE}" OPTIMIZATION_FLAGS) -message("Flags: ${OPTIMIZATION_FLAGS}") -list(LENGTH OPTIMIZATION_FLAGS OPT_FLAGS_LENGTH) -message("Length of flags: ${OPT_FLAGS_LENGTH}") - # C++ files makind up the python bindings. set(CPP_FILES src/allocator.cpp @@ -97,35 +38,13 @@ set(CPP_FILES src/vamana_common.cpp src/svs_mkl.cpp ) -# C++ files that are used to instantiate the template classes for each microarchitecture. -set(CPUARCH_CPP_FILES - src/instantiations.cpp -) - -# Generate an object file for each target microarchitecture. -set(OBJECT_FILES "") -foreach(MICRO OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) - set(OBJ_NAME "_svs_${MICRO}") - add_library(${OBJ_NAME} OBJECT ${CPUARCH_CPP_FILES}) - - target_link_libraries(${OBJ_NAME} PUBLIC svs::svs) - target_link_libraries(${OBJ_NAME} PRIVATE svs::compile_options fmt::fmt) - - string(REPLACE "," ";" OPT_FLAGS ${OPT_FLAGS}) - message("OPT Flags: ${OPT_FLAGS}") - target_compile_options(${OBJ_NAME} PRIVATE ${OPT_FLAGS} -DSVS_TUNE_TARGET=${MICRO} -DSVS_EXTERNAL_CPUARCH_INSTANCE=1 -fPIC) - - list(APPEND OBJECT_FILES $) -endforeach() set(LIB_NAME "_svs") -pybind11_add_module(${LIB_NAME} MODULE ${CPP_FILES} ${OBJECT_FILES}) +pybind11_add_module(${LIB_NAME} MODULE ${CPP_FILES} ${MICROARCH_OBJECT_FILES}) target_link_libraries(${LIB_NAME} PRIVATE pybind11::module) target_link_libraries(${LIB_NAME} PUBLIC svs::svs) # Dependency "fmt::fmt" obtained from "svs" -target_link_libraries(${LIB_NAME} PRIVATE svs::compile_options fmt::fmt) -# TODO: remove manual specification of base arch optimization flags -target_compile_options(${LIB_NAME} PRIVATE -march=x86-64-v3 -mtune=generic -DSVS_TUNE_TARGET=x86_64_v3) +target_link_libraries(${LIB_NAME} PRIVATE svs::compile_options fmt::fmt svs::microarch_options_base) target_include_directories( ${LIB_NAME} PUBLIC $ @@ -147,11 +66,6 @@ if(DEFINED SKBUILD) endif() if(DEFINED SKBUILD) - # Install the manifest JSON file. - # This is kind of a hack to avoid the needing to explicitly move JSON file into the - # source folder of the python library. - install(FILES ${FLAGS_MANIFEST_JSON} DESTINATION .) - # Install header files. install( DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/include/svs" diff --git a/cmake/cpuarch.cmake b/cmake/cpuarch.cmake new file mode 100644 index 00000000..5d7e3a83 --- /dev/null +++ b/cmake/cpuarch.cmake @@ -0,0 +1,120 @@ +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(svs_microarch_cmake_included) + return() +endif() +set(svs_microarch_cmake_included true) + +# N.B.: first microarch listed in targets file is treated as "base" microarch +# which is used to build base object files, shared libs and executables +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_x86_64" SVS_MICROARCHS) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_aarch64" SVS_MICROARCHS) +endif() + +# Try to find the Python executable. +# +# If it's given as part of the Cmake arguments given by "scikit build", then use that. +# Otherwise, fall back to using plain old "python". +# If *THAT* doesn't work, give up. +if(DEFINED PYTHON_EXECUTABLE) + set(SVS_PYTHON_EXECUTABLE "${PYTHON_EXECUTABLE}") +else() + set(SVS_PYTHON_EXECUTABLE "python") +endif() + +# Run the python script to get optimization flags for the desired back-ends. +# +# FLAGS_SCRIPT - Path to the Python script that will take the compiler, compiler version, +# and list of desired microarchitectures and generate optimization flags for each +# microarchitecture. +# +# FLAGS_TEXT_FILE - List of optimization flags for each architecture. +# Expected format: +# -march=arch1,-mtune=arch1 +# -march=arch2,-mtune=arch2 +# ... +# -march=archN,-mtune=archN +# +# The number of lines should be equal to the number of microarchitectures. +# NOTE: The entries within each line are separated by a comma on purpose to allow CMake +# to read the whole file as a List and then use string replacement on the commas to turn +# each line into a list in its own right. +# +set(FLAGS_SCRIPT "${CMAKE_CURRENT_LIST_DIR}/microarch.py") +set(FLAGS_TEXT_FILE "${CMAKE_CURRENT_BINARY_DIR}/optimization_flags.txt") + +execute_process( + COMMAND + ${SVS_PYTHON_EXECUTABLE} + ${FLAGS_SCRIPT} + ${FLAGS_TEXT_FILE} + --compiler ${CMAKE_CXX_COMPILER_ID} + --compiler-version ${CMAKE_CXX_COMPILER_VERSION} + --microarchitectures ${SVS_MICROARCHS} + COMMAND_ERROR_IS_FATAL ANY +) + +file(STRINGS "${FLAGS_TEXT_FILE}" OPTIMIZATION_FLAGS) +message("Flags: ${OPTIMIZATION_FLAGS}") +list(LENGTH OPTIMIZATION_FLAGS OPT_FLAGS_LENGTH) +message("Length of flags: ${OPT_FLAGS_LENGTH}") + +##### +##### Helper targets to support required microarchs and apply relevant compiler optimizations. +##### + +# Set up "base" target to include opt. flags for base microarch +# and flags to enable support of other microarchs in dispatcher +add_library(svs_microarch_options_base INTERFACE) +add_library(svs::microarch_options_base ALIAS svs_microarch_options_base) + +# Get opt. flags for base microarch +list(POP_FRONT SVS_MICROARCHS BASE_MICROARCH) +list(POP_FRONT OPTIMIZATION_FLAGS BASE_OPT_FLAGS) +string(REPLACE "," ";" BASE_OPT_FLAGS ${BASE_OPT_FLAGS}) +message("Opt.flags[base=${BASE_MICROARCH}]: ${BASE_OPT_FLAGS}") + +target_compile_options(svs_microarch_options_base INTERFACE ${BASE_OPT_FLAGS} -DSVS_CPUARCH_SUPPORT_${BASE_MICROARCH}) + +foreach(MICROARCH OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) + # Tell the microarch dispatcher to include this microarch branch + target_compile_options(svs_microarch_options_base INTERFACE -DSVS_CPUARCH_SUPPORT_${MICROARCH}) + + string(REPLACE "," ";" OPT_FLAGS ${OPT_FLAGS}) + message("Opt.flags[${MICROARCH}]: ${OPT_FLAGS}") + + # Create a new target for this microarch + add_library(svs_microarch_options_${MICROARCH} INTERFACE) + add_library(svs::microarch_options_${MICROARCH} ALIAS svs_microarch_options_${MICROARCH}) + target_compile_options(svs_microarch_options_${MICROARCH} INTERFACE ${OPT_FLAGS} -DSVS_TUNE_TARGET=${MICROARCH}) +endforeach() + +set(MICROARCH_CPP_FILES "${CMAKE_CURRENT_LIST_DIR}/microarch_instantiations.cpp") + +# function to create a set of object files with microarch instantiations +function(create_microarch_instantiations) + set(MICROARCH_OBJECT_FILES "") + foreach(MICROARCH OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) + set(OBJ_NAME "microarch_${MICROARCH}") + add_library(${OBJ_NAME} OBJECT ${MICROARCH_CPP_FILES}) + + target_link_libraries(${OBJ_NAME} PRIVATE ${SVS_LIB} svs::compile_options fmt::fmt svs_microarch_options_${MICROARCH}) + + list(APPEND MICROARCH_OBJECT_FILES $) + endforeach() + set(MICROARCH_OBJECT_FILES "${MICROARCH_OBJECT_FILES}" PARENT_SCOPE) +endfunction() diff --git a/bindings/python/microarch.py b/cmake/microarch.py similarity index 89% rename from bindings/python/microarch.py rename to cmake/microarch.py index 99a4ae36..e1d8ec87 100644 --- a/bindings/python/microarch.py +++ b/cmake/microarch.py @@ -17,21 +17,18 @@ # (1) A text file with compiler optimization flags for each microarchitecture formatted for # relatively easy consumption by CMake. # -# (2) A JSON manifest file describing the micreoarchitecture for each compiled library -# that the python library can use to select the correct shared library. -# import archspec import archspec.cpu as cpu import argparse import json + def build_parser(): parser = argparse.ArgumentParser() parser.add_argument( "cmake_flags_text_file", help = "file path to where CMake's text file will go." ) - parser.add_argument("python_output_json_file") parser.add_argument("--compiler", required = True) parser.add_argument("--compiler-version", required = True) parser.add_argument( @@ -48,6 +45,7 @@ def resolve_microarch(name: str): """ custom_aliases = { "native": cpu.host().name, + "icelake_client": "icelake", } # Allow the custom aliases to override the current name. # If an alias doesn't exist, juse pass the name straight through. @@ -96,7 +94,6 @@ def run(): # Extract elements from the parser architectures = args.microarchitectures output_text = args.cmake_flags_text_file - output_json = args.python_output_json_file compiler = resolve_compiler(args.compiler) compiler_version = args.compiler_version @@ -120,16 +117,10 @@ def run(): "toolchain": toolchain, "libraries": suffix_to_microarch, } - with open(output_json, "w") as file: - file.write(json.dumps(pre_json_dict, indent = 4)) # Safe flags to file dump_flags_for_cmake(optimization_flags, output_text) - # Print flags to stdout - for flags in optimization_flags: - print(flags) - ##### ##### Execute as script. ##### diff --git a/bindings/python/src/instantiations.cpp b/cmake/microarch_instantiations.cpp similarity index 100% rename from bindings/python/src/instantiations.cpp rename to cmake/microarch_instantiations.cpp diff --git a/cmake/microarch_targets_aaarch64 b/cmake/microarch_targets_aaarch64 new file mode 100644 index 00000000..8783c382 --- /dev/null +++ b/cmake/microarch_targets_aaarch64 @@ -0,0 +1,2 @@ +neoverse_n1 +neoverse_v1 diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 new file mode 100644 index 00000000..438d9d3c --- /dev/null +++ b/cmake/microarch_targets_x86_64 @@ -0,0 +1,6 @@ +nehalem +haswell +skylake_avx512 +cascadelake +icelake_client +sapphirerapids diff --git a/cmake/options.cmake b/cmake/options.cmake index f925d114..7bc7a7be 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -140,14 +140,6 @@ else() target_compile_options(${SVS_LIB} INTERFACE -DSVS_INITIALIZE_LOGGER=0) endif() -##### -##### Helper target to apply relevant compiler optimizations. -##### - -add_library(svs_native_options INTERFACE) -add_library(svs::native_options ALIAS svs_native_options) -target_compile_options(svs_native_options INTERFACE -DSVS_CPUARCH_NATIVE -march=native -mtune=native) - # Use an internal INTERFACE target to apply the same build options to both the # unit test and the compiled binaries. add_library(svs_compile_options INTERFACE) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index b9f1c98e..a3da0fce 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -24,10 +24,10 @@ endif() # # [1] A simple executable is one that takes no commandline arguments. function(create_simple_example exe test file) - add_executable(${exe} ${file}) + add_executable(${exe} ${file} ${MICROARCH_OBJECT_FILES}) target_include_directories(${exe} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) # Link to our library - target_link_libraries(${exe} ${SVS_LIB} svs_compile_options svs_native_options) + target_link_libraries(${exe} ${SVS_LIB} svs_compile_options svs_microarch_options_base) # Create a test. # No-op if the `include(CTest)` line above is not executed. add_test(${test} ${exe}) @@ -49,9 +49,9 @@ configure_file(../../data/test_dataset/queries_f32.fvecs . COPYONLY) configure_file(../../data/test_dataset/groundtruth_euclidean.ivecs . COPYONLY) # The vamana test executable. -add_executable(vamana vamana.cpp) +add_executable(vamana vamana.cpp ${MICROARCH_OBJECT_FILES}) target_include_directories(vamana PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_link_libraries(vamana ${SVS_LIB} svs_compile_options svs_native_options) +target_link_libraries(vamana ${SVS_LIB} svs_compile_options svs_microarch_options_base) add_test( NAME test_vamana COMMAND @@ -79,9 +79,9 @@ add_test( ##### Dispatcher ##### -add_executable(dispatcher dispatcher.cpp) +add_executable(dispatcher dispatcher.cpp ${MICROARCH_OBJECT_FILES}) target_include_directories(dispatcher PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_link_libraries(dispatcher ${SVS_LIB} svs_compile_options svs_native_options) +target_link_libraries(dispatcher ${SVS_LIB} svs_compile_options svs_microarch_options_base) # Here we go. add_test( diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 27e5c122..d25301dd 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -21,9 +21,7 @@ namespace svs::arch { enum class CPUArch { -#if defined(SVS_CPUARCH_NATIVE) - native, -#elif defined(__x86_64__) +#if defined(__x86_64__) // Refer to the GCC docs for the list of targeted architectures: // https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html nehalem, @@ -39,6 +37,7 @@ enum class CPUArch { x86_64_v4 = skylake_avx512, cascadelake, cooperlake, + icelake_client, icelake_server, sapphirerapids, emeraldrapids = sapphirerapids, @@ -53,10 +52,7 @@ enum class CPUArch { inline bool arch_is_supported(CPUArch arch) { switch (arch) { -#if defined(SVS_CPUARCH_NATIVE) - case CPUArch::native: - return true; -#elif defined(__x86_64__) +#if defined(__x86_64__) case CPUArch::nehalem: return check_extensions(std::vector{ ISAExt::MMX, @@ -116,7 +112,7 @@ inline bool arch_is_supported(CPUArch arch) { case CPUArch::cooperlake: return arch_is_supported(CPUArch::cascadelake) && check_extensions(std::vector{ISAExt::AVX512_BF16}); - case CPUArch::icelake_server: + case CPUArch::icelake_client: return arch_is_supported(CPUArch::cooperlake) && check_extensions(std::vector{ ISAExt::PKU, @@ -129,7 +125,10 @@ inline bool arch_is_supported(CPUArch arch) { ISAExt::VPCLMULQDQ, ISAExt::AVX512_BITALG, ISAExt::RDPID, - ISAExt::AVX512_VPOPCNTDQ, + ISAExt::AVX512_VPOPCNTDQ}); + case CPUArch::icelake_server: + return arch_is_supported(CPUArch::icelake_client) && + check_extensions(std::vector{ ISAExt::PCONFIG, ISAExt::WBNOINVD, ISAExt::CLWB}); @@ -161,7 +160,8 @@ inline bool arch_is_supported(CPUArch arch) { #elif defined(__aarch64__) // TODO: complete lists of supported extensions case CPUArch::neoverse_n1: - return check_extensions(std::vector{ISAExt::SVE}) case CPUArch::neoverse_v1: + return check_extensions(std::vector{ISAExt::SVE}); + case CPUArch::neoverse_v1: return arch_is_supported(CPUArch::neoverse_n1) && check_extensions(std::vector{ISAExt::SVE2}); #endif @@ -171,47 +171,90 @@ inline bool arch_is_supported(CPUArch arch) { } class CPUArchEnvironment { - public: - static CPUArchEnvironment& get_instance() { - // TODO: ensure thread safety - static CPUArchEnvironment instance; - return instance; - } - CPUArch get_cpu_arch() const { return max_arch_; } + public: + static CPUArchEnvironment& get_instance() { + // TODO: ensure thread safety + static CPUArchEnvironment instance; + return instance; + } + CPUArch get_cpu_arch() const { return max_arch_; } - private: - CPUArchEnvironment() { - const std::vector compiled_archs = { -#if defined(SVS_CPUARCH_NATIVE) - CPUArch::native, -#elif defined(__x86_64__) - // TODO: add support for dynamic list of compiled archs - CPUArch::haswell, - CPUArch::broadwell, - CPUArch::skylake, - CPUArch::skylake_avx512, - CPUArch::cascadelake, - CPUArch::sapphirerapids, + private: + CPUArchEnvironment() { + const std::vector compiled_archs = { +#if defined(SVS_CPUARCH_SUPPORT_native) + CPUArch::native, +#endif +#if defined(__x86_64__) + #if defined(SVS_CPUARCH_SUPPORT_nehalem) + CPUArch::nehalem, + #endif + #if defined(SVS_CPUARCH_SUPPORT_westmere) + CPUArch::westmere, + #endif + #if defined(SVS_CPUARCH_SUPPORT_sandybridge) + CPUArch::sandybridge, + #endif + #if defined(SVS_CPUARCH_SUPPORT_ivybridge) + CPUArch::ivybridge, + #endif + #if defined(SVS_CPUARCH_SUPPORT_haswell) + CPUArch::haswell, + #endif + #if defined(SVS_CPUARCH_SUPPORT_broadwell) + CPUArch::broadwell, + #endif + #if defined(SVS_CPUARCH_SUPPORT_skylake) + CPUArch::skylake, + #endif + #if defined(SVS_CPUARCH_SUPPORT_skylake_avx512) + CPUArch::skylake_avx512, + #endif + #if defined(SVS_CPUARCH_SUPPORT_cascadelake) + CPUArch::cascadelake, + #endif + #if defined(SVS_CPUARCH_SUPPORT_cooperlake) + CPUArch::cooperlake, + #endif + #if defined(SVS_CPUARCH_SUPPORT_icelake_client) + CPUArch::icelake_client, + #endif + #if defined(SVS_CPUARCH_SUPPORT_icelake_server) + CPUArch::icelake_server, + #endif + #if defined(SVS_CPUARCH_SUPPORT_sapphirerapids) + CPUArch::sapphirerapids, + #endif + #if defined(SVS_CPUARCH_SUPPORT_graniterapids) + CPUArch::graniterapids, + #endif + #if defined(SVS_CPUARCH_SUPPORT_graniterapids_d) + CPUArch::graniterapids_d, + #endif #elif defined(__aarch64__) - CPUArch::neoverse_n1, - CPUArch::neoverse_v1, + #if defined(SVS_CPUARCH_SUPPORT_neoverse_n1) + CPUArch::neoverse_n1, + #endif + #if defined(SVS_CPUARCH_SUPPORT_neoverse_v1) + CPUArch::neoverse_v1, + #endif #endif - }; - compiled_archs_ = compiled_archs; - max_arch_ = CPUArch::baseline; - for (const auto& arch : compiled_archs_) { - if (arch_is_supported(arch)) { - supported_archs_.push_back(arch); - if (static_cast(arch) > static_cast(max_arch_)) { - max_arch_ = arch; + }; + compiled_archs_ = compiled_archs; + max_arch_ = CPUArch::baseline; + for (const auto& arch : compiled_archs_) { + if (arch_is_supported(arch)) { + supported_archs_.push_back(arch); + if (static_cast(arch) > static_cast(max_arch_)) { + max_arch_ = arch; + } } } } - } - std::vector compiled_archs_; - std::vector supported_archs_; - CPUArch max_arch_; + std::vector compiled_archs_; + std::vector supported_archs_; + CPUArch max_arch_; }; #define SVS_PACK_ARGS(...) __VA_ARGS__ @@ -219,23 +262,19 @@ class CPUArchEnvironment { case svs::arch::CPUArch::cpuarch: \ return cls::method(args); \ break; -#if defined(SVS_CPUARCH_NATIVE) -#define SVS_TARGET_CPUARCH svs::arch::CPUArch::native - -#define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ - return cls::method(args); -#elif defined(__x86_64__) #define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET +#if defined(__x86_64__) + #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ svs::arch::CPUArch cpu_arch = \ svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ switch (cpu_arch) { \ + SVS_CLASS_METHOD_CPUARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_CPUARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_CPUARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_CPUARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(icelake_client, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_CPUARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) \ default: \ return cls::method(args); \ diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h index 2509a17a..c18c6266 100644 --- a/include/svs/lib/cpuid.h +++ b/include/svs/lib/cpuid.h @@ -86,7 +86,7 @@ enum class ISAExt { // AVX family AVX_VNNI, - // AVX512_ family + // AVX512 family AVX512_F, AVX512_VL, AVX512_BW, @@ -293,7 +293,7 @@ inline const std::unordered_map ISAExtInfo = { {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, }; -#endif // if defined(__aarch64__) +#endif inline bool check_extension(ISAExt ext) { return ISAExtInfo.at(ext).get_value(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 45054826..6a040384 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -186,7 +186,7 @@ if (SVS_EXPERIMENTAL_ENABLE_NUMA) list(APPEND TEST_SOURCES ${NUMA_TESTS}) endif() -add_executable(tests ${TEST_SOURCES}) +add_executable(tests ${TEST_SOURCES} ${MICROARCH_OBJECT_FILES}) # Path to the test dataset. set(DATA_DIRECTORY "${PROJECT_SOURCE_DIR}/data") @@ -196,7 +196,7 @@ target_compile_definitions(tests PRIVATE SVS_TEST_DATA_DIR="${DATA_DIRECTORY}") target_link_libraries(tests PRIVATE ${SVS_LIB}) target_link_libraries( - tests PRIVATE svs_compile_options svs_native_options svs_benchmark_library + tests PRIVATE svs_compile_options svs_microarch_options_base svs_benchmark_library ) target_link_libraries(tests PRIVATE Catch2::Catch2WithMain) diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 85c6b316..d4935abb 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -13,7 +13,7 @@ # limitations under the License. function(create_utility exe file) - add_executable(${exe} ${file}) + add_executable(${exe} ${file} ${MICROARCH_OBJECT_FILES}) target_include_directories( ${exe} PRIVATE ${CMAKE_CURRENT_LIST_DIR} @@ -22,7 +22,7 @@ function(create_utility exe file) target_link_libraries(${exe} PRIVATE ${SVS_LIB}) # Get common compiler options with the unit tests. - target_link_libraries(${exe} PRIVATE svs_compile_options svs_native_options) + target_link_libraries(${exe} PRIVATE svs_compile_options svs_microarch_options_base) # Link with third-party executables. target_link_libraries(${exe} PRIVATE fmt::fmt) From 83588cf9a2ff4dcf46d60bc58eebd13bdf9d11e9 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 29 Apr 2025 08:16:59 -0700 Subject: [PATCH 13/65] Install archspec in all github workflows; add uarch target lists to license exceptions --- .github/workflows/build-linux-arm.yml | 3 +++ .github/workflows/build-linux.yml | 3 +++ .github/workflows/build-macos.yaml | 3 +++ .licenserc.yaml | 2 ++ 4 files changed, 11 insertions(+) diff --git a/.github/workflows/build-linux-arm.yml b/.github/workflows/build-linux-arm.yml index 07811f00..d08f9ad5 100644 --- a/.github/workflows/build-linux-arm.yml +++ b/.github/workflows/build-linux-arm.yml @@ -47,6 +47,9 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Install archspec + run: python -m pip install archspec + - name: Configure build working-directory: ${{ runner.temp }} env: diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index a03bdd78..fd07a192 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -56,6 +56,9 @@ jobs: source /opt/intel/oneapi/setvars.sh printenv >> $GITHUB_ENV + - name: Install archspec + run: python -m pip install archspec + - name: Configure build working-directory: ${{ runner.temp }} env: diff --git a/.github/workflows/build-macos.yaml b/.github/workflows/build-macos.yaml index a382d525..edd249e4 100644 --- a/.github/workflows/build-macos.yaml +++ b/.github/workflows/build-macos.yaml @@ -46,6 +46,9 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Install archspec + run: python -m pip install archspec + - name: Install Compiler run: | echo "Installing ${{ matrix.package }}..." diff --git a/.licenserc.yaml b/.licenserc.yaml index 815de7ee..b8347ad6 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -45,6 +45,8 @@ header: - 'THIRD-PARTY-PROGRAMS' - '.github/renovate.json' - 'cmake/mkl_functions' + - 'cmake/microarch_targets_aaarch64' + - 'cmake/microarch_targets_x86_64' - 'cmake/patches/tomlplusplus_v330.patch' - 'docker/x86_64/manylinux2014/oneAPI.repo' - 'docs/cpp/index/loader-compatibility.csv' From 27790b8e011fde038fd009e33ced51de0427aebd Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 29 Apr 2025 08:29:48 -0700 Subject: [PATCH 14/65] Add error message for unknown CMAKE_SYSTEM_PROCESSOR --- cmake/cpuarch.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/cpuarch.cmake b/cmake/cpuarch.cmake index 5d7e3a83..b0b64423 100644 --- a/cmake/cpuarch.cmake +++ b/cmake/cpuarch.cmake @@ -23,8 +23,11 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_x86_64" SVS_MICROARCHS) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_aarch64" SVS_MICROARCHS) +else() + message(FATAL_ERROR "Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") endif() + # Try to find the Python executable. # # If it's given as part of the Cmake arguments given by "scikit build", then use that. From 5fe8239f193c6dbc87f960b18f33c5558da42c0a Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 29 Apr 2025 08:58:07 -0700 Subject: [PATCH 15/65] Fixes for aarch64 uArch targets list --- cmake/cpuarch.cmake | 2 +- cmake/{microarch_targets_aaarch64 => microarch_targets_aarch64} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cmake/{microarch_targets_aaarch64 => microarch_targets_aarch64} (100%) diff --git a/cmake/cpuarch.cmake b/cmake/cpuarch.cmake index b0b64423..f61156a6 100644 --- a/cmake/cpuarch.cmake +++ b/cmake/cpuarch.cmake @@ -21,7 +21,7 @@ set(svs_microarch_cmake_included true) # which is used to build base object files, shared libs and executables if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_x86_64" SVS_MICROARCHS) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_aarch64" SVS_MICROARCHS) else() message(FATAL_ERROR "Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") diff --git a/cmake/microarch_targets_aaarch64 b/cmake/microarch_targets_aarch64 similarity index 100% rename from cmake/microarch_targets_aaarch64 rename to cmake/microarch_targets_aarch64 From 8980d7df952f361505b307bfaae5eacaba9ad15a Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 29 Apr 2025 09:16:39 -0700 Subject: [PATCH 16/65] Fix license exception --- .licenserc.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.licenserc.yaml b/.licenserc.yaml index b8347ad6..b066e4f8 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -45,7 +45,7 @@ header: - 'THIRD-PARTY-PROGRAMS' - '.github/renovate.json' - 'cmake/mkl_functions' - - 'cmake/microarch_targets_aaarch64' + - 'cmake/microarch_targets_aarch64' - 'cmake/microarch_targets_x86_64' - 'cmake/patches/tomlplusplus_v330.patch' - 'docker/x86_64/manylinux2014/oneAPI.repo' From 0c88c7306dd3c5e0527bcca04baef1b3ee909acb Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 29 Apr 2025 09:16:55 -0700 Subject: [PATCH 17/65] Add host microarch output --- .github/workflows/build-linux-arm.yml | 6 ++++-- .github/workflows/build-linux.yml | 6 ++++-- .github/workflows/build-macos.yaml | 6 ++++-- .github/workflows/cibuildwheel.yml | 5 +++++ 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-linux-arm.yml b/.github/workflows/build-linux-arm.yml index d08f9ad5..aa089bdc 100644 --- a/.github/workflows/build-linux-arm.yml +++ b/.github/workflows/build-linux-arm.yml @@ -47,8 +47,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install archspec - run: python -m pip install archspec + - name: Install archspec and get host microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" - name: Configure build working-directory: ${{ runner.temp }} diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index fd07a192..7a7e9eb5 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -56,8 +56,10 @@ jobs: source /opt/intel/oneapi/setvars.sh printenv >> $GITHUB_ENV - - name: Install archspec - run: python -m pip install archspec + - name: Install archspec and get host microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" - name: Configure build working-directory: ${{ runner.temp }} diff --git a/.github/workflows/build-macos.yaml b/.github/workflows/build-macos.yaml index edd249e4..c1054e38 100644 --- a/.github/workflows/build-macos.yaml +++ b/.github/workflows/build-macos.yaml @@ -46,8 +46,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install archspec - run: python -m pip install archspec + - name: Install archspec and get host microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" - name: Install Compiler run: | diff --git a/.github/workflows/cibuildwheel.yml b/.github/workflows/cibuildwheel.yml index 87198b98..780d8baa 100644 --- a/.github/workflows/cibuildwheel.yml +++ b/.github/workflows/cibuildwheel.yml @@ -43,6 +43,11 @@ jobs: - name: Install cibuildwheel run: python -m pip install cibuildwheel + - name: Install archspec and get host microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" + # Install inside the temporary working directory. - name: Build Wheel env: From 9e2934cbaa5f8225052c86a968cebbb2a987d381 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 29 Apr 2025 12:33:15 -0700 Subject: [PATCH 18/65] Fix aarch64 support --- .github/workflows/build-linux-arm.yml | 2 +- .github/workflows/build-linux.yml | 2 +- .github/workflows/build-macos.yaml | 2 +- .github/workflows/cibuildwheel.yml | 2 +- cmake/microarch_targets_aarch64 | 2 +- include/svs/lib/arch.h | 162 +++++++++++++------------- 6 files changed, 85 insertions(+), 87 deletions(-) diff --git a/.github/workflows/build-linux-arm.yml b/.github/workflows/build-linux-arm.yml index aa089bdc..5c073eed 100644 --- a/.github/workflows/build-linux-arm.yml +++ b/.github/workflows/build-linux-arm.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install archspec and get host microarch + - name: Install archspec and Get Host Microarch run: | python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index 7a7e9eb5..de143064 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -56,7 +56,7 @@ jobs: source /opt/intel/oneapi/setvars.sh printenv >> $GITHUB_ENV - - name: Install archspec and get host microarch + - name: Install archspec and Get Host Microarch run: | python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" diff --git a/.github/workflows/build-macos.yaml b/.github/workflows/build-macos.yaml index c1054e38..24d5a89e 100644 --- a/.github/workflows/build-macos.yaml +++ b/.github/workflows/build-macos.yaml @@ -46,7 +46,7 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install archspec and get host microarch + - name: Install archspec and Get Host Microarch run: | python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" diff --git a/.github/workflows/cibuildwheel.yml b/.github/workflows/cibuildwheel.yml index 780d8baa..39cd64f3 100644 --- a/.github/workflows/cibuildwheel.yml +++ b/.github/workflows/cibuildwheel.yml @@ -43,7 +43,7 @@ jobs: - name: Install cibuildwheel run: python -m pip install cibuildwheel - - name: Install archspec and get host microarch + - name: Install archspec and Get Host Microarch run: | python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" diff --git a/cmake/microarch_targets_aarch64 b/cmake/microarch_targets_aarch64 index 8783c382..fe512ced 100644 --- a/cmake/microarch_targets_aarch64 +++ b/cmake/microarch_targets_aarch64 @@ -1,2 +1,2 @@ -neoverse_n1 neoverse_v1 +neoverse_n2 diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index d25301dd..9259d2f1 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -44,8 +44,8 @@ enum class CPUArch { graniterapids, graniterapids_d, #elif defined(__aarch64__) - neoverse_n1, neoverse_v1, + neoverse_n2, #endif baseline = 0, }; @@ -129,9 +129,7 @@ inline bool arch_is_supported(CPUArch arch) { case CPUArch::icelake_server: return arch_is_supported(CPUArch::icelake_client) && check_extensions(std::vector{ - ISAExt::PCONFIG, - ISAExt::WBNOINVD, - ISAExt::CLWB}); + ISAExt::PCONFIG, ISAExt::WBNOINVD, ISAExt::CLWB}); case CPUArch::sapphirerapids: return arch_is_supported(CPUArch::icelake_server) && check_extensions(std::vector{ @@ -159,10 +157,10 @@ inline bool arch_is_supported(CPUArch arch) { check_extensions(std::vector{ISAExt::AMX_COMPLEX}); #elif defined(__aarch64__) // TODO: complete lists of supported extensions - case CPUArch::neoverse_n1: - return check_extensions(std::vector{ISAExt::SVE}); case CPUArch::neoverse_v1: - return arch_is_supported(CPUArch::neoverse_n1) && + return check_extensions(std::vector{ISAExt::SVE}); + case CPUArch::neoverse_n2: + return arch_is_supported(CPUArch::neoverse_v1) && check_extensions(std::vector{ISAExt::SVE2}); #endif default: @@ -171,90 +169,90 @@ inline bool arch_is_supported(CPUArch arch) { } class CPUArchEnvironment { - public: - static CPUArchEnvironment& get_instance() { - // TODO: ensure thread safety - static CPUArchEnvironment instance; - return instance; - } - CPUArch get_cpu_arch() const { return max_arch_; } + public: + static CPUArchEnvironment& get_instance() { + // TODO: ensure thread safety + static CPUArchEnvironment instance; + return instance; + } + CPUArch get_cpu_arch() const { return max_arch_; } - private: - CPUArchEnvironment() { - const std::vector compiled_archs = { + private: + CPUArchEnvironment() { + const std::vector compiled_archs = { #if defined(SVS_CPUARCH_SUPPORT_native) - CPUArch::native, + CPUArch::native, #endif #if defined(__x86_64__) - #if defined(SVS_CPUARCH_SUPPORT_nehalem) - CPUArch::nehalem, - #endif - #if defined(SVS_CPUARCH_SUPPORT_westmere) - CPUArch::westmere, - #endif - #if defined(SVS_CPUARCH_SUPPORT_sandybridge) - CPUArch::sandybridge, - #endif - #if defined(SVS_CPUARCH_SUPPORT_ivybridge) - CPUArch::ivybridge, - #endif - #if defined(SVS_CPUARCH_SUPPORT_haswell) - CPUArch::haswell, - #endif - #if defined(SVS_CPUARCH_SUPPORT_broadwell) - CPUArch::broadwell, - #endif - #if defined(SVS_CPUARCH_SUPPORT_skylake) - CPUArch::skylake, - #endif - #if defined(SVS_CPUARCH_SUPPORT_skylake_avx512) - CPUArch::skylake_avx512, - #endif - #if defined(SVS_CPUARCH_SUPPORT_cascadelake) - CPUArch::cascadelake, - #endif - #if defined(SVS_CPUARCH_SUPPORT_cooperlake) - CPUArch::cooperlake, - #endif - #if defined(SVS_CPUARCH_SUPPORT_icelake_client) - CPUArch::icelake_client, - #endif - #if defined(SVS_CPUARCH_SUPPORT_icelake_server) - CPUArch::icelake_server, - #endif - #if defined(SVS_CPUARCH_SUPPORT_sapphirerapids) - CPUArch::sapphirerapids, - #endif - #if defined(SVS_CPUARCH_SUPPORT_graniterapids) - CPUArch::graniterapids, - #endif - #if defined(SVS_CPUARCH_SUPPORT_graniterapids_d) - CPUArch::graniterapids_d, - #endif +#if defined(SVS_CPUARCH_SUPPORT_nehalem) + CPUArch::nehalem, +#endif +#if defined(SVS_CPUARCH_SUPPORT_westmere) + CPUArch::westmere, +#endif +#if defined(SVS_CPUARCH_SUPPORT_sandybridge) + CPUArch::sandybridge, +#endif +#if defined(SVS_CPUARCH_SUPPORT_ivybridge) + CPUArch::ivybridge, +#endif +#if defined(SVS_CPUARCH_SUPPORT_haswell) + CPUArch::haswell, +#endif +#if defined(SVS_CPUARCH_SUPPORT_broadwell) + CPUArch::broadwell, +#endif +#if defined(SVS_CPUARCH_SUPPORT_skylake) + CPUArch::skylake, +#endif +#if defined(SVS_CPUARCH_SUPPORT_skylake_avx512) + CPUArch::skylake_avx512, +#endif +#if defined(SVS_CPUARCH_SUPPORT_cascadelake) + CPUArch::cascadelake, +#endif +#if defined(SVS_CPUARCH_SUPPORT_cooperlake) + CPUArch::cooperlake, +#endif +#if defined(SVS_CPUARCH_SUPPORT_icelake_client) + CPUArch::icelake_client, +#endif +#if defined(SVS_CPUARCH_SUPPORT_icelake_server) + CPUArch::icelake_server, +#endif +#if defined(SVS_CPUARCH_SUPPORT_sapphirerapids) + CPUArch::sapphirerapids, +#endif +#if defined(SVS_CPUARCH_SUPPORT_graniterapids) + CPUArch::graniterapids, +#endif +#if defined(SVS_CPUARCH_SUPPORT_graniterapids_d) + CPUArch::graniterapids_d, +#endif #elif defined(__aarch64__) - #if defined(SVS_CPUARCH_SUPPORT_neoverse_n1) - CPUArch::neoverse_n1, - #endif - #if defined(SVS_CPUARCH_SUPPORT_neoverse_v1) - CPUArch::neoverse_v1, - #endif +#if defined(SVS_CPUARCH_SUPPORT_neoverse_n1) + CPUArch::neoverse_n1, +#endif +#if defined(SVS_CPUARCH_SUPPORT_neoverse_v1) + CPUArch::neoverse_v1, #endif - }; - compiled_archs_ = compiled_archs; - max_arch_ = CPUArch::baseline; - for (const auto& arch : compiled_archs_) { - if (arch_is_supported(arch)) { - supported_archs_.push_back(arch); - if (static_cast(arch) > static_cast(max_arch_)) { - max_arch_ = arch; - } +#endif + }; + compiled_archs_ = compiled_archs; + max_arch_ = CPUArch::baseline; + for (const auto& arch : compiled_archs_) { + if (arch_is_supported(arch)) { + supported_archs_.push_back(arch); + if (static_cast(arch) > static_cast(max_arch_)) { + max_arch_ = arch; } } } + } - std::vector compiled_archs_; - std::vector supported_archs_; - CPUArch max_arch_; + std::vector compiled_archs_; + std::vector supported_archs_; + CPUArch max_arch_; }; #define SVS_PACK_ARGS(...) __VA_ARGS__ @@ -287,8 +285,8 @@ class CPUArchEnvironment { svs::arch::CPUArch cpu_arch = \ svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ switch (cpu_arch) { \ - SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_n1, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) \ default: \ return cls::method(args); \ break; \ From 2a4ada781d6f409e2e5fe8ca8d1e4ed962cea5cc Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 29 Apr 2025 13:14:25 -0700 Subject: [PATCH 19/65] Fix macOS-ARM support --- .licenserc.yaml | 1 + cmake/cpuarch.cmake | 6 +++- cmake/microarch_targets_aarch64_darwin | 2 ++ include/svs/lib/arch.h | 49 ++++++++++++++++++++++---- 4 files changed, 50 insertions(+), 8 deletions(-) create mode 100644 cmake/microarch_targets_aarch64_darwin diff --git a/.licenserc.yaml b/.licenserc.yaml index b066e4f8..48f920b2 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -46,6 +46,7 @@ header: - '.github/renovate.json' - 'cmake/mkl_functions' - 'cmake/microarch_targets_aarch64' + - 'cmake/microarch_targets_aarch64_darwin' - 'cmake/microarch_targets_x86_64' - 'cmake/patches/tomlplusplus_v330.patch' - 'docker/x86_64/manylinux2014/oneAPI.repo' diff --git a/cmake/cpuarch.cmake b/cmake/cpuarch.cmake index f61156a6..60e46944 100644 --- a/cmake/cpuarch.cmake +++ b/cmake/cpuarch.cmake @@ -22,7 +22,11 @@ set(svs_microarch_cmake_included true) if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_x86_64" SVS_MICROARCHS) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") - file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_aarch64" SVS_MICROARCHS) + if(APPLE) + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_aarch64_darwin" SVS_MICROARCHS) + else() + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_aarch64" SVS_MICROARCHS) + endif() else() message(FATAL_ERROR "Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") endif() diff --git a/cmake/microarch_targets_aarch64_darwin b/cmake/microarch_targets_aarch64_darwin new file mode 100644 index 00000000..b5692e52 --- /dev/null +++ b/cmake/microarch_targets_aarch64_darwin @@ -0,0 +1,2 @@ +m1 +m2 diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 9259d2f1..a39031e1 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -44,8 +44,13 @@ enum class CPUArch { graniterapids, graniterapids_d, #elif defined(__aarch64__) +#if defined(__APPLE__) + m1, + m2, +#else neoverse_v1, neoverse_n2, +#endif #endif baseline = 0, }; @@ -156,12 +161,19 @@ inline bool arch_is_supported(CPUArch arch) { return arch_is_supported(CPUArch::graniterapids) && check_extensions(std::vector{ISAExt::AMX_COMPLEX}); #elif defined(__aarch64__) - // TODO: complete lists of supported extensions +#if defined(__APPLE__) + case CPUArch::m1: + return check_extensions(std::vector{ISAExt::SVE}); + case CPUArch::m2: + return arch_is_supported(CPUArch::m1) && + check_extensions(std::vector{ISAExt::SVE2}); +#else case CPUArch::neoverse_v1: return check_extensions(std::vector{ISAExt::SVE}); case CPUArch::neoverse_n2: return arch_is_supported(CPUArch::neoverse_v1) && check_extensions(std::vector{ISAExt::SVE2}); +#endif #endif default: return false; @@ -180,9 +192,6 @@ class CPUArchEnvironment { private: CPUArchEnvironment() { const std::vector compiled_archs = { -#if defined(SVS_CPUARCH_SUPPORT_native) - CPUArch::native, -#endif #if defined(__x86_64__) #if defined(SVS_CPUARCH_SUPPORT_nehalem) CPUArch::nehalem, @@ -230,12 +239,21 @@ class CPUArchEnvironment { CPUArch::graniterapids_d, #endif #elif defined(__aarch64__) -#if defined(SVS_CPUARCH_SUPPORT_neoverse_n1) - CPUArch::neoverse_n1, +#if defined(__APPLE__) +#if defined(SVS_CPUARCH_SUPPORT_m1) + CPUArch::m1, +#endif +#if defined(SVS_CPUARCH_SUPPORT_m2) + CPUArch::m2, #endif +#else #if defined(SVS_CPUARCH_SUPPORT_neoverse_v1) CPUArch::neoverse_v1, #endif +#if defined(SVS_CPUARCH_SUPPORT_neoverse_n2) + CPUArch::neoverse_n2, +#endif +#endif #endif }; compiled_archs_ = compiled_archs; @@ -279,7 +297,21 @@ class CPUArchEnvironment { break; \ } #elif defined(__aarch64__) -#define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET + +#if defined(__APPLE__) + +#define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ + svs::arch::CPUArch cpu_arch = \ + svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_CPUARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_CPUARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ + } + +#else #define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ svs::arch::CPUArch cpu_arch = \ @@ -291,6 +323,9 @@ class CPUArchEnvironment { return cls::method(args); \ break; \ } + +#endif + #endif #define SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH( \ From a27aa8b6df7957f95b4396c658c9f4e799db14d4 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 30 Apr 2025 02:29:13 -0700 Subject: [PATCH 20/65] Fix macOS-ARM support v2 --- include/svs/lib/arch.h | 4 ++-- include/svs/lib/cpuid.h | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index a39031e1..99f46be1 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -163,10 +163,10 @@ inline bool arch_is_supported(CPUArch arch) { #elif defined(__aarch64__) #if defined(__APPLE__) case CPUArch::m1: - return check_extensions(std::vector{ISAExt::SVE}); + return check_extensions(std::vector{ISAExt::DOTPROD}); case CPUArch::m2: return arch_is_supported(CPUArch::m1) && - check_extensions(std::vector{ISAExt::SVE2}); + check_extensions(std::vector{ISAExt::RNG, ISAExt::BF16}); #else case CPUArch::neoverse_v1: return check_extensions(std::vector{ISAExt::SVE}); diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h index c18c6266..a1153741 100644 --- a/include/svs/lib/cpuid.h +++ b/include/svs/lib/cpuid.h @@ -198,7 +198,15 @@ inline const std::unordered_map ISAExtInfo = { #elif defined(__aarch64__) -enum class ISAExt { SVE, SVE2 }; +enum class ISAExt { + // SVE family + SVE, + SVE2, + + DOTPROD, // ARMv8.4-A + RNG, // ARMv8.5-A + BF16, // ARMv8.6-A +}; // Define register ID values for ARM features detection #define ID_AA64PFR0_EL1 0 @@ -233,8 +241,10 @@ template inline uint64_t read_system_reg() { asm("mrs %0, id_aa64dfr0_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64DFR1_EL1) { asm("mrs %0, id_aa64dfr1_el1" : "=r"(val)); +#if !(defined(__APPLE__)) } else if constexpr (ID == ID_AA64ZFR0_EL1) { asm("mrs %0, id_aa64zfr0_el1" : "=r"(val)); +#endif } else { val = 0; } @@ -270,12 +280,13 @@ struct MSRFlag { case ID_AA64ISAR1_EL1: reg_val = read_system_reg(); break; +#if !(defined(__APPLE__)) case ID_AA64ZFR0_EL1: - // First check if SVE is supported to avoid if (extract_bits(read_system_reg(), 32, 4) != 0) { reg_val = read_system_reg(); } break; +#endif default: return false; } @@ -290,7 +301,12 @@ struct MSRFlag { inline const std::unordered_map ISAExtInfo = { {ISAExt::SVE, {ID_AA64PFR0_EL1, 32, 4, 1, "sve"}}, +#if !(defined(__APPLE__)) {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, +#endif + {ISAExt::DOTPROD, {ID_AA64ISAR0_EL1, 24, 4, 1, "dotprod"}}, + {ISAExt::RNG, {ID_AA64ISAR0_EL1, 60, 4, 1, "rng"}}, + {ISAExt::BF16, {ID_AA64ISAR1_EL1, 44, 4, 1, "bf16"}}, }; #endif From 4c307f767dfe957c70ac201eb968da267f5bf55a Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 30 Apr 2025 02:42:06 -0700 Subject: [PATCH 21/65] Remove unnecessary registers on ARM --- include/svs/lib/cpuid.h | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h index a1153741..683d33eb 100644 --- a/include/svs/lib/cpuid.h +++ b/include/svs/lib/cpuid.h @@ -210,37 +210,19 @@ enum class ISAExt { // Define register ID values for ARM features detection #define ID_AA64PFR0_EL1 0 -#define ID_AA64PFR1_EL1 1 -#define ID_AA64ISAR0_EL1 2 -#define ID_AA64ISAR1_EL1 3 -#define ID_AA64MMFR0_EL1 4 -#define ID_AA64MMFR1_EL1 5 -#define ID_AA64MMFR2_EL1 6 -#define ID_AA64DFR0_EL1 7 -#define ID_AA64DFR1_EL1 8 -#define ID_AA64ZFR0_EL1 9 +#define ID_AA64ISAR0_EL1 1 +#define ID_AA64ISAR1_EL1 2 +#define ID_AA64ZFR0_EL1 3 // Helper template to read system registers with mrs instruction template inline uint64_t read_system_reg() { uint64_t val; if constexpr (ID == ID_AA64PFR0_EL1) { asm("mrs %0, id_aa64pfr0_el1" : "=r"(val)); - } else if constexpr (ID == ID_AA64PFR1_EL1) { - asm("mrs %0, id_aa64pfr1_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64ISAR0_EL1) { asm("mrs %0, id_aa64isar0_el1" : "=r"(val)); } else if constexpr (ID == ID_AA64ISAR1_EL1) { asm("mrs %0, id_aa64isar1_el1" : "=r"(val)); - } else if constexpr (ID == ID_AA64MMFR0_EL1) { - asm("mrs %0, id_aa64mmfr0_el1" : "=r"(val)); - } else if constexpr (ID == ID_AA64MMFR1_EL1) { - asm("mrs %0, id_aa64mmfr1_el1" : "=r"(val)); - } else if constexpr (ID == ID_AA64MMFR2_EL1) { - asm("mrs %0, id_aa64mmfr2_el1" : "=r"(val)); - } else if constexpr (ID == ID_AA64DFR0_EL1) { - asm("mrs %0, id_aa64dfr0_el1" : "=r"(val)); - } else if constexpr (ID == ID_AA64DFR1_EL1) { - asm("mrs %0, id_aa64dfr1_el1" : "=r"(val)); #if !(defined(__APPLE__)) } else if constexpr (ID == ID_AA64ZFR0_EL1) { asm("mrs %0, id_aa64zfr0_el1" : "=r"(val)); @@ -271,9 +253,6 @@ struct MSRFlag { case ID_AA64PFR0_EL1: reg_val = read_system_reg(); break; - case ID_AA64PFR1_EL1: - reg_val = read_system_reg(); - break; case ID_AA64ISAR0_EL1: reg_val = read_system_reg(); break; From c2251dd8a100ed4b3f5ffb2cda799294f5f8b334 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 30 Apr 2025 02:52:22 -0700 Subject: [PATCH 22/65] Change native options in last build target --- examples/cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index a3da0fce..4f18c342 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -64,7 +64,7 @@ add_test( # The custom thread pool executable. add_executable(custom_thread_pool custom_thread_pool.cpp) target_include_directories(custom_thread_pool PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_link_libraries(custom_thread_pool ${SVS_LIB} svs_compile_options svs_native_options) +target_link_libraries(custom_thread_pool ${SVS_LIB} svs_compile_options svs_microarch_options_base) add_test( NAME test_custom_thread_pool COMMAND From fd675dc1e53863c137799bb4241a4123b54e6938 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 30 Apr 2025 04:45:33 -0700 Subject: [PATCH 23/65] Change macOS-ARM check to based on brand string --- cmake/microarch.py | 1 + include/svs/lib/arch.h | 5 ++-- include/svs/lib/cpuid.h | 63 +++++++++++++++++++++++------------------ 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/cmake/microarch.py b/cmake/microarch.py index e1d8ec87..6e6ce58c 100644 --- a/cmake/microarch.py +++ b/cmake/microarch.py @@ -83,6 +83,7 @@ def resolve_compiler(name: str): aliases = { "GNU": "gcc", "Clang": "clang", + "AppleClang": "clang", "IntelLLVM": "oneapi", } return aliases.get(name, name) diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 99f46be1..f1fdcef1 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -163,10 +163,9 @@ inline bool arch_is_supported(CPUArch arch) { #elif defined(__aarch64__) #if defined(__APPLE__) case CPUArch::m1: - return check_extensions(std::vector{ISAExt::DOTPROD}); + return check_extensions(std::vector{ISAExt::M1}); case CPUArch::m2: - return arch_is_supported(CPUArch::m1) && - check_extensions(std::vector{ISAExt::RNG, ISAExt::BF16}); + return check_extensions(std::vector{ISAExt::M2}); #else case CPUArch::neoverse_v1: return check_extensions(std::vector{ISAExt::SVE}); diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h index 683d33eb..b2703f81 100644 --- a/include/svs/lib/cpuid.h +++ b/include/svs/lib/cpuid.h @@ -26,6 +26,10 @@ #include #endif +#if defined(__aarch64__) && defined(__APPLE__) +#include +#endif + namespace svs::arch { #if defined(__x86_64__) @@ -198,35 +202,52 @@ inline const std::unordered_map ISAExtInfo = { #elif defined(__aarch64__) +#if defined(__APPLE__) + +enum class ISAExt { + M1, + M2, +}; + +struct BrandInfo { + const char* name; + + bool get_value() const { + char buffer[256]; + size_t size = sizeof(buffer); + + if (sysctlbyname("machdep.cpu.brand_string", &buffer, &size, nullptr, 0) == 0) { + std::string brand(buffer); + return brand.find(name) != std::string::npos; + } + + return false; + } +}; + +inline const std::unordered_map ISAExtInfo = { + {ISAExt::M1, {"M1"}}, + {ISAExt::M2, {"M2"}}, +}; + +#else + enum class ISAExt { - // SVE family SVE, SVE2, - - DOTPROD, // ARMv8.4-A - RNG, // ARMv8.5-A - BF16, // ARMv8.6-A }; // Define register ID values for ARM features detection #define ID_AA64PFR0_EL1 0 -#define ID_AA64ISAR0_EL1 1 -#define ID_AA64ISAR1_EL1 2 -#define ID_AA64ZFR0_EL1 3 +#define ID_AA64ZFR0_EL1 1 // Helper template to read system registers with mrs instruction template inline uint64_t read_system_reg() { uint64_t val; if constexpr (ID == ID_AA64PFR0_EL1) { asm("mrs %0, id_aa64pfr0_el1" : "=r"(val)); - } else if constexpr (ID == ID_AA64ISAR0_EL1) { - asm("mrs %0, id_aa64isar0_el1" : "=r"(val)); - } else if constexpr (ID == ID_AA64ISAR1_EL1) { - asm("mrs %0, id_aa64isar1_el1" : "=r"(val)); -#if !(defined(__APPLE__)) } else if constexpr (ID == ID_AA64ZFR0_EL1) { asm("mrs %0, id_aa64zfr0_el1" : "=r"(val)); -#endif } else { val = 0; } @@ -253,19 +274,11 @@ struct MSRFlag { case ID_AA64PFR0_EL1: reg_val = read_system_reg(); break; - case ID_AA64ISAR0_EL1: - reg_val = read_system_reg(); - break; - case ID_AA64ISAR1_EL1: - reg_val = read_system_reg(); - break; -#if !(defined(__APPLE__)) case ID_AA64ZFR0_EL1: if (extract_bits(read_system_reg(), 32, 4) != 0) { reg_val = read_system_reg(); } break; -#endif default: return false; } @@ -280,14 +293,10 @@ struct MSRFlag { inline const std::unordered_map ISAExtInfo = { {ISAExt::SVE, {ID_AA64PFR0_EL1, 32, 4, 1, "sve"}}, -#if !(defined(__APPLE__)) {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, -#endif - {ISAExt::DOTPROD, {ID_AA64ISAR0_EL1, 24, 4, 1, "dotprod"}}, - {ISAExt::RNG, {ID_AA64ISAR0_EL1, 60, 4, 1, "rng"}}, - {ISAExt::BF16, {ID_AA64ISAR1_EL1, 44, 4, 1, "bf16"}}, }; +#endif #endif inline bool check_extension(ISAExt ext) { return ISAExtInfo.at(ext).get_value(); } From 5ee411e99945a64ce8b6216597cad8eabd6a4f0d Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 30 Apr 2025 06:38:40 -0700 Subject: [PATCH 24/65] Align naming: cpuarch -> microarch --- CMakeLists.txt | 2 +- cmake/{cpuarch.cmake => microarch.cmake} | 4 +- cmake/microarch_instantiations.cpp | 6 +- include/svs/core/distance/cosine.h | 44 +-- include/svs/core/distance/euclidean.h | 58 ++-- include/svs/core/distance/inner_product.h | 58 ++-- include/svs/lib/arch.h | 270 +++++++++--------- tests/svs/core/distances/cosine.cpp | 9 +- .../svs/core/distances/distance_euclidean.cpp | 4 +- tests/svs/core/distances/inner_product.cpp | 4 +- 10 files changed, 230 insertions(+), 229 deletions(-) rename cmake/{cpuarch.cmake => microarch.cmake} (98%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a9b86c0..985f6c39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,7 +67,7 @@ target_compile_options( include("cmake/options.cmake") -include("cmake/cpuarch.cmake") +include("cmake/microarch.cmake") include("cmake/clang-tidy.cmake") include("cmake/eve.cmake") include("cmake/pthread.cmake") diff --git a/cmake/cpuarch.cmake b/cmake/microarch.cmake similarity index 98% rename from cmake/cpuarch.cmake rename to cmake/microarch.cmake index 60e46944..68236c5a 100644 --- a/cmake/cpuarch.cmake +++ b/cmake/microarch.cmake @@ -95,11 +95,11 @@ list(POP_FRONT OPTIMIZATION_FLAGS BASE_OPT_FLAGS) string(REPLACE "," ";" BASE_OPT_FLAGS ${BASE_OPT_FLAGS}) message("Opt.flags[base=${BASE_MICROARCH}]: ${BASE_OPT_FLAGS}") -target_compile_options(svs_microarch_options_base INTERFACE ${BASE_OPT_FLAGS} -DSVS_CPUARCH_SUPPORT_${BASE_MICROARCH}) +target_compile_options(svs_microarch_options_base INTERFACE ${BASE_OPT_FLAGS} -DSVS_MICROARCH_SUPPORT_${BASE_MICROARCH}) foreach(MICROARCH OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) # Tell the microarch dispatcher to include this microarch branch - target_compile_options(svs_microarch_options_base INTERFACE -DSVS_CPUARCH_SUPPORT_${MICROARCH}) + target_compile_options(svs_microarch_options_base INTERFACE -DSVS_MICROARCH_SUPPORT_${MICROARCH}) string(REPLACE "," ";" OPT_FLAGS ${OPT_FLAGS}) message("Opt.flags[${MICROARCH}]: ${OPT_FLAGS}") diff --git a/cmake/microarch_instantiations.cpp b/cmake/microarch_instantiations.cpp index e31e0ecc..9f60fe32 100644 --- a/cmake/microarch_instantiations.cpp +++ b/cmake/microarch_instantiations.cpp @@ -18,6 +18,6 @@ #include "svs/core/distance/euclidean.h" #include "svs/core/distance/inner_product.h" -SVS_INSTANTIATE_COSINE_DISTANCE_BY_CPUARCH -SVS_INSTANTIATE_L2_DISTANCE_BY_CPUARCH -SVS_INSTANTIATE_IP_DISTANCE_BY_CPUARCH +SVS_INSTANTIATE_COSINE_DISTANCE_BY_MICROARCH +SVS_INSTANTIATE_L2_DISTANCE_BY_MICROARCH +SVS_INSTANTIATE_IP_DISTANCE_BY_MICROARCH diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 088d18bd..76eb26f1 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -33,7 +33,7 @@ namespace svs::distance { // Forward declare implementation to allow entry point to be near the top. -template +template struct CosineSimilarityImpl; // Generic Entry Point @@ -43,7 +43,7 @@ struct CosineSimilarityImpl; // (2) CosineSimilarity::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -template class CosineSimilarity { +template class CosineSimilarity { public: template static constexpr float compute(const Ea* a, const Eb* b, float a_norm, size_t N) { @@ -141,13 +141,13 @@ float compute(DistanceCosineSimilarity distance, std::span a, std::span< assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - SVS_DISPATCH_CLASS_BY_CPUARCH( + SVS_DISPATCH_CLASS_BY_MICROARCH( CosineSimilarity, compute, SVS_PACK_ARGS(a.data(), b.data(), distance.norm_, a.size()) ); } else { - SVS_DISPATCH_CLASS_BY_CPUARCH( + SVS_DISPATCH_CLASS_BY_MICROARCH( CosineSimilarity, compute, SVS_PACK_ARGS(a.data(), b.data(), distance.norm_) @@ -176,7 +176,7 @@ float generic_cosine_similarity( return result / (a_norm * std::sqrt(accum)); }; -template +template struct CosineSimilarityImpl { static float compute( const Ea* a, @@ -235,7 +235,7 @@ template <> struct CosineFloatOp<16> : public svs::simd::ConvertToFloat<16> { // Small Integers SVS_VALIDATE_BOOL_ENV(SVS_AVX512_VNNI) #if SVS_AVX512_VNNI -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -261,7 +261,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -289,7 +289,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -297,7 +297,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -305,7 +305,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -313,7 +313,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -321,7 +321,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -329,7 +329,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -341,29 +341,29 @@ template struct CosineSimilarityImpl struct L2Impl; +template struct L2Impl; // Generic Entry Point // Call as one of either: @@ -81,7 +81,7 @@ template struct L2 // (2) L2::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -template class L2 { +template class L2 { public: template static constexpr float compute(const Ea* a, const Eb* b, size_t N) { @@ -156,11 +156,11 @@ float compute(DistanceL2 /*unused*/, std::span a, std::span b) { assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - SVS_DISPATCH_CLASS_BY_CPUARCH( + SVS_DISPATCH_CLASS_BY_MICROARCH( L2, compute, SVS_PACK_ARGS(a.data(), b.data(), a.size()) ); } else { - SVS_DISPATCH_CLASS_BY_CPUARCH( + SVS_DISPATCH_CLASS_BY_MICROARCH( L2, compute, SVS_PACK_ARGS(a.data(), b.data()) ); } @@ -182,7 +182,7 @@ float generic_l2( return result; } -template struct L2Impl { +template struct L2Impl { static constexpr float compute(const Ea* a, const Eb* b, lib::MaybeStatic length = lib::MaybeStatic()) { return generic_l2(a, b, length); @@ -257,14 +257,14 @@ template <> struct L2VNNIOp : public svs::simd::ConvertForVNNI struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2VNNIOp(), a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2VNNIOp(), a, b, length); @@ -274,42 +274,42 @@ template struct L2Impl { #endif // Floating and Mixed Types -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); }; }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); }; }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); @@ -325,7 +325,7 @@ template struct L2Impl { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -345,7 +345,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -367,7 +367,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -388,7 +388,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -412,7 +412,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -439,7 +439,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -469,16 +469,16 @@ template struct L2Impl { #endif // NOTE: dispatching doesn't work for other L2 instances than the listed below. -#define SVS_INSTANTIATE_L2_DISTANCE_BY_CPUARCH \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, int8_t, int8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, uint8_t, uint8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, float) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, uint8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, int8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, float, svs::float16::Float16) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(L2, svs::float16::Float16, float) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES( \ - L2, svs::float16::Float16, svs::float16::Float16 \ +#define SVS_INSTANTIATE_L2_DISTANCE_BY_MICROARCH \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, int8_t, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, uint8_t, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, float, float) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, float, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, float, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, float, svs::float16::Float16) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, svs::float16::Float16, float) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + L2, svs::float16::Float16, svs::float16::Float16 \ ) } // namespace svs::distance diff --git a/include/svs/core/distance/inner_product.h b/include/svs/core/distance/inner_product.h index 3402c4a6..9acc4a86 100644 --- a/include/svs/core/distance/inner_product.h +++ b/include/svs/core/distance/inner_product.h @@ -33,7 +33,7 @@ namespace svs::distance { // Forward declare implementation to allow entry point to be near the top. -template struct IPImpl; +template struct IPImpl; // Generic Entry Point // Call as one of either: @@ -42,7 +42,7 @@ template struct IP // (2) IP::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -template class IP { +template class IP { public: template static constexpr float compute(const Ea* a, const Eb* b, size_t N) { @@ -118,11 +118,11 @@ float compute(DistanceIP /*unused*/, std::span a, std::span b) { assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - SVS_DISPATCH_CLASS_BY_CPUARCH( + SVS_DISPATCH_CLASS_BY_MICROARCH( IP, compute, SVS_PACK_ARGS(a.data(), b.data(), a.size()) ); } else { - SVS_DISPATCH_CLASS_BY_CPUARCH( + SVS_DISPATCH_CLASS_BY_MICROARCH( IP, compute, SVS_PACK_ARGS(a.data(), b.data()) ); } @@ -143,7 +143,7 @@ float generic_ip( return result; } -template struct IPImpl { +template struct IPImpl { static float compute(const Ea* a, const Eb* b, lib::MaybeStatic length = lib::MaybeStatic()) { return generic_ip(a, b, length); @@ -212,14 +212,14 @@ template <> struct IPVNNIOp : public svs::simd::ConvertForVNNI struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(IPVNNIOp(), a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(IPVNNIOp(), a, b, length); @@ -229,42 +229,42 @@ template struct IPImpl { #endif // Floating and Mixed Types -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); }; }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); }; }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); @@ -279,7 +279,7 @@ template struct IPImpl { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -298,7 +298,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -319,7 +319,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -339,7 +339,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -362,7 +362,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -388,7 +388,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -417,16 +417,16 @@ template struct IPImpl { #endif // NOTE: dispatching doesn't work for other IP instances than the listed below. -#define SVS_INSTANTIATE_IP_DISTANCE_BY_CPUARCH \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, int8_t, int8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, uint8_t, uint8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, float) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, uint8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, int8_t) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, float, svs::float16::Float16) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(IP, svs::float16::Float16, float) \ - SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES( \ - IP, svs::float16::Float16, svs::float16::Float16 \ +#define SVS_INSTANTIATE_IP_DISTANCE_BY_MICROARCH \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, int8_t, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, uint8_t, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, float, float) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, float, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, float, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, float, svs::float16::Float16) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, svs::float16::Float16, float) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + IP, svs::float16::Float16, svs::float16::Float16 \ ) } // namespace svs::distance diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index f1fdcef1..816ac016 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -20,7 +20,7 @@ namespace svs::arch { -enum class CPUArch { +enum class MicroArch { #if defined(__x86_64__) // Refer to the GCC docs for the list of targeted architectures: // https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html @@ -55,10 +55,10 @@ enum class CPUArch { baseline = 0, }; -inline bool arch_is_supported(CPUArch arch) { +inline bool arch_is_supported(MicroArch arch) { switch (arch) { #if defined(__x86_64__) - case CPUArch::nehalem: + case MicroArch::nehalem: return check_extensions(std::vector{ ISAExt::MMX, ISAExt::SSE, @@ -71,18 +71,18 @@ inline bool arch_is_supported(CPUArch arch) { ISAExt::CX16, ISAExt::SAHF, ISAExt::FXSR}); - case CPUArch::westmere: - return arch_is_supported(CPUArch::nehalem) && + case MicroArch::westmere: + return arch_is_supported(MicroArch::nehalem) && check_extensions(std::vector{ISAExt::PCLMUL}); - case CPUArch::sandybridge: - return arch_is_supported(CPUArch::westmere) && + case MicroArch::sandybridge: + return arch_is_supported(MicroArch::westmere) && check_extensions(std::vector{ISAExt::AVX, ISAExt::XSAVE}); - case CPUArch::ivybridge: - return arch_is_supported(CPUArch::sandybridge) && + case MicroArch::ivybridge: + return arch_is_supported(MicroArch::sandybridge) && check_extensions(std::vector{ ISAExt::FSGSBASE, ISAExt::RDRND, ISAExt::F16C}); - case CPUArch::haswell: - return arch_is_supported(CPUArch::ivybridge) && + case MicroArch::haswell: + return arch_is_supported(MicroArch::ivybridge) && check_extensions(std::vector{ ISAExt::AVX2, ISAExt::BMI, @@ -90,20 +90,20 @@ inline bool arch_is_supported(CPUArch arch) { ISAExt::LZCNT, ISAExt::FMA, ISAExt::MOVBE}); - case CPUArch::broadwell: - return arch_is_supported(CPUArch::haswell) && + case MicroArch::broadwell: + return arch_is_supported(MicroArch::haswell) && check_extensions(std::vector{ ISAExt::RDSEED, ISAExt::ADCX, ISAExt::PREFETCHW}); - case CPUArch::skylake: - return arch_is_supported(CPUArch::broadwell) && + case MicroArch::skylake: + return arch_is_supported(MicroArch::broadwell) && check_extensions(std::vector{ ISAExt::AES, ISAExt::CLFLUSHOPT, ISAExt::XSAVEC, ISAExt::XSAVES, ISAExt::SGX}); - case CPUArch::skylake_avx512: - return arch_is_supported(CPUArch::skylake) && + case MicroArch::skylake_avx512: + return arch_is_supported(MicroArch::skylake) && check_extensions(std::vector{ ISAExt::AVX512_F, ISAExt::CLWB, @@ -111,14 +111,14 @@ inline bool arch_is_supported(CPUArch arch) { ISAExt::AVX512_BW, ISAExt::AVX512_DQ, ISAExt::AVX512_CD}); - case CPUArch::cascadelake: - return arch_is_supported(CPUArch::skylake_avx512) && + case MicroArch::cascadelake: + return arch_is_supported(MicroArch::skylake_avx512) && check_extensions(std::vector{ISAExt::AVX512_VNNI}); - case CPUArch::cooperlake: - return arch_is_supported(CPUArch::cascadelake) && + case MicroArch::cooperlake: + return arch_is_supported(MicroArch::cascadelake) && check_extensions(std::vector{ISAExt::AVX512_BF16}); - case CPUArch::icelake_client: - return arch_is_supported(CPUArch::cooperlake) && + case MicroArch::icelake_client: + return arch_is_supported(MicroArch::cooperlake) && check_extensions(std::vector{ ISAExt::PKU, ISAExt::AVX512_VBMI, @@ -131,12 +131,12 @@ inline bool arch_is_supported(CPUArch arch) { ISAExt::AVX512_BITALG, ISAExt::RDPID, ISAExt::AVX512_VPOPCNTDQ}); - case CPUArch::icelake_server: - return arch_is_supported(CPUArch::icelake_client) && + case MicroArch::icelake_server: + return arch_is_supported(MicroArch::icelake_client) && check_extensions(std::vector{ ISAExt::PCONFIG, ISAExt::WBNOINVD, ISAExt::CLWB}); - case CPUArch::sapphirerapids: - return arch_is_supported(CPUArch::icelake_server) && + case MicroArch::sapphirerapids: + return arch_is_supported(MicroArch::icelake_server) && check_extensions(std::vector{ ISAExt::MOVDIRI, ISAExt::MOVDIR64B, @@ -153,24 +153,24 @@ inline bool arch_is_supported(CPUArch arch) { ISAExt::AVX_VNNI, ISAExt::AVX512_FP16, ISAExt::AVX512_BF16}); - case CPUArch::graniterapids: - return arch_is_supported(CPUArch::sapphirerapids) && + case MicroArch::graniterapids: + return arch_is_supported(MicroArch::sapphirerapids) && check_extensions(std::vector{ISAExt::AMX_FP16, ISAExt::PREFETCHI} ); - case CPUArch::graniterapids_d: - return arch_is_supported(CPUArch::graniterapids) && + case MicroArch::graniterapids_d: + return arch_is_supported(MicroArch::graniterapids) && check_extensions(std::vector{ISAExt::AMX_COMPLEX}); #elif defined(__aarch64__) #if defined(__APPLE__) - case CPUArch::m1: + case MicroArch::m1: return check_extensions(std::vector{ISAExt::M1}); - case CPUArch::m2: + case MicroArch::m2: return check_extensions(std::vector{ISAExt::M2}); #else - case CPUArch::neoverse_v1: + case MicroArch::neoverse_v1: return check_extensions(std::vector{ISAExt::SVE}); - case CPUArch::neoverse_n2: - return arch_is_supported(CPUArch::neoverse_v1) && + case MicroArch::neoverse_n2: + return arch_is_supported(MicroArch::neoverse_v1) && check_extensions(std::vector{ISAExt::SVE2}); #endif #endif @@ -179,84 +179,84 @@ inline bool arch_is_supported(CPUArch arch) { } } -class CPUArchEnvironment { +class MicroArchEnvironment { public: - static CPUArchEnvironment& get_instance() { + static MicroArchEnvironment& get_instance() { // TODO: ensure thread safety - static CPUArchEnvironment instance; + static MicroArchEnvironment instance; return instance; } - CPUArch get_cpu_arch() const { return max_arch_; } + MicroArch get_cpu_arch() const { return max_arch_; } private: - CPUArchEnvironment() { - const std::vector compiled_archs = { + MicroArchEnvironment() { + const std::vector compiled_archs = { #if defined(__x86_64__) -#if defined(SVS_CPUARCH_SUPPORT_nehalem) - CPUArch::nehalem, +#if defined(SVS_MICROARCH_SUPPORT_nehalem) + MicroArch::nehalem, #endif -#if defined(SVS_CPUARCH_SUPPORT_westmere) - CPUArch::westmere, +#if defined(SVS_MICROARCH_SUPPORT_westmere) + MicroArch::westmere, #endif -#if defined(SVS_CPUARCH_SUPPORT_sandybridge) - CPUArch::sandybridge, +#if defined(SVS_MICROARCH_SUPPORT_sandybridge) + MicroArch::sandybridge, #endif -#if defined(SVS_CPUARCH_SUPPORT_ivybridge) - CPUArch::ivybridge, +#if defined(SVS_MICROARCH_SUPPORT_ivybridge) + MicroArch::ivybridge, #endif -#if defined(SVS_CPUARCH_SUPPORT_haswell) - CPUArch::haswell, +#if defined(SVS_MICROARCH_SUPPORT_haswell) + MicroArch::haswell, #endif -#if defined(SVS_CPUARCH_SUPPORT_broadwell) - CPUArch::broadwell, +#if defined(SVS_MICROARCH_SUPPORT_broadwell) + MicroArch::broadwell, #endif -#if defined(SVS_CPUARCH_SUPPORT_skylake) - CPUArch::skylake, +#if defined(SVS_MICROARCH_SUPPORT_skylake) + MicroArch::skylake, #endif -#if defined(SVS_CPUARCH_SUPPORT_skylake_avx512) - CPUArch::skylake_avx512, +#if defined(SVS_MICROARCH_SUPPORT_skylake_avx512) + MicroArch::skylake_avx512, #endif -#if defined(SVS_CPUARCH_SUPPORT_cascadelake) - CPUArch::cascadelake, +#if defined(SVS_MICROARCH_SUPPORT_cascadelake) + MicroArch::cascadelake, #endif -#if defined(SVS_CPUARCH_SUPPORT_cooperlake) - CPUArch::cooperlake, +#if defined(SVS_MICROARCH_SUPPORT_cooperlake) + MicroArch::cooperlake, #endif -#if defined(SVS_CPUARCH_SUPPORT_icelake_client) - CPUArch::icelake_client, +#if defined(SVS_MICROARCH_SUPPORT_icelake_client) + MicroArch::icelake_client, #endif -#if defined(SVS_CPUARCH_SUPPORT_icelake_server) - CPUArch::icelake_server, +#if defined(SVS_MICROARCH_SUPPORT_icelake_server) + MicroArch::icelake_server, #endif -#if defined(SVS_CPUARCH_SUPPORT_sapphirerapids) - CPUArch::sapphirerapids, +#if defined(SVS_MICROARCH_SUPPORT_sapphirerapids) + MicroArch::sapphirerapids, #endif -#if defined(SVS_CPUARCH_SUPPORT_graniterapids) - CPUArch::graniterapids, +#if defined(SVS_MICROARCH_SUPPORT_graniterapids) + MicroArch::graniterapids, #endif -#if defined(SVS_CPUARCH_SUPPORT_graniterapids_d) - CPUArch::graniterapids_d, +#if defined(SVS_MICROARCH_SUPPORT_graniterapids_d) + MicroArch::graniterapids_d, #endif #elif defined(__aarch64__) #if defined(__APPLE__) -#if defined(SVS_CPUARCH_SUPPORT_m1) - CPUArch::m1, +#if defined(SVS_MICROARCH_SUPPORT_m1) + MicroArch::m1, #endif -#if defined(SVS_CPUARCH_SUPPORT_m2) - CPUArch::m2, +#if defined(SVS_MICROARCH_SUPPORT_m2) + MicroArch::m2, #endif #else -#if defined(SVS_CPUARCH_SUPPORT_neoverse_v1) - CPUArch::neoverse_v1, +#if defined(SVS_MICROARCH_SUPPORT_neoverse_v1) + MicroArch::neoverse_v1, #endif -#if defined(SVS_CPUARCH_SUPPORT_neoverse_n2) - CPUArch::neoverse_n2, +#if defined(SVS_MICROARCH_SUPPORT_neoverse_n2) + MicroArch::neoverse_n2, #endif #endif #endif }; compiled_archs_ = compiled_archs; - max_arch_ = CPUArch::baseline; + max_arch_ = MicroArch::baseline; for (const auto& arch : compiled_archs_) { if (arch_is_supported(arch)) { supported_archs_.push_back(arch); @@ -267,87 +267,87 @@ class CPUArchEnvironment { } } - std::vector compiled_archs_; - std::vector supported_archs_; - CPUArch max_arch_; + std::vector compiled_archs_; + std::vector supported_archs_; + MicroArch max_arch_; }; #define SVS_PACK_ARGS(...) __VA_ARGS__ -#define SVS_CLASS_METHOD_CPUARCH_CASE(cpuarch, cls, method, args) \ - case svs::arch::CPUArch::cpuarch: \ - return cls::method(args); \ +#define SVS_CLASS_METHOD_MICROARCH_CASE(microarch, cls, method, args) \ + case svs::arch::MicroArch::microarch: \ + return cls::method(args); \ break; -#define SVS_TARGET_CPUARCH svs::arch::CPUArch::SVS_TUNE_TARGET +#define SVS_TARGET_MICROARCH svs::arch::MicroArch::SVS_TUNE_TARGET #if defined(__x86_64__) -#define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ - svs::arch::CPUArch cpu_arch = \ - svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ - switch (cpu_arch) { \ - SVS_CLASS_METHOD_CPUARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(icelake_client, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) \ - default: \ - return cls::method(args); \ - break; \ +#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ + svs::arch::MicroArch cpu_arch = \ + svs::arch::MicroArchEnvironment::get_instance().get_cpu_arch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE(icelake_client, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ } #elif defined(__aarch64__) #if defined(__APPLE__) -#define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ - svs::arch::CPUArch cpu_arch = \ - svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ - switch (cpu_arch) { \ - SVS_CLASS_METHOD_CPUARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) \ - default: \ - return cls::method(args); \ - break; \ +#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ + svs::arch::MicroArch cpu_arch = \ + svs::arch::MicroArchEnvironment::get_instance().get_cpu_arch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ } #else -#define SVS_DISPATCH_CLASS_BY_CPUARCH(cls, method, args) \ - svs::arch::CPUArch cpu_arch = \ - svs::arch::CPUArchEnvironment::get_instance().get_cpu_arch(); \ - switch (cpu_arch) { \ - SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_CPUARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) \ - default: \ - return cls::method(args); \ - break; \ +#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ + svs::arch::MicroArch cpu_arch = \ + svs::arch::MicroArchEnvironment::get_instance().get_cpu_arch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ } #endif #endif -#define SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH( \ +#define SVS_INST_CLASS_METHOD_TMPL_BY_MICROARCH( \ return_type, cls, method, template_args, args \ ) \ - template return_type cls::method(args); + template return_type cls::method(args); // Generic distance dispatching macro -#define SVS_INST_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(cls, a_type, b_type) \ - SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH( \ - float, \ - svs::distance::cls, \ - compute, \ - SVS_PACK_ARGS(a_type, b_type), \ - SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long) \ +#define SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(cls, a_type, b_type) \ + SVS_INST_CLASS_METHOD_TMPL_BY_MICROARCH( \ + float, \ + svs::distance::cls, \ + compute, \ + SVS_PACK_ARGS(a_type, b_type), \ + SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long) \ ) // Cosine distance dispatching macro -#define SVS_INST_COSINE_DISTANCE_CLASS_BY_CPUARCH_AND_TYPENAMES(cls, a_type, b_type) \ - SVS_INST_CLASS_METHOD_TMPL_BY_CPUARCH( \ - float, \ - svs::distance::cls, \ - compute, \ - SVS_PACK_ARGS(a_type, b_type), \ - SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long) \ +#define SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(cls, a_type, b_type) \ + SVS_INST_CLASS_METHOD_TMPL_BY_MICROARCH( \ + float, \ + svs::distance::cls, \ + compute, \ + SVS_PACK_ARGS(a_type, b_type), \ + SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long) \ ) } // namespace svs::arch diff --git a/tests/svs/core/distances/cosine.cpp b/tests/svs/core/distances/cosine.cpp index c18fb9e8..24915d97 100644 --- a/tests/svs/core/distances/cosine.cpp +++ b/tests/svs/core/distances/cosine.cpp @@ -86,14 +86,15 @@ void test_types(T lo, T hi, size_t num_tests) { auto a_norm = svs::distance::norm(std::span{a.data(), a.size()}); CATCH_REQUIRE( // TODO: replace baseline with something else? - (svs::distance::CosineSimilarity::compute( + (svs::distance::CosineSimilarity::compute( a.data(), b.data(), a_norm ) == expected) ); // Dynamically Sized Computation - auto dist = svs::distance::CosineSimilarity::compute( - a.data(), b.data(), a_norm, N - ); + auto dist = + svs::distance::CosineSimilarity::compute( + a.data(), b.data(), a_norm, N + ); CATCH_REQUIRE((dist == expected)); } } diff --git a/tests/svs/core/distances/distance_euclidean.cpp b/tests/svs/core/distances/distance_euclidean.cpp index 216d5bc5..1e375b43 100644 --- a/tests/svs/core/distances/distance_euclidean.cpp +++ b/tests/svs/core/distances/distance_euclidean.cpp @@ -69,13 +69,13 @@ void test_types(T lo, T hi, size_t num_tests) { // Statically Sized Computation CATCH_REQUIRE( - (svs::distance::L2::compute( + (svs::distance::L2::compute( a.data(), b.data() ) == expected) ); // Dynamically Sized Computation CATCH_REQUIRE( - (svs::distance::L2::compute( + (svs::distance::L2::compute( a.data(), b.data(), N ) == expected) ); diff --git a/tests/svs/core/distances/inner_product.cpp b/tests/svs/core/distances/inner_product.cpp index 5ff2865e..b5f0462e 100644 --- a/tests/svs/core/distances/inner_product.cpp +++ b/tests/svs/core/distances/inner_product.cpp @@ -77,13 +77,13 @@ void test_types(T lo, T hi, size_t num_tests) { // Statically Sized Computation CATCH_REQUIRE( - (svs::distance::IP::compute( + (svs::distance::IP::compute( a.data(), b.data() ) == expected) ); // Dynamically Sized Computation CATCH_REQUIRE( - (svs::distance::IP::compute( + (svs::distance::IP::compute( a.data(), b.data(), N ) == expected) ); From 085114a88f6af40f104dcf073c6fba969898aa29 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 30 Apr 2025 06:39:13 -0700 Subject: [PATCH 25/65] Remove uArchs spec in setup.py --- bindings/python/setup.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 83c46e54..7e8aa254 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -13,7 +13,6 @@ # limitations under the License. from skbuild import setup -import archspec.cpu as cpu import os # If building in a cibuildwheel context, compile multiple versions of the library for @@ -25,27 +24,6 @@ "-DCMAKE_EXPORT_COMPILE_COMMANDS=YES", ] -# Utility to convert micro-architecture strings to -def target(arch): - return cpu.TARGETS[arch] - -# TODO: Replace with externally-specified list -svs_microarchs = [ - # "x86_64_v3" # This is the default target for base lib compilation - "broadwell", - "skylake", - "skylake_avx512", - "cascadelake", - # TODO: Add support for other architectures (archspec does not support them yet) - # "cooperlake", - # "icelake_server", - "sapphirerapids", - # "graniterapids", - # "graniterapids_d", - ] -cmake_array = ";".join(svs_microarchs) -cmake_args.append(f"-DSVS_MICROARCHS={cmake_array}") - # Determine the root of the repository base_dir = os.path.relpath(os.path.join(os.path.dirname(__file__), '..', '..')) From a86397f117ccdb460a0da2ba8019d20bb2997173 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Fri, 2 May 2025 01:11:30 -0700 Subject: [PATCH 26/65] Add basic uArch python bindings; fix avx512bf16 support mistake --- bindings/python/src/python_bindings.cpp | 30 +++++++++++ bindings/python/tests/test_microarch.py | 27 ++++++++++ include/svs/lib/arch.h | 68 +++++++++++++++++++++++-- 3 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 bindings/python/tests/test_microarch.py diff --git a/bindings/python/src/python_bindings.cpp b/bindings/python/src/python_bindings.cpp index 30e03acc..85922412 100644 --- a/bindings/python/src/python_bindings.cpp +++ b/bindings/python/src/python_bindings.cpp @@ -26,6 +26,7 @@ // SVS dependencies #include "svs/core/distance.h" #include "svs/core/io.h" +#include "svs/lib/arch.h" #include "svs/lib/array.h" #include "svs/lib/datatype.h" #include "svs/lib/float16.h" @@ -185,6 +186,35 @@ Convert the `fvecs` file on disk with 32-bit floating point entries to a `fvecs` )" ); + // Get name of current microarch + m.def( + "microarch", + []() { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + return svs::arch::microarch_to_string(env.get_microarch()); + }, + "Returns current microarchitecture." + ); + + // Get list of supported microarchs + m.def( + "supported_microarchs", + []() { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + const auto& supported_archs = env.get_supported_microarchs(); + + std::vector result; + result.reserve(supported_archs.size()); + + for (const auto& arch : supported_archs) { + result.push_back(svs::arch::microarch_to_string(arch)); + } + + return result; + }, + "Returns a list of supported microarchitectures." + ); + wrap_conversion(m); // Allocators diff --git a/bindings/python/tests/test_microarch.py b/bindings/python/tests/test_microarch.py new file mode 100644 index 00000000..c98a3889 --- /dev/null +++ b/bindings/python/tests/test_microarch.py @@ -0,0 +1,27 @@ +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import svs +import archspec.cpu as cpu + + +class MicroarchTester(unittest.TestCase): + def test_microarch(self): + supported_microarchs = svs.supported_microarchs() + archspec_host_name = cpu.host().name + if archspec_host_name == "icelake": + archspec_host_name = "icelake_client" + if archspec_host_name in supported_microarchs: + self.assertTrue(archspec_host_name == svs.microarch()) diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 816ac016..470e9567 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -115,10 +115,12 @@ inline bool arch_is_supported(MicroArch arch) { return arch_is_supported(MicroArch::skylake_avx512) && check_extensions(std::vector{ISAExt::AVX512_VNNI}); case MicroArch::cooperlake: + // N.B.: Cooper Lake supports AVX512_BF16, Ice Lake - doesn't, Sapphire Rapids + // and newer - do return arch_is_supported(MicroArch::cascadelake) && check_extensions(std::vector{ISAExt::AVX512_BF16}); case MicroArch::icelake_client: - return arch_is_supported(MicroArch::cooperlake) && + return arch_is_supported(MicroArch::cascadelake) && check_extensions(std::vector{ ISAExt::PKU, ISAExt::AVX512_VBMI, @@ -179,6 +181,58 @@ inline bool arch_is_supported(MicroArch arch) { } } +// Function to convert MicroArch enum to string +inline std::string microarch_to_string(MicroArch arch) { + switch (arch) { +#if defined(__x86_64__) + case MicroArch::nehalem: + return "nehalem"; + case MicroArch::westmere: + return "westmere"; + case MicroArch::sandybridge: + return "sandybridge"; + case MicroArch::ivybridge: + return "ivybridge"; + case MicroArch::haswell: + return "haswell"; + case MicroArch::broadwell: + return "broadwell"; + case MicroArch::skylake: + return "skylake"; + case MicroArch::skylake_avx512: + return "skylake_avx512"; + case MicroArch::cascadelake: + return "cascadelake"; + case MicroArch::cooperlake: + return "cooperlake"; + case MicroArch::icelake_client: + return "icelake_client"; + case MicroArch::icelake_server: + return "icelake_server"; + case MicroArch::sapphirerapids: + return "sapphirerapids"; + case MicroArch::graniterapids: + return "graniterapids"; + case MicroArch::graniterapids_d: + return "graniterapids_d"; +#elif defined(__aarch64__) +#if defined(__APPLE__) + case MicroArch::m1: + return "m1"; + case MicroArch::m2: + return "m2"; +#else + case MicroArch::neoverse_v1: + return "neoverse_v1"; + case MicroArch::neoverse_n2: + return "neoverse_n2"; +#endif +#endif + default: + return "unknown"; + } +} + class MicroArchEnvironment { public: static MicroArchEnvironment& get_instance() { @@ -186,7 +240,11 @@ class MicroArchEnvironment { static MicroArchEnvironment instance; return instance; } - MicroArch get_cpu_arch() const { return max_arch_; } + MicroArch get_microarch() const { return max_arch_; } + + const std::vector& get_supported_microarchs() const { + return supported_archs_; + } private: MicroArchEnvironment() { @@ -283,7 +341,7 @@ class MicroArchEnvironment { #define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ svs::arch::MicroArch cpu_arch = \ - svs::arch::MicroArchEnvironment::get_instance().get_cpu_arch(); \ + svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ switch (cpu_arch) { \ SVS_CLASS_METHOD_MICROARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) \ @@ -301,7 +359,7 @@ class MicroArchEnvironment { #define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ svs::arch::MicroArch cpu_arch = \ - svs::arch::MicroArchEnvironment::get_instance().get_cpu_arch(); \ + svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ switch (cpu_arch) { \ SVS_CLASS_METHOD_MICROARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) \ @@ -314,7 +372,7 @@ class MicroArchEnvironment { #define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ svs::arch::MicroArch cpu_arch = \ - svs::arch::MicroArchEnvironment::get_instance().get_cpu_arch(); \ + svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ switch (cpu_arch) { \ SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) \ From 1f5f531eedc7829fcdc60045b6e4f249c3d51849 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Fri, 2 May 2025 02:56:49 -0700 Subject: [PATCH 27/65] Restructure cpuid and uArch headers --- include/svs/lib/arch.h | 441 ++++++++++++++------------------- include/svs/lib/arch_defines.h | 205 +++++++++++++++ include/svs/lib/cpuid.h | 173 +++++++------ 3 files changed, 485 insertions(+), 334 deletions(-) create mode 100644 include/svs/lib/arch_defines.h diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 470e9567..729ce762 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -16,7 +16,12 @@ #pragma once +#include "svs/lib/arch_defines.h" #include "svs/lib/cpuid.h" +#include +#include +#include +#include namespace svs::arch { @@ -55,182 +60,148 @@ enum class MicroArch { baseline = 0, }; -inline bool arch_is_supported(MicroArch arch) { - switch (arch) { +struct MicroArchInfo { + std::optional parent; + std::vector extensions; + std::string name; +}; + +// Unordered map with MicroArch to MicroArchInfo mapping +inline const std::unordered_map& get_microarch_info_map() { + static const std::unordered_map microarch_info = { #if defined(__x86_64__) - case MicroArch::nehalem: - return check_extensions(std::vector{ - ISAExt::MMX, - ISAExt::SSE, - ISAExt::SSE2, - ISAExt::SSE3, - ISAExt::SSSE3, - ISAExt::SSE4_1, - ISAExt::SSE4_2, - ISAExt::POPCNT, - ISAExt::CX16, - ISAExt::SAHF, - ISAExt::FXSR}); - case MicroArch::westmere: - return arch_is_supported(MicroArch::nehalem) && - check_extensions(std::vector{ISAExt::PCLMUL}); - case MicroArch::sandybridge: - return arch_is_supported(MicroArch::westmere) && - check_extensions(std::vector{ISAExt::AVX, ISAExt::XSAVE}); - case MicroArch::ivybridge: - return arch_is_supported(MicroArch::sandybridge) && - check_extensions(std::vector{ - ISAExt::FSGSBASE, ISAExt::RDRND, ISAExt::F16C}); - case MicroArch::haswell: - return arch_is_supported(MicroArch::ivybridge) && - check_extensions(std::vector{ - ISAExt::AVX2, - ISAExt::BMI, - ISAExt::BMI2, - ISAExt::LZCNT, - ISAExt::FMA, - ISAExt::MOVBE}); - case MicroArch::broadwell: - return arch_is_supported(MicroArch::haswell) && - check_extensions(std::vector{ - ISAExt::RDSEED, ISAExt::ADCX, ISAExt::PREFETCHW}); - case MicroArch::skylake: - return arch_is_supported(MicroArch::broadwell) && - check_extensions(std::vector{ - ISAExt::AES, - ISAExt::CLFLUSHOPT, - ISAExt::XSAVEC, - ISAExt::XSAVES, - ISAExt::SGX}); - case MicroArch::skylake_avx512: - return arch_is_supported(MicroArch::skylake) && - check_extensions(std::vector{ - ISAExt::AVX512_F, - ISAExt::CLWB, - ISAExt::AVX512_VL, - ISAExt::AVX512_BW, - ISAExt::AVX512_DQ, - ISAExt::AVX512_CD}); - case MicroArch::cascadelake: - return arch_is_supported(MicroArch::skylake_avx512) && - check_extensions(std::vector{ISAExt::AVX512_VNNI}); - case MicroArch::cooperlake: - // N.B.: Cooper Lake supports AVX512_BF16, Ice Lake - doesn't, Sapphire Rapids - // and newer - do - return arch_is_supported(MicroArch::cascadelake) && - check_extensions(std::vector{ISAExt::AVX512_BF16}); - case MicroArch::icelake_client: - return arch_is_supported(MicroArch::cascadelake) && - check_extensions(std::vector{ - ISAExt::PKU, - ISAExt::AVX512_VBMI, - ISAExt::AVX512_IFMA, - ISAExt::SHA, - ISAExt::GFNI, - ISAExt::VAES, - ISAExt::AVX512_VBMI2, - ISAExt::VPCLMULQDQ, - ISAExt::AVX512_BITALG, - ISAExt::RDPID, - ISAExt::AVX512_VPOPCNTDQ}); - case MicroArch::icelake_server: - return arch_is_supported(MicroArch::icelake_client) && - check_extensions(std::vector{ - ISAExt::PCONFIG, ISAExt::WBNOINVD, ISAExt::CLWB}); - case MicroArch::sapphirerapids: - return arch_is_supported(MicroArch::icelake_server) && - check_extensions(std::vector{ - ISAExt::MOVDIRI, - ISAExt::MOVDIR64B, - ISAExt::ENQCMD, - ISAExt::CLDEMOTE, - ISAExt::PTWRITE, - ISAExt::WAITPKG, - ISAExt::SERIALIZE, - ISAExt::TSXLDTRK, - ISAExt::UINTR, - ISAExt::AMX_BF16, - ISAExt::AMX_TILE, - ISAExt::AMX_INT8, - ISAExt::AVX_VNNI, - ISAExt::AVX512_FP16, - ISAExt::AVX512_BF16}); - case MicroArch::graniterapids: - return arch_is_supported(MicroArch::sapphirerapids) && - check_extensions(std::vector{ISAExt::AMX_FP16, ISAExt::PREFETCHI} - ); - case MicroArch::graniterapids_d: - return arch_is_supported(MicroArch::graniterapids) && - check_extensions(std::vector{ISAExt::AMX_COMPLEX}); + {MicroArch::nehalem, + {std::nullopt, + {ISAExt::MMX, + ISAExt::SSE, + ISAExt::SSE2, + ISAExt::SSE3, + ISAExt::SSSE3, + ISAExt::SSE4_1, + ISAExt::SSE4_2, + ISAExt::POPCNT, + ISAExt::CX16, + ISAExt::SAHF, + ISAExt::FXSR}, + "nehalem"}}, + {MicroArch::westmere, {MicroArch::nehalem, {ISAExt::PCLMUL}, "westmere"}}, + {MicroArch::sandybridge, + {MicroArch::westmere, {ISAExt::AVX, ISAExt::XSAVE}, "sandybridge"}}, + {MicroArch::ivybridge, + {MicroArch::sandybridge, + {ISAExt::FSGSBASE, ISAExt::RDRND, ISAExt::F16C}, + "ivybridge"}}, + {MicroArch::haswell, + {MicroArch::sandybridge, + {ISAExt::AVX2, + ISAExt::BMI, + ISAExt::BMI2, + ISAExt::LZCNT, + ISAExt::FMA, + ISAExt::MOVBE}, + "haswell"}}, + {MicroArch::broadwell, + {MicroArch::haswell, + {ISAExt::RDSEED, ISAExt::ADCX, ISAExt::PREFETCHW}, + "broadwell"}}, + {MicroArch::skylake, + {MicroArch::broadwell, + {ISAExt::AES, ISAExt::CLFLUSHOPT, ISAExt::XSAVEC, ISAExt::XSAVES, ISAExt::SGX}, + "skylake"}}, + {MicroArch::skylake_avx512, + {MicroArch::skylake, + {ISAExt::AVX512_F, + ISAExt::CLWB, + ISAExt::AVX512_VL, + ISAExt::AVX512_BW, + ISAExt::AVX512_DQ, + ISAExt::AVX512_CD}, + "skylake_avx512"}}, + {MicroArch::cascadelake, + {MicroArch::skylake_avx512, {ISAExt::AVX512_VNNI}, "cascadelake"}}, + {MicroArch::cooperlake, + {MicroArch::cascadelake, {ISAExt::AVX512_BF16}, "cooperlake"}}, + {MicroArch::icelake_client, + {MicroArch::cascadelake, + {ISAExt::PKU, + ISAExt::AVX512_VBMI, + ISAExt::AVX512_IFMA, + ISAExt::SHA, + ISAExt::GFNI, + ISAExt::VAES, + ISAExt::AVX512_VBMI2, + ISAExt::VPCLMULQDQ, + ISAExt::AVX512_BITALG, + ISAExt::RDPID, + ISAExt::AVX512_VPOPCNTDQ}, + "icelake_client"}}, + {MicroArch::icelake_server, + {MicroArch::icelake_client, + {ISAExt::PCONFIG, ISAExt::WBNOINVD, ISAExt::CLWB}, + "icelake_server"}}, + {MicroArch::sapphirerapids, + {MicroArch::icelake_server, + {ISAExt::MOVDIRI, + ISAExt::MOVDIR64B, + ISAExt::ENQCMD, + ISAExt::CLDEMOTE, + ISAExt::PTWRITE, + ISAExt::WAITPKG, + ISAExt::SERIALIZE, + ISAExt::TSXLDTRK, + ISAExt::UINTR, + ISAExt::AMX_BF16, + ISAExt::AMX_TILE, + ISAExt::AMX_INT8, + ISAExt::AVX_VNNI, + ISAExt::AVX512_FP16, + ISAExt::AVX512_BF16}, + "sapphirerapids"}}, + {MicroArch::graniterapids, + {MicroArch::sapphirerapids, + {ISAExt::AMX_FP16, ISAExt::PREFETCHI}, + "graniterapids"}}, + {MicroArch::graniterapids_d, + {MicroArch::graniterapids, {ISAExt::AMX_COMPLEX}, "graniterapids_d"}}, #elif defined(__aarch64__) #if defined(__APPLE__) - case MicroArch::m1: - return check_extensions(std::vector{ISAExt::M1}); - case MicroArch::m2: - return check_extensions(std::vector{ISAExt::M2}); + {MicroArch::m1, {std::nullopt, {ISAExt::M1}, "m1"}}, + {MicroArch::m2, {std::nullopt, {ISAExt::M2}, "m2"}}, #else - case MicroArch::neoverse_v1: - return check_extensions(std::vector{ISAExt::SVE}); - case MicroArch::neoverse_n2: - return arch_is_supported(MicroArch::neoverse_v1) && - check_extensions(std::vector{ISAExt::SVE2}); + {MicroArch::neoverse_v1, {std::nullopt, {ISAExt::SVE}, "neoverse_v1"}}, + {MicroArch::neoverse_n2, {MicroArch::neoverse_v1, {ISAExt::SVE2}, "neoverse_n2"}}, #endif #endif - default: - return false; + {MicroArch::baseline, {std::nullopt, {}, "baseline"}} + }; + return microarch_info; +} + +inline bool arch_is_supported(MicroArch arch) { + const auto& info_map = get_microarch_info_map(); + auto it = info_map.find(arch); + if (it == info_map.end()) { + return false; + } + + const auto& info = it->second; + + // First check if parent architecture is supported + if (info.parent.has_value() && !arch_is_supported(info.parent.value())) { + return false; } + + // Then check additional extensions + return check_extensions(info.extensions); } -// Function to convert MicroArch enum to string inline std::string microarch_to_string(MicroArch arch) { - switch (arch) { -#if defined(__x86_64__) - case MicroArch::nehalem: - return "nehalem"; - case MicroArch::westmere: - return "westmere"; - case MicroArch::sandybridge: - return "sandybridge"; - case MicroArch::ivybridge: - return "ivybridge"; - case MicroArch::haswell: - return "haswell"; - case MicroArch::broadwell: - return "broadwell"; - case MicroArch::skylake: - return "skylake"; - case MicroArch::skylake_avx512: - return "skylake_avx512"; - case MicroArch::cascadelake: - return "cascadelake"; - case MicroArch::cooperlake: - return "cooperlake"; - case MicroArch::icelake_client: - return "icelake_client"; - case MicroArch::icelake_server: - return "icelake_server"; - case MicroArch::sapphirerapids: - return "sapphirerapids"; - case MicroArch::graniterapids: - return "graniterapids"; - case MicroArch::graniterapids_d: - return "graniterapids_d"; -#elif defined(__aarch64__) -#if defined(__APPLE__) - case MicroArch::m1: - return "m1"; - case MicroArch::m2: - return "m2"; -#else - case MicroArch::neoverse_v1: - return "neoverse_v1"; - case MicroArch::neoverse_n2: - return "neoverse_n2"; -#endif -#endif - default: - return "unknown"; + const auto& info_map = get_microarch_info_map(); + auto it = info_map.find(arch); + if (it != info_map.end()) { + return it->second.name; } + return "unknown"; } class MicroArchEnvironment { @@ -250,66 +221,28 @@ class MicroArchEnvironment { MicroArchEnvironment() { const std::vector compiled_archs = { #if defined(__x86_64__) -#if defined(SVS_MICROARCH_SUPPORT_nehalem) - MicroArch::nehalem, -#endif -#if defined(SVS_MICROARCH_SUPPORT_westmere) - MicroArch::westmere, -#endif -#if defined(SVS_MICROARCH_SUPPORT_sandybridge) - MicroArch::sandybridge, -#endif -#if defined(SVS_MICROARCH_SUPPORT_ivybridge) - MicroArch::ivybridge, -#endif -#if defined(SVS_MICROARCH_SUPPORT_haswell) - MicroArch::haswell, -#endif -#if defined(SVS_MICROARCH_SUPPORT_broadwell) - MicroArch::broadwell, -#endif -#if defined(SVS_MICROARCH_SUPPORT_skylake) - MicroArch::skylake, -#endif -#if defined(SVS_MICROARCH_SUPPORT_skylake_avx512) - MicroArch::skylake_avx512, -#endif -#if defined(SVS_MICROARCH_SUPPORT_cascadelake) - MicroArch::cascadelake, -#endif -#if defined(SVS_MICROARCH_SUPPORT_cooperlake) - MicroArch::cooperlake, -#endif -#if defined(SVS_MICROARCH_SUPPORT_icelake_client) - MicroArch::icelake_client, -#endif -#if defined(SVS_MICROARCH_SUPPORT_icelake_server) - MicroArch::icelake_server, -#endif -#if defined(SVS_MICROARCH_SUPPORT_sapphirerapids) - MicroArch::sapphirerapids, -#endif -#if defined(SVS_MICROARCH_SUPPORT_graniterapids) - MicroArch::graniterapids, -#endif -#if defined(SVS_MICROARCH_SUPPORT_graniterapids_d) - MicroArch::graniterapids_d, -#endif + SVS_MICROARCH_COMPILED_nehalem + SVS_MICROARCH_COMPILED_westmere + SVS_MICROARCH_COMPILED_sandybridge + SVS_MICROARCH_COMPILED_ivybridge + SVS_MICROARCH_COMPILED_haswell + SVS_MICROARCH_COMPILED_broadwell + SVS_MICROARCH_COMPILED_skylake + SVS_MICROARCH_COMPILED_skylake_avx512 + SVS_MICROARCH_COMPILED_cascadelake + SVS_MICROARCH_COMPILED_cooperlake + SVS_MICROARCH_COMPILED_icelake_client + SVS_MICROARCH_COMPILED_icelake_server + SVS_MICROARCH_COMPILED_sapphirerapids + SVS_MICROARCH_COMPILED_graniterapids + SVS_MICROARCH_COMPILED_graniterapids_d #elif defined(__aarch64__) #if defined(__APPLE__) -#if defined(SVS_MICROARCH_SUPPORT_m1) - MicroArch::m1, -#endif -#if defined(SVS_MICROARCH_SUPPORT_m2) - MicroArch::m2, -#endif + SVS_MICROARCH_COMPILED_m1 + SVS_MICROARCH_COMPILED_m2 #else -#if defined(SVS_MICROARCH_SUPPORT_neoverse_v1) - MicroArch::neoverse_v1, -#endif -#if defined(SVS_MICROARCH_SUPPORT_neoverse_n2) - MicroArch::neoverse_n2, -#endif + SVS_MICROARCH_COMPILED_neoverse_v1 + SVS_MICROARCH_COMPILED_neoverse_n2 #endif #endif }; @@ -330,28 +263,30 @@ class MicroArchEnvironment { MicroArch max_arch_; }; -#define SVS_PACK_ARGS(...) __VA_ARGS__ -#define SVS_CLASS_METHOD_MICROARCH_CASE(microarch, cls, method, args) \ - case svs::arch::MicroArch::microarch: \ - return cls::method(args); \ - break; -#define SVS_TARGET_MICROARCH svs::arch::MicroArch::SVS_TUNE_TARGET - #if defined(__x86_64__) -#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ - svs::arch::MicroArch cpu_arch = \ - svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ - switch (cpu_arch) { \ - SVS_CLASS_METHOD_MICROARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_MICROARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_MICROARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_MICROARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_MICROARCH_CASE(icelake_client, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_MICROARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) \ - default: \ - return cls::method(args); \ - break; \ +#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ + svs::arch::MicroArch cpu_arch = \ + svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ } #elif defined(__aarch64__) @@ -361,8 +296,8 @@ class MicroArchEnvironment { svs::arch::MicroArch cpu_arch = \ svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ switch (cpu_arch) { \ - SVS_CLASS_METHOD_MICROARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_MICROARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, SVS_PACK_ARGS(args)) \ default: \ return cls::method(args); \ break; \ @@ -370,15 +305,15 @@ class MicroArchEnvironment { #else -#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ - svs::arch::MicroArch cpu_arch = \ - svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ - switch (cpu_arch) { \ - SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) \ - SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) \ - default: \ - return cls::method(args); \ - break; \ +#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ + svs::arch::MicroArch cpu_arch = \ + svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ } #endif diff --git a/include/svs/lib/arch_defines.h b/include/svs/lib/arch_defines.h new file mode 100644 index 00000000..5f7b569e --- /dev/null +++ b/include/svs/lib/arch_defines.h @@ -0,0 +1,205 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define SVS_PACK_ARGS(...) __VA_ARGS__ +#define SVS_CLASS_METHOD_MICROARCH_CASE(microarch, cls, method, args) \ + case svs::arch::MicroArch::microarch: \ + return cls::method(args); \ + break; +#define SVS_TARGET_MICROARCH svs::arch::MicroArch::SVS_TUNE_TARGET + +// TODO: autogenerate this list +#if defined(__x86_64__) + +#if defined(SVS_MICROARCH_SUPPORT_nehalem) +#define SVS_MICROARCH_COMPILED_nehalem MicroArch::nehalem, +#define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_nehalem +#define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_westmere) +#define SVS_MICROARCH_COMPILED_westmere MicroArch::westmere, +#define SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(westmere, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_westmere +#define SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_sandybridge) +#define SVS_MICROARCH_COMPILED_sandybridge MicroArch::sandybridge, +#define SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(sandybridge, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_sandybridge +#define SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_ivybridge) +#define SVS_MICROARCH_COMPILED_ivybridge MicroArch::ivybridge, +#define SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(ivybridge, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_ivybridge +#define SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_haswell) +#define SVS_MICROARCH_COMPILED_haswell MicroArch::haswell, +#define SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_haswell +#define SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_broadwell) +#define SVS_MICROARCH_COMPILED_broadwell MicroArch::broadwell, +#define SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_broadwell +#define SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_skylake) +#define SVS_MICROARCH_COMPILED_skylake MicroArch::skylake, +#define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_skylake +#define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_skylake_avx512) +#define SVS_MICROARCH_COMPILED_skylake_avx512 MicroArch::skylake_avx512, +#define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_skylake_avx512 +#define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_cascadelake) +#define SVS_MICROARCH_COMPILED_cascadelake MicroArch::cascadelake, +#define SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_cascadelake +#define SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_cooperlake) +#define SVS_MICROARCH_COMPILED_cooperlake MicroArch::cooperlake, +#define SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(cooperlake, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_cooperlake +#define SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_icelake_client) +#define SVS_MICROARCH_COMPILED_icelake_client MicroArch::icelake_client, +#define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(icelake_client, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_icelake_client +#define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_icelake_server) +#define SVS_MICROARCH_COMPILED_icelake_server MicroArch::icelake_server, +#define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(icelake_server, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_icelake_server +#define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_sapphirerapids) +#define SVS_MICROARCH_COMPILED_sapphirerapids MicroArch::sapphirerapids, +#define SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_sapphirerapids +#define SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_graniterapids) +#define SVS_MICROARCH_COMPILED_graniterapids MicroArch::graniterapids, +#define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(graniterapids, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_graniterapids +#define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_graniterapids_d) +#define SVS_MICROARCH_COMPILED_graniterapids_d MicroArch::graniterapids_d, +#define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(graniterapids_d, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_graniterapids_d +#define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, args) +#endif + +#elif defined(__aarch64__) + +#if defined(__APPLE__) + +#if defined(SVS_MICROARCH_SUPPORT_m1) +#define SVS_MICROARCH_COMPILED_m1 MicroArch::m1, +#define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_m1 +#define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, SVS_PACK_ARGS(args)) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_m2) +#define SVS_MICROARCH_COMPILED_m2 MicroArch::m2, +#define SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_m2 +#define SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, args) +#endif + +#else + +#if defined(SVS_MICROARCH_SUPPORT_neoverse_v1) +#define SVS_MICROARCH_COMPILED_neoverse_v1 MicroArch::neoverse_v1, +#define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_neoverse_v1 +#define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_neoverse_n2) +#define SVS_MICROARCH_COMPILED_neoverse_n2 MicroArch::neoverse_n2, +#define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_neoverse_n2 +#define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, args) +#endif + +#endif +#endif diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h index b2703f81..6d3d5c15 100644 --- a/include/svs/lib/cpuid.h +++ b/include/svs/lib/cpuid.h @@ -127,78 +127,81 @@ struct CPUIDFlag { } }; -inline const std::unordered_map ISAExtInfo = { - // flags are sorted by function, subfunction, register and bit - {ISAExt::MMX, {1, 0, 3, 23, "MMX"}}, - {ISAExt::FXSR, {1, 0, 3, 24, "FXSR"}}, - {ISAExt::SSE, {1, 0, 3, 25, "SSE"}}, - {ISAExt::SSE2, {1, 0, 3, 26, "SSE2"}}, - {ISAExt::SSE3, {1, 0, 2, 0, "SSE3"}}, - {ISAExt::PCLMUL, {1, 0, 2, 1, "PCLMUL"}}, - {ISAExt::SSSE3, {1, 0, 2, 9, "SSSE3"}}, - {ISAExt::FMA, {1, 0, 2, 12, "FMA"}}, - {ISAExt::CX16, {1, 0, 2, 13, "CX16"}}, - {ISAExt::SSE4_1, {1, 0, 2, 19, "SSE4_1"}}, - {ISAExt::SSE4_2, {1, 0, 2, 20, "SSE4_2"}}, - {ISAExt::MOVBE, {1, 0, 2, 22, "MOVBE"}}, - {ISAExt::POPCNT, {1, 0, 2, 23, "POPCNT"}}, - {ISAExt::AES, {1, 0, 2, 25, "AES"}}, - {ISAExt::XSAVE, {1, 0, 2, 26, "XSAVE"}}, - {ISAExt::AVX, {1, 0, 2, 28, "AVX"}}, - {ISAExt::F16C, {1, 0, 2, 29, "F16C"}}, - {ISAExt::RDRND, {1, 0, 2, 30, "RDRND"}}, - {ISAExt::FSGSBASE, {7, 0, 1, 0, "FSGSBASE"}}, - {ISAExt::SGX, {7, 0, 1, 2, "SGX"}}, - {ISAExt::BMI, {7, 0, 1, 3, "BMI"}}, - {ISAExt::AVX2, {7, 0, 1, 5, "AVX2"}}, - {ISAExt::BMI2, {7, 0, 1, 8, "BMI2"}}, - {ISAExt::AVX512_F, {7, 0, 1, 16, "AVX512_F"}}, - {ISAExt::AVX512_DQ, {7, 0, 1, 17, "AVX512_DQ"}}, - {ISAExt::RDSEED, {7, 0, 1, 18, "RDSEED"}}, - {ISAExt::ADCX, {7, 0, 1, 19, "ADCX"}}, - {ISAExt::AVX512_IFMA, {7, 0, 1, 21, "AVX512_IFMA"}}, - {ISAExt::CLFLUSHOPT, {7, 0, 1, 23, "CLFLUSHOPT"}}, - {ISAExt::CLWB, {7, 0, 1, 24, "CLWB"}}, - {ISAExt::AVX512_CD, {7, 0, 1, 28, "AVX512_CD"}}, - {ISAExt::SHA, {7, 0, 1, 29, "SHA"}}, - {ISAExt::AVX512_BW, {7, 0, 1, 30, "AVX512_BW"}}, - {ISAExt::AVX512_VL, {7, 0, 1, 31, "AVX512_VL"}}, - {ISAExt::AVX512_VBMI, {7, 0, 2, 1, "AVX512_VBMI"}}, - {ISAExt::PKU, {7, 0, 2, 3, "PKU"}}, - {ISAExt::WAITPKG, {7, 0, 2, 5, "WAITPKG"}}, - {ISAExt::AVX512_VBMI2, {7, 0, 2, 6, "AVX512_VBMI2"}}, - {ISAExt::GFNI, {7, 0, 2, 8, "GFNI"}}, - {ISAExt::VAES, {7, 0, 2, 9, "VAES"}}, - {ISAExt::VPCLMULQDQ, {7, 0, 2, 10, "VPCLMULQDQ"}}, - {ISAExt::AVX512_VNNI, {7, 0, 2, 11, "AVX512_VNNI"}}, - {ISAExt::AVX512_BITALG, {7, 0, 2, 12, "AVX512_BITALG"}}, - {ISAExt::AVX512_VPOPCNTDQ, {7, 0, 2, 14, "AVX512_VPOPCNTDQ"}}, - {ISAExt::RDPID, {7, 0, 2, 22, "RDPID"}}, - {ISAExt::CLDEMOTE, {7, 0, 2, 25, "CLDEMOTE"}}, - {ISAExt::MOVDIRI, {7, 0, 2, 27, "MOVDIRI"}}, - {ISAExt::MOVDIR64B, {7, 0, 2, 28, "MOVDIR64B"}}, - {ISAExt::ENQCMD, {7, 0, 2, 29, "ENQCMD"}}, - {ISAExt::UINTR, {7, 0, 3, 5, "UINTR"}}, - {ISAExt::SERIALIZE, {7, 0, 3, 14, "SERIALIZE"}}, - {ISAExt::TSXLDTRK, {7, 0, 3, 16, "TSXLDTRK"}}, - {ISAExt::PCONFIG, {7, 0, 3, 18, "PCONFIG"}}, - {ISAExt::AMX_BF16, {7, 0, 3, 22, "AMX_BF16"}}, - {ISAExt::AVX512_FP16, {7, 0, 3, 23, "AVX512_FP16"}}, - {ISAExt::AMX_TILE, {7, 0, 3, 24, "AMX_TILE"}}, - {ISAExt::AMX_INT8, {7, 0, 3, 25, "AMX_INT8"}}, - {ISAExt::AVX_VNNI, {7, 1, 0, 4, "AVX_VNNI"}}, - {ISAExt::AVX512_BF16, {7, 1, 0, 5, "AVX512_BF16"}}, - {ISAExt::AMX_FP16, {7, 1, 0, 21, "AMX_FP16"}}, - {ISAExt::AMX_COMPLEX, {7, 1, 3, 8, "AMX_COMPLEX"}}, - {ISAExt::PREFETCHI, {7, 1, 3, 14, "PREFETCHI"}}, - {ISAExt::XSAVEC, {0xD, 1, 0, 1, "XSAVEC"}}, - {ISAExt::XSAVES, {0xD, 1, 0, 3, "XSAVES"}}, - {ISAExt::PTWRITE, {0x14, 0, 1, 4, "PTWRITE"}}, - {ISAExt::WBNOINVD, {0x80000008, 0, 1, 9, "WBNOINVD"}}, - {ISAExt::SAHF, {0x80000001, 0, 2, 0, "SAHF"}}, - {ISAExt::LZCNT, {0x80000001, 0, 2, 5, "LZCNT"}}, - {ISAExt::PREFETCHW, {0x80000001, 0, 2, 8, "PREFETCHW"}}, -}; +inline const std::unordered_map& get_isa_ext_info() { + static const std::unordered_map isa_ext_info = { + // flags are sorted by function, subfunction, register and bit + {ISAExt::MMX, {1, 0, 3, 23, "MMX"}}, + {ISAExt::FXSR, {1, 0, 3, 24, "FXSR"}}, + {ISAExt::SSE, {1, 0, 3, 25, "SSE"}}, + {ISAExt::SSE2, {1, 0, 3, 26, "SSE2"}}, + {ISAExt::SSE3, {1, 0, 2, 0, "SSE3"}}, + {ISAExt::PCLMUL, {1, 0, 2, 1, "PCLMUL"}}, + {ISAExt::SSSE3, {1, 0, 2, 9, "SSSE3"}}, + {ISAExt::FMA, {1, 0, 2, 12, "FMA"}}, + {ISAExt::CX16, {1, 0, 2, 13, "CX16"}}, + {ISAExt::SSE4_1, {1, 0, 2, 19, "SSE4_1"}}, + {ISAExt::SSE4_2, {1, 0, 2, 20, "SSE4_2"}}, + {ISAExt::MOVBE, {1, 0, 2, 22, "MOVBE"}}, + {ISAExt::POPCNT, {1, 0, 2, 23, "POPCNT"}}, + {ISAExt::AES, {1, 0, 2, 25, "AES"}}, + {ISAExt::XSAVE, {1, 0, 2, 26, "XSAVE"}}, + {ISAExt::AVX, {1, 0, 2, 28, "AVX"}}, + {ISAExt::F16C, {1, 0, 2, 29, "F16C"}}, + {ISAExt::RDRND, {1, 0, 2, 30, "RDRND"}}, + {ISAExt::FSGSBASE, {7, 0, 1, 0, "FSGSBASE"}}, + {ISAExt::SGX, {7, 0, 1, 2, "SGX"}}, + {ISAExt::BMI, {7, 0, 1, 3, "BMI"}}, + {ISAExt::AVX2, {7, 0, 1, 5, "AVX2"}}, + {ISAExt::BMI2, {7, 0, 1, 8, "BMI2"}}, + {ISAExt::AVX512_F, {7, 0, 1, 16, "AVX512_F"}}, + {ISAExt::AVX512_DQ, {7, 0, 1, 17, "AVX512_DQ"}}, + {ISAExt::RDSEED, {7, 0, 1, 18, "RDSEED"}}, + {ISAExt::ADCX, {7, 0, 1, 19, "ADCX"}}, + {ISAExt::AVX512_IFMA, {7, 0, 1, 21, "AVX512_IFMA"}}, + {ISAExt::CLFLUSHOPT, {7, 0, 1, 23, "CLFLUSHOPT"}}, + {ISAExt::CLWB, {7, 0, 1, 24, "CLWB"}}, + {ISAExt::AVX512_CD, {7, 0, 1, 28, "AVX512_CD"}}, + {ISAExt::SHA, {7, 0, 1, 29, "SHA"}}, + {ISAExt::AVX512_BW, {7, 0, 1, 30, "AVX512_BW"}}, + {ISAExt::AVX512_VL, {7, 0, 1, 31, "AVX512_VL"}}, + {ISAExt::AVX512_VBMI, {7, 0, 2, 1, "AVX512_VBMI"}}, + {ISAExt::PKU, {7, 0, 2, 3, "PKU"}}, + {ISAExt::WAITPKG, {7, 0, 2, 5, "WAITPKG"}}, + {ISAExt::AVX512_VBMI2, {7, 0, 2, 6, "AVX512_VBMI2"}}, + {ISAExt::GFNI, {7, 0, 2, 8, "GFNI"}}, + {ISAExt::VAES, {7, 0, 2, 9, "VAES"}}, + {ISAExt::VPCLMULQDQ, {7, 0, 2, 10, "VPCLMULQDQ"}}, + {ISAExt::AVX512_VNNI, {7, 0, 2, 11, "AVX512_VNNI"}}, + {ISAExt::AVX512_BITALG, {7, 0, 2, 12, "AVX512_BITALG"}}, + {ISAExt::AVX512_VPOPCNTDQ, {7, 0, 2, 14, "AVX512_VPOPCNTDQ"}}, + {ISAExt::RDPID, {7, 0, 2, 22, "RDPID"}}, + {ISAExt::CLDEMOTE, {7, 0, 2, 25, "CLDEMOTE"}}, + {ISAExt::MOVDIRI, {7, 0, 2, 27, "MOVDIRI"}}, + {ISAExt::MOVDIR64B, {7, 0, 2, 28, "MOVDIR64B"}}, + {ISAExt::ENQCMD, {7, 0, 2, 29, "ENQCMD"}}, + {ISAExt::UINTR, {7, 0, 3, 5, "UINTR"}}, + {ISAExt::SERIALIZE, {7, 0, 3, 14, "SERIALIZE"}}, + {ISAExt::TSXLDTRK, {7, 0, 3, 16, "TSXLDTRK"}}, + {ISAExt::PCONFIG, {7, 0, 3, 18, "PCONFIG"}}, + {ISAExt::AMX_BF16, {7, 0, 3, 22, "AMX_BF16"}}, + {ISAExt::AVX512_FP16, {7, 0, 3, 23, "AVX512_FP16"}}, + {ISAExt::AMX_TILE, {7, 0, 3, 24, "AMX_TILE"}}, + {ISAExt::AMX_INT8, {7, 0, 3, 25, "AMX_INT8"}}, + {ISAExt::AVX_VNNI, {7, 1, 0, 4, "AVX_VNNI"}}, + {ISAExt::AVX512_BF16, {7, 1, 0, 5, "AVX512_BF16"}}, + {ISAExt::AMX_FP16, {7, 1, 0, 21, "AMX_FP16"}}, + {ISAExt::AMX_COMPLEX, {7, 1, 3, 8, "AMX_COMPLEX"}}, + {ISAExt::PREFETCHI, {7, 1, 3, 14, "PREFETCHI"}}, + {ISAExt::XSAVEC, {0xD, 1, 0, 1, "XSAVEC"}}, + {ISAExt::XSAVES, {0xD, 1, 0, 3, "XSAVES"}}, + {ISAExt::PTWRITE, {0x14, 0, 1, 4, "PTWRITE"}}, + {ISAExt::WBNOINVD, {0x80000008, 0, 1, 9, "WBNOINVD"}}, + {ISAExt::SAHF, {0x80000001, 0, 2, 0, "SAHF"}}, + {ISAExt::LZCNT, {0x80000001, 0, 2, 5, "LZCNT"}}, + {ISAExt::PREFETCHW, {0x80000001, 0, 2, 8, "PREFETCHW"}}, + }; + return isa_ext_info; +} #elif defined(__aarch64__) @@ -225,10 +228,13 @@ struct BrandInfo { } }; -inline const std::unordered_map ISAExtInfo = { - {ISAExt::M1, {"M1"}}, - {ISAExt::M2, {"M2"}}, -}; +inline const std::unordered_map& get_isa_ext_info() { + static const std::unordered_map isa_ext_info = { + {ISAExt::M1, {"M1"}}, + {ISAExt::M2, {"M2"}}, + }; + return isa_ext_info; +} #else @@ -291,15 +297,20 @@ struct MSRFlag { } }; -inline const std::unordered_map ISAExtInfo = { - {ISAExt::SVE, {ID_AA64PFR0_EL1, 32, 4, 1, "sve"}}, - {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, -}; +inline const std::unordered_map& get_isa_ext_info() { + static const std::unordered_map isa_ext_info = { + {ISAExt::SVE, {ID_AA64PFR0_EL1, 32, 4, 1, "sve"}}, + {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, + }; + return isa_ext_info; +} #endif #endif -inline bool check_extension(ISAExt ext) { return ISAExtInfo.at(ext).get_value(); } +inline bool check_extension(ISAExt ext) { + return get_isa_ext_info().at(ext).get_value(); +} inline bool check_extensions(std::vector exts) { for (const auto& ext : exts) { From 5570b828965d4e9a98348e7389fafd93a119e981 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Fri, 2 May 2025 03:22:04 -0700 Subject: [PATCH 28/65] Fix typo in m1 branch --- include/svs/lib/arch_defines.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/svs/lib/arch_defines.h b/include/svs/lib/arch_defines.h index 5f7b569e..f2de4e49 100644 --- a/include/svs/lib/arch_defines.h +++ b/include/svs/lib/arch_defines.h @@ -169,7 +169,7 @@ SVS_CLASS_METHOD_MICROARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_m1 -#define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, SVS_PACK_ARGS(args)) +#define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, args) #endif #if defined(SVS_MICROARCH_SUPPORT_m2) From fb277e928c935d6a71cb64e692fb8661c5891af8 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Fri, 2 May 2025 06:25:47 -0700 Subject: [PATCH 29/65] Update x86_64 targets --- cmake/microarch_targets_x86_64 | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 index 438d9d3c..ba1f0d83 100644 --- a/cmake/microarch_targets_x86_64 +++ b/cmake/microarch_targets_x86_64 @@ -1,6 +1,5 @@ nehalem haswell skylake_avx512 -cascadelake icelake_client sapphirerapids From e45db18374665e6ed4f752af804303fd62297474 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Fri, 2 May 2025 08:14:36 -0700 Subject: [PATCH 30/65] Revert "Update x86_64 targets" This reverts commit fb416d78ed51fb06178c01ef2e60bcc3135d4eba. --- cmake/microarch_targets_x86_64 | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 index ba1f0d83..438d9d3c 100644 --- a/cmake/microarch_targets_x86_64 +++ b/cmake/microarch_targets_x86_64 @@ -1,5 +1,6 @@ nehalem haswell skylake_avx512 +cascadelake icelake_client sapphirerapids From 4dfd0ce5e985edef6ea22a6dcf6656afe770cf0c Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 6 May 2025 00:33:41 -0700 Subject: [PATCH 31/65] Add x86_64_v4 target --- cmake/microarch_targets_x86_64 | 1 + include/svs/lib/arch.h | 15 +++++++++++---- include/svs/lib/arch_defines.h | 9 +++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 index 438d9d3c..bac14630 100644 --- a/cmake/microarch_targets_x86_64 +++ b/cmake/microarch_targets_x86_64 @@ -1,5 +1,6 @@ nehalem haswell +x86_64_v4 skylake_avx512 cascadelake icelake_client diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 729ce762..72a1f219 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -30,22 +30,19 @@ enum class MicroArch { // Refer to the GCC docs for the list of targeted architectures: // https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html nehalem, - x86_64_v2 = nehalem, westmere, sandybridge, ivybridge, haswell, - x86_64_v3 = haswell, broadwell, skylake, + x86_64_v4, skylake_avx512, - x86_64_v4 = skylake_avx512, cascadelake, cooperlake, icelake_client, icelake_server, sapphirerapids, - emeraldrapids = sapphirerapids, graniterapids, graniterapids_d, #elif defined(__aarch64__) @@ -108,6 +105,14 @@ inline const std::unordered_map& get_microarch_info_ma {MicroArch::broadwell, {ISAExt::AES, ISAExt::CLFLUSHOPT, ISAExt::XSAVEC, ISAExt::XSAVES, ISAExt::SGX}, "skylake"}}, + {MicroArch::x86_64_v4, + {std::nullopt, + {ISAExt::AVX512_F, + ISAExt::AVX512_VL, + ISAExt::AVX512_BW, + ISAExt::AVX512_DQ, + ISAExt::AVX512_CD}, + "x86_64_v4"}}, {MicroArch::skylake_avx512, {MicroArch::skylake, {ISAExt::AVX512_F, @@ -228,6 +233,7 @@ class MicroArchEnvironment { SVS_MICROARCH_COMPILED_haswell SVS_MICROARCH_COMPILED_broadwell SVS_MICROARCH_COMPILED_skylake + SVS_MICROARCH_COMPILED_x86_64_v4 SVS_MICROARCH_COMPILED_skylake_avx512 SVS_MICROARCH_COMPILED_cascadelake SVS_MICROARCH_COMPILED_cooperlake @@ -276,6 +282,7 @@ class MicroArchEnvironment { SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, SVS_PACK_ARGS(args)) \ diff --git a/include/svs/lib/arch_defines.h b/include/svs/lib/arch_defines.h index f2de4e49..8c4eda6a 100644 --- a/include/svs/lib/arch_defines.h +++ b/include/svs/lib/arch_defines.h @@ -87,6 +87,15 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) #endif +#if defined(SVS_MICROARCH_SUPPORT_x86_64_v4) +#define SVS_MICROARCH_COMPILED_x86_64_v4 MicroArch::x86_64_v4, +#define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(x86_64_v4, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_x86_64_v4 +#define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, args) +#endif + #if defined(SVS_MICROARCH_SUPPORT_skylake_avx512) #define SVS_MICROARCH_COMPILED_skylake_avx512 MicroArch::skylake_avx512, #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) \ From 30b2fcd70a2a1f40ea6eec69f66efcf67bbf8f74 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 6 May 2025 02:48:00 -0700 Subject: [PATCH 32/65] Change MICROARCH_OBJECT_FILES setting --- cmake/microarch.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/microarch.cmake b/cmake/microarch.cmake index 68236c5a..9d67735a 100644 --- a/cmake/microarch.cmake +++ b/cmake/microarch.cmake @@ -123,5 +123,6 @@ function(create_microarch_instantiations) list(APPEND MICROARCH_OBJECT_FILES $) endforeach() - set(MICROARCH_OBJECT_FILES "${MICROARCH_OBJECT_FILES}" PARENT_SCOPE) + # Note: this specific way of setting the variable is required to make it available in all targeted scopes + set(MICROARCH_OBJECT_FILES "${MICROARCH_OBJECT_FILES}" CACHE INTERNAL "Microarchitecture-specific object files") endfunction() From b9167b8abc64de902b368c0fe3ff044c3ac57278 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 6 May 2025 03:22:16 -0700 Subject: [PATCH 33/65] Change microarch Python API --- bindings/python/src/python_bindings.cpp | 76 +++++++++++++++---------- bindings/python/tests/test_microarch.py | 5 +- include/svs/lib/arch.h | 22 +++++++ 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/bindings/python/src/python_bindings.cpp b/bindings/python/src/python_bindings.cpp index 85922412..996e6f4c 100644 --- a/bindings/python/src/python_bindings.cpp +++ b/bindings/python/src/python_bindings.cpp @@ -186,37 +186,55 @@ Convert the `fvecs` file on disk with 32-bit floating point entries to a `fvecs` )" ); - // Get name of current microarch - m.def( - "microarch", - []() { - auto& env = svs::arch::MicroArchEnvironment::get_instance(); - return svs::arch::microarch_to_string(env.get_microarch()); - }, - "Returns current microarchitecture." - ); - - // Get list of supported microarchs - m.def( - "supported_microarchs", - []() { - auto& env = svs::arch::MicroArchEnvironment::get_instance(); - const auto& supported_archs = env.get_supported_microarchs(); - - std::vector result; - result.reserve(supported_archs.size()); - - for (const auto& arch : supported_archs) { - result.push_back(svs::arch::microarch_to_string(arch)); - } - - return result; - }, - "Returns a list of supported microarchitectures." - ); - wrap_conversion(m); + // Wrapper for svs::arch::MicroArchEnvironment + py::class_(m, "microarch", "Microarchitecture management singleton") + .def_static( + "get", + []() -> svs::arch::MicroArchEnvironment& { + return svs::arch::MicroArchEnvironment::get_instance(); + }, + py::return_value_policy::reference + ) + .def_property_static( + "current", + [](py::object) { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + return svs::arch::microarch_to_string(env.get_microarch()); + }, + [](py::object, const std::string& arch_name) { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + auto arch = svs::arch::string_to_microarch(arch_name); + env.set_microarch(arch); + }, + "Gets or sets the current microarchitecture." + ) + .def_property_readonly_static( + "supported", + [](py::object) { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + std::vector result; + for (const auto& arch : env.get_supported_microarchs()) { + result.push_back(svs::arch::microarch_to_string(arch)); + } + return result; + }, + "Returns a list of supported microarchitectures." + ) + .def_property_readonly_static( + "compiled", + [](py::object) { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + std::vector result; + for (const auto& arch : env.get_compiled_microarchs()) { + result.push_back(svs::arch::microarch_to_string(arch)); + } + return result; + }, + "Returns a list of compiled microarchitectures." + ); + // Allocators svs::python::allocators::wrap(m); diff --git a/bindings/python/tests/test_microarch.py b/bindings/python/tests/test_microarch.py index c98a3889..c9f475c9 100644 --- a/bindings/python/tests/test_microarch.py +++ b/bindings/python/tests/test_microarch.py @@ -19,9 +19,10 @@ class MicroarchTester(unittest.TestCase): def test_microarch(self): - supported_microarchs = svs.supported_microarchs() + supported_microarchs = svs.microarch.supported archspec_host_name = cpu.host().name + # TODO: better aliases handling if archspec_host_name == "icelake": archspec_host_name = "icelake_client" if archspec_host_name in supported_microarchs: - self.assertTrue(archspec_host_name == svs.microarch()) + self.assertTrue(archspec_host_name == svs.microarch.current) diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 72a1f219..81cab3e0 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -209,6 +209,16 @@ inline std::string microarch_to_string(MicroArch arch) { return "unknown"; } +inline MicroArch string_to_microarch(const std::string& arch_name) { + const auto& info_map = get_microarch_info_map(); + for (const auto& [arch, info] : info_map) { + if (info.name == arch_name) { + return arch; + } + } + throw std::invalid_argument("Unknown microarchitecture name: " + arch_name); +} + class MicroArchEnvironment { public: static MicroArchEnvironment& get_instance() { @@ -218,10 +228,22 @@ class MicroArchEnvironment { } MicroArch get_microarch() const { return max_arch_; } + void set_microarch(MicroArch arch) { + if (arch_is_supported(arch)) { + max_arch_ = arch; + } else { + throw std::invalid_argument("Unsupported microarchitecture"); + } + } + const std::vector& get_supported_microarchs() const { return supported_archs_; } + const std::vector& get_compiled_microarchs() const { + return compiled_archs_; + } + private: MicroArchEnvironment() { const std::vector compiled_archs = { From 5360dd237c95c4bcc7891db1175c08e376f8884e Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 11:17:14 -0700 Subject: [PATCH 34/65] fix: make microarch compatiable with sde usecase --- bindings/python/tests/test_microarch.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/bindings/python/tests/test_microarch.py b/bindings/python/tests/test_microarch.py index c9f475c9..16d4267f 100644 --- a/bindings/python/tests/test_microarch.py +++ b/bindings/python/tests/test_microarch.py @@ -15,14 +15,25 @@ import unittest import svs import archspec.cpu as cpu - +import os class MicroarchTester(unittest.TestCase): def test_microarch(self): supported_microarchs = svs.microarch.supported - archspec_host_name = cpu.host().name - # TODO: better aliases handling - if archspec_host_name == "icelake": - archspec_host_name = "icelake_client" + # Will be set in dispatcher pipeline + archspec_host_name = os.environ.get("SDE_FLAG") + if not archspec_host_name: + archspec_host_name = cpu.host().name + mapping = { + "nhm": "nehalem", + "hsw": "haswell", + "skx": "skylake_avx512", + "clx": "cascadelake", + "icl": "icelake_client", + "icelake": "icelake_client", + "spr": "sapphirerapids", + } + archspec_host_name = mapping.get(archspec_host_name, archspec_host_name) + if archspec_host_name in supported_microarchs: self.assertTrue(archspec_host_name == svs.microarch.current) From 0fbff5605e588a5075a3c569d6f711fa13d22965 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 11:30:33 -0700 Subject: [PATCH 35/65] fix: move dispatcher pipeline to public repo --- .github/workflows/test-dispatcher.yml | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/workflows/test-dispatcher.yml diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml new file mode 100644 index 00000000..538f7b20 --- /dev/null +++ b/.github/workflows/test-dispatcher.yml @@ -0,0 +1,52 @@ +name: Test ISA dispatcher +on: + push: + branches: [ main ] + pull_request: + workflow_dispatch: + +env: + https_proxy: http://proxy-dmz.intel.com:912 + http_proxy: http://proxy-dmz.intel.com:911 + no_proxy: localhost,127.0.0.1 + +jobs: + test-dispatcher: + name: Download SDE and Check Dispatcher + runs-on: self-hosted + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: recursive + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install prerequisites + run: | + sudo apt-get update + sudo apt-get install -y wget tar grep + + - name: Download Intel SDE + run: | + wget --content-disposition "https://downloadmirror.intel.com/850782/sde-external-9.53.0-2025-03-16-lin.tar.xz" + tar -xf sde-external-*-lin.tar.xz + cd sde-external-*/ + export PATH="$PWD:$PATH" + echo "$PWD" >> $GITHUB_PATH + cd .. + + - name: Install archspec and Get Host Microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" + + - name: Validate dispatcher under SDE + run: | + cd bindings/python + pip install --force-reinstall . + for flag in nhm hsw skx clx icl spr; do + export SDE_FLAG=$flag + sde64 -$flag -- python -m unittest discover -p "test_dispatcher.py" -s . + done From 6360464d61df6d162b51bbdb0362325c08daf681 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 11:40:27 -0700 Subject: [PATCH 36/65] fix: add compielrs and os to test dispatcher --- .github/workflows/test-dispatcher.yml | 114 ++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml index 538f7b20..0ac0318f 100644 --- a/.github/workflows/test-dispatcher.yml +++ b/.github/workflows/test-dispatcher.yml @@ -1,3 +1,17 @@ +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + name: Test ISA dispatcher on: push: @@ -5,15 +19,19 @@ on: pull_request: workflow_dispatch: -env: - https_proxy: http://proxy-dmz.intel.com:912 - http_proxy: http://proxy-dmz.intel.com:911 - no_proxy: localhost,127.0.0.1 +# This allows a subsequently queued workflow run to interrupt previous runs +concurrency: + group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}' + cancel-in-progress: true jobs: - test-dispatcher: - name: Download SDE and Check Dispatcher + test-dispatcher-linux-x86: + name: Test Dispatcher (Linux x86) runs-on: self-hosted + env: + https_proxy: http://proxy-dmz.intel.com:912 + http_proxy: http://proxy-dmz.intel.com:911 + no_proxy: localhost,127.0.0.1 steps: - name: Checkout repository @@ -48,5 +66,87 @@ jobs: pip install --force-reinstall . for flag in nhm hsw skx clx icl spr; do export SDE_FLAG=$flag - sde64 -$flag -- python -m unittest discover -p "test_dispatcher.py" -s . + sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . done + + test-dispatcher-linux-arm: + name: Test Dispatcher (Linux ARM) + runs-on: ubuntu-22.04-arm + strategy: + matrix: + cxx: [g++-11, g++-12, clang++-15] + include: + - cxx: g++-11 + cc: gcc-11 + - cxx: g++-12 + cc: gcc-12 + - cxx: clang++-15 + cc: clang-15 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: recursive + + - name: Install archspec and Get Host Microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" + + - name: Build and Test Python Bindings + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} + run: | + cd bindings/python + pip install --force-reinstall . + python -m unittest discover -p "test_microarch.py" -s . + + test-dispatcher-macos: + name: Test Dispatcher (macOS) + runs-on: macos-latest + strategy: + matrix: + cxx: [clang++-15] + include: + - cxx: clang++-15 + package: llvm@15 + cc_name: clang + cxx_name: clang++ + needs_prefix: true + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: recursive + + - name: Install archspec and Get Host Microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" + + - name: Install Compiler + run: | + echo "Installing ${{ matrix.package }}..." + brew install ${{ matrix.package }} + + - name: Build and Test Python Bindings + run: | + if [[ "${{ matrix.needs_prefix }}" == "true" ]]; then + # For non-default packages like llvm@15, get the install prefix + COMPILER_PREFIX=$(brew --prefix ${{ matrix.package }}) + export CC="${COMPILER_PREFIX}/bin/${{ matrix.cc_name }}" + export CXX="${COMPILER_PREFIX}/bin/${{ matrix.cxx_name }}" + else + # For versioned GCC installs, the name is usually directly available + export CC="${{ matrix.cc_name }}" + export CXX="${{ matrix.cxx_name }}" + fi + + cd bindings/python + pip install --force-reinstall . + python -m unittest discover -p "test_microarch.py" -s . From 2d594f5f0d868696cbe59ac2888c4067037d134f Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 13:49:58 -0700 Subject: [PATCH 37/65] fix: test x86 only --- .github/workflows/test-dispatcher.yml | 160 +++++++++++++------------- 1 file changed, 80 insertions(+), 80 deletions(-) diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml index 0ac0318f..d4c071e4 100644 --- a/.github/workflows/test-dispatcher.yml +++ b/.github/workflows/test-dispatcher.yml @@ -69,84 +69,84 @@ jobs: sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . done - test-dispatcher-linux-arm: - name: Test Dispatcher (Linux ARM) - runs-on: ubuntu-22.04-arm - strategy: - matrix: - cxx: [g++-11, g++-12, clang++-15] - include: - - cxx: g++-11 - cc: gcc-11 - - cxx: g++-12 - cc: gcc-12 - - cxx: clang++-15 - cc: clang-15 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: recursive - - - name: Install archspec and Get Host Microarch - run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" - - - name: Build and Test Python Bindings - env: - CXX: ${{ matrix.cxx }} - CC: ${{ matrix.cc }} - run: | - cd bindings/python - pip install --force-reinstall . - python -m unittest discover -p "test_microarch.py" -s . - - test-dispatcher-macos: - name: Test Dispatcher (macOS) - runs-on: macos-latest - strategy: - matrix: - cxx: [clang++-15] - include: - - cxx: clang++-15 - package: llvm@15 - cc_name: clang - cxx_name: clang++ - needs_prefix: true - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: recursive - - - name: Install archspec and Get Host Microarch - run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" - - - name: Install Compiler - run: | - echo "Installing ${{ matrix.package }}..." - brew install ${{ matrix.package }} - - - name: Build and Test Python Bindings - run: | - if [[ "${{ matrix.needs_prefix }}" == "true" ]]; then - # For non-default packages like llvm@15, get the install prefix - COMPILER_PREFIX=$(brew --prefix ${{ matrix.package }}) - export CC="${COMPILER_PREFIX}/bin/${{ matrix.cc_name }}" - export CXX="${COMPILER_PREFIX}/bin/${{ matrix.cxx_name }}" - else - # For versioned GCC installs, the name is usually directly available - export CC="${{ matrix.cc_name }}" - export CXX="${{ matrix.cxx_name }}" - fi + # test-dispatcher-linux-arm: + # name: Test Dispatcher (Linux ARM) + # runs-on: ubuntu-22.04-arm + # strategy: + # matrix: + # cxx: [g++-11, g++-12, clang++-15] + # include: + # - cxx: g++-11 + # cc: gcc-11 + # - cxx: g++-12 + # cc: gcc-12 + # - cxx: clang++-15 + # cc: clang-15 + + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # with: + # fetch-depth: 0 + # submodules: recursive + + # - name: Install archspec and Get Host Microarch + # run: | + # python -m pip install archspec + # python -c "import archspec.cpu; print(archspec.cpu.host().name)" + + # - name: Build and Test Python Bindings + # env: + # CXX: ${{ matrix.cxx }} + # CC: ${{ matrix.cc }} + # run: | + # cd bindings/python + # pip install --force-reinstall . + # python -m unittest discover -p "test_microarch.py" -s . + + # test-dispatcher-macos: + # name: Test Dispatcher (macOS) + # runs-on: macos-latest + # strategy: + # matrix: + # cxx: [clang++-15] + # include: + # - cxx: clang++-15 + # package: llvm@15 + # cc_name: clang + # cxx_name: clang++ + # needs_prefix: true + + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # with: + # fetch-depth: 0 + # submodules: recursive + + # - name: Install archspec and Get Host Microarch + # run: | + # python -m pip install archspec + # python -c "import archspec.cpu; print(archspec.cpu.host().name)" + + # - name: Install Compiler + # run: | + # echo "Installing ${{ matrix.package }}..." + # brew install ${{ matrix.package }} + + # - name: Build and Test Python Bindings + # run: | + # if [[ "${{ matrix.needs_prefix }}" == "true" ]]; then + # # For non-default packages like llvm@15, get the install prefix + # COMPILER_PREFIX=$(brew --prefix ${{ matrix.package }}) + # export CC="${COMPILER_PREFIX}/bin/${{ matrix.cc_name }}" + # export CXX="${COMPILER_PREFIX}/bin/${{ matrix.cxx_name }}" + # else + # # For versioned GCC installs, the name is usually directly available + # export CC="${{ matrix.cc_name }}" + # export CXX="${{ matrix.cxx_name }}" + # fi - cd bindings/python - pip install --force-reinstall . - python -m unittest discover -p "test_microarch.py" -s . + # cd bindings/python + # pip install --force-reinstall . + # python -m unittest discover -p "test_microarch.py" -s . From bb397dbfa7158f31125f6f43796e94019ea92c32 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 13:54:23 -0700 Subject: [PATCH 38/65] fix: test x86 only --- .github/workflows/test-dispatcher.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml index d4c071e4..a36d8eac 100644 --- a/.github/workflows/test-dispatcher.yml +++ b/.github/workflows/test-dispatcher.yml @@ -60,14 +60,14 @@ jobs: python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" - - name: Validate dispatcher under SDE - run: | - cd bindings/python - pip install --force-reinstall . - for flag in nhm hsw skx clx icl spr; do - export SDE_FLAG=$flag - sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . - done + # - name: Validate dispatcher under SDE + # run: | + # cd bindings/python + # pip install --force-reinstall . + # for flag in nhm hsw skx clx icl spr; do + # export SDE_FLAG=$flag + # sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . + # done # test-dispatcher-linux-arm: # name: Test Dispatcher (Linux ARM) From 936635a626615d46c64153b30fb0dcba596d172e Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 14:04:08 -0700 Subject: [PATCH 39/65] fix: use actual os name instead of self hosted runners --- .github/workflows/test-dispatcher.yml | 180 +++++++++++++------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml index a36d8eac..a0f96877 100644 --- a/.github/workflows/test-dispatcher.yml +++ b/.github/workflows/test-dispatcher.yml @@ -27,7 +27,7 @@ concurrency: jobs: test-dispatcher-linux-x86: name: Test Dispatcher (Linux x86) - runs-on: self-hosted + runs-on: ubuntu-22.04 env: https_proxy: http://proxy-dmz.intel.com:912 http_proxy: http://proxy-dmz.intel.com:911 @@ -60,93 +60,93 @@ jobs: python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" - # - name: Validate dispatcher under SDE - # run: | - # cd bindings/python - # pip install --force-reinstall . - # for flag in nhm hsw skx clx icl spr; do - # export SDE_FLAG=$flag - # sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . - # done - - # test-dispatcher-linux-arm: - # name: Test Dispatcher (Linux ARM) - # runs-on: ubuntu-22.04-arm - # strategy: - # matrix: - # cxx: [g++-11, g++-12, clang++-15] - # include: - # - cxx: g++-11 - # cc: gcc-11 - # - cxx: g++-12 - # cc: gcc-12 - # - cxx: clang++-15 - # cc: clang-15 - - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # with: - # fetch-depth: 0 - # submodules: recursive - - # - name: Install archspec and Get Host Microarch - # run: | - # python -m pip install archspec - # python -c "import archspec.cpu; print(archspec.cpu.host().name)" - - # - name: Build and Test Python Bindings - # env: - # CXX: ${{ matrix.cxx }} - # CC: ${{ matrix.cc }} - # run: | - # cd bindings/python - # pip install --force-reinstall . - # python -m unittest discover -p "test_microarch.py" -s . - - # test-dispatcher-macos: - # name: Test Dispatcher (macOS) - # runs-on: macos-latest - # strategy: - # matrix: - # cxx: [clang++-15] - # include: - # - cxx: clang++-15 - # package: llvm@15 - # cc_name: clang - # cxx_name: clang++ - # needs_prefix: true - - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # with: - # fetch-depth: 0 - # submodules: recursive - - # - name: Install archspec and Get Host Microarch - # run: | - # python -m pip install archspec - # python -c "import archspec.cpu; print(archspec.cpu.host().name)" - - # - name: Install Compiler - # run: | - # echo "Installing ${{ matrix.package }}..." - # brew install ${{ matrix.package }} - - # - name: Build and Test Python Bindings - # run: | - # if [[ "${{ matrix.needs_prefix }}" == "true" ]]; then - # # For non-default packages like llvm@15, get the install prefix - # COMPILER_PREFIX=$(brew --prefix ${{ matrix.package }}) - # export CC="${COMPILER_PREFIX}/bin/${{ matrix.cc_name }}" - # export CXX="${COMPILER_PREFIX}/bin/${{ matrix.cxx_name }}" - # else - # # For versioned GCC installs, the name is usually directly available - # export CC="${{ matrix.cc_name }}" - # export CXX="${{ matrix.cxx_name }}" - # fi + - name: Validate dispatcher under SDE + run: | + cd bindings/python + pip install --force-reinstall . + for flag in nhm hsw skx clx icl spr; do + export SDE_FLAG=$flag + sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . + done + + test-dispatcher-linux-arm: + name: Test Dispatcher (Linux ARM) + runs-on: ubuntu-22.04-arm + strategy: + matrix: + cxx: [g++-11, g++-12, clang++-15] + include: + - cxx: g++-11 + cc: gcc-11 + - cxx: g++-12 + cc: gcc-12 + - cxx: clang++-15 + cc: clang-15 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: recursive + + - name: Install archspec and Get Host Microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" + + - name: Build and Test Python Bindings + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} + run: | + cd bindings/python + pip install --force-reinstall . + python -m unittest discover -p "test_microarch.py" -s . + + test-dispatcher-macos: + name: Test Dispatcher (macOS) + runs-on: macos-latest + strategy: + matrix: + cxx: [clang++-15] + include: + - cxx: clang++-15 + package: llvm@15 + cc_name: clang + cxx_name: clang++ + needs_prefix: true + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: recursive + + - name: Install archspec and Get Host Microarch + run: | + python -m pip install archspec + python -c "import archspec.cpu; print(archspec.cpu.host().name)" + + - name: Install Compiler + run: | + echo "Installing ${{ matrix.package }}..." + brew install ${{ matrix.package }} + + - name: Build and Test Python Bindings + run: | + if [[ "${{ matrix.needs_prefix }}" == "true" ]]; then + # For non-default packages like llvm@15, get the install prefix + COMPILER_PREFIX=$(brew --prefix ${{ matrix.package }}) + export CC="${COMPILER_PREFIX}/bin/${{ matrix.cc_name }}" + export CXX="${COMPILER_PREFIX}/bin/${{ matrix.cxx_name }}" + else + # For versioned GCC installs, the name is usually directly available + export CC="${{ matrix.cc_name }}" + export CXX="${{ matrix.cxx_name }}" + fi - # cd bindings/python - # pip install --force-reinstall . - # python -m unittest discover -p "test_microarch.py" -s . + cd bindings/python + pip install --force-reinstall . + python -m unittest discover -p "test_microarch.py" -s . From 37184dfb56f7ff13f2b06ae9d89f3e096c9f9118 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 14:07:57 -0700 Subject: [PATCH 40/65] fix: checkout erros --- .github/workflows/test-dispatcher.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml index a0f96877..59b432e3 100644 --- a/.github/workflows/test-dispatcher.yml +++ b/.github/workflows/test-dispatcher.yml @@ -36,10 +36,6 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: recursive - token: ${{ secrets.GITHUB_TOKEN }} - name: Install prerequisites run: | @@ -86,9 +82,6 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: recursive - name: Install archspec and Get Host Microarch run: | @@ -120,9 +113,6 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - with: - fetch-depth: 0 - submodules: recursive - name: Install archspec and Get Host Microarch run: | From a54d187370027c1d505bca87bf5f9fccfa3dd21a Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 14:10:54 -0700 Subject: [PATCH 41/65] fix: checkout erros --- .github/workflows/test-dispatcher.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml index 59b432e3..7c24ba58 100644 --- a/.github/workflows/test-dispatcher.yml +++ b/.github/workflows/test-dispatcher.yml @@ -29,8 +29,6 @@ jobs: name: Test Dispatcher (Linux x86) runs-on: ubuntu-22.04 env: - https_proxy: http://proxy-dmz.intel.com:912 - http_proxy: http://proxy-dmz.intel.com:911 no_proxy: localhost,127.0.0.1 steps: From b3ac750bab608de1d4cc4743ea3b95a7d5c6bcd7 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 16:05:13 -0700 Subject: [PATCH 42/65] fix: add comiler matric for x86 targets as well --- .github/workflows/test-dispatcher.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml index 7c24ba58..72a8dced 100644 --- a/.github/workflows/test-dispatcher.yml +++ b/.github/workflows/test-dispatcher.yml @@ -28,6 +28,16 @@ jobs: test-dispatcher-linux-x86: name: Test Dispatcher (Linux x86) runs-on: ubuntu-22.04 + strategy: + matrix: + cxx: [g++-11, g++-12, clang++-15] + include: + - cxx: g++-11 + cc: gcc-11 + - cxx: g++-12 + cc: gcc-12 + - cxx: clang++-15 + cc: clang-15 env: no_proxy: localhost,127.0.0.1 @@ -55,6 +65,9 @@ jobs: python -c "import archspec.cpu; print(archspec.cpu.host().name)" - name: Validate dispatcher under SDE + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} run: | cd bindings/python pip install --force-reinstall . From 1095355bb0d2e4e7e1535d278b18d10a776c4b17 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 16:33:28 -0700 Subject: [PATCH 43/65] add: add cpu name and avx support info print --- .github/scripts/print_cpu_info.sh | 45 +++++++++++++++++++++++++++ .github/workflows/test-dispatcher.yml | 18 +++++++++++ 2 files changed, 63 insertions(+) create mode 100644 .github/scripts/print_cpu_info.sh diff --git a/.github/scripts/print_cpu_info.sh b/.github/scripts/print_cpu_info.sh new file mode 100644 index 00000000..b83a44a5 --- /dev/null +++ b/.github/scripts/print_cpu_info.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# Check AVX support using compiler intrinsics +cat > check_avx.c <<'EOF' +#include +#include + +int main() { + if (__builtin_cpu_supports("avx512f")) { + printf("AVX512\n"); + } else if (__builtin_cpu_supports("avx2")) { + printf("AVX2\n"); + } else if (__builtin_cpu_supports("avx")) { + printf("AVX\n"); + } else { + printf("No AVX\n"); + } + return 0; +} +EOF + +##### CPU Name ##### +if [[ -n "$SDE_FLAG" ]]; then + echo "CPU: SDE Emulated $SDE_FLAG" +else + if [[ "$(uname -s)" == "Linux" ]]; then + echo "CPU: $(cat /proc/cpuinfo | grep "model name" | head -1 | cut -d':' -f2 | xargs)" + elif [[ "$(uname -s)" == "Darwin" ]]; then + echo "CPU: $(sysctl -n machdep.cpu.brand_string)" + else + echo "CPU: Unknown (unsupported OS)" + fi +fi + +###### AVX Support ##### +echo -n "AVX Support: " +gcc -O2 check_avx.c -o check_avx + +if [[ -n "$SDE_FLAG" ]]; then + sde64 -$SDE_FLAG -- ./check_avx +else + ./check_avx +fi + +rm check_avx check_avx.c \ No newline at end of file diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml index 72a8dced..5c1e8986 100644 --- a/.github/workflows/test-dispatcher.yml +++ b/.github/workflows/test-dispatcher.yml @@ -63,6 +63,12 @@ jobs: run: | python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" + + - name: Check CPU Information + run: | + # Print host CPU info + chmod +x .github/scripts/print_cpu_info.sh + .github/scripts/print_cpu_info.sh - name: Validate dispatcher under SDE env: @@ -73,6 +79,8 @@ jobs: pip install --force-reinstall . for flag in nhm hsw skx clx icl spr; do export SDE_FLAG=$flag + # Print emulator CPU info + ../../.github/scripts/print_cpu_info.sh sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . done @@ -99,6 +107,11 @@ jobs: python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" + - name: Check CPU Information + run: | + chmod +x .github/scripts/print_cpu_info.sh + .github/scripts/print_cpu_info.sh + - name: Build and Test Python Bindings env: CXX: ${{ matrix.cxx }} @@ -130,6 +143,11 @@ jobs: python -m pip install archspec python -c "import archspec.cpu; print(archspec.cpu.host().name)" + - name: Check CPU Information + run: | + chmod +x .github/scripts/print_cpu_info.sh + .github/scripts/print_cpu_info.sh + - name: Install Compiler run: | echo "Installing ${{ matrix.package }}..." From 449d49cd8107ca9a8a28cd956068b6e67df0a4ab Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 17:34:31 -0700 Subject: [PATCH 44/65] fix: fix cpu info print for arm and macos --- .github/scripts/print_cpu_info.sh | 53 +++++++++++++++++++------------ 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/.github/scripts/print_cpu_info.sh b/.github/scripts/print_cpu_info.sh index b83a44a5..92f52354 100644 --- a/.github/scripts/print_cpu_info.sh +++ b/.github/scripts/print_cpu_info.sh @@ -1,6 +1,29 @@ #!/bin/bash +# Exit immediately if error +set -e -# Check AVX support using compiler intrinsics +##### CPU Name ##### +if [[ -n "$SDE_FLAG" ]]; then + echo "CPU: SDE Emulation ($SDE_FLAG)" +else + if [[ "$(uname -s)" == "Linux" ]]; then + echo "CPU: $(cat /proc/cpuinfo | grep "model name" | head -1 | cut -d':' -f2 | xargs)" + elif [[ "$(uname -s)" == "Darwin" ]]; then + echo "CPU: $(sysctl -n machdep.cpu.brand_string)" + else + echo "CPU: unsupported OS" + fi +fi + +##### AVX Support ##### +ARCH=$(uname -m) +# Only check AVX support on x86 architectures +if [[ "$ARCH" != "x86_64" && "$ARCH" != "i386" && "$ARCH" != "i686" ]]; then + echo "AVX Support: Not applicable" + exit 0 +fi + +# Check AVX support with compiler intrinsics cat > check_avx.c <<'EOF' #include #include @@ -19,27 +42,17 @@ int main() { } EOF -##### CPU Name ##### -if [[ -n "$SDE_FLAG" ]]; then - echo "CPU: SDE Emulated $SDE_FLAG" -else - if [[ "$(uname -s)" == "Linux" ]]; then - echo "CPU: $(cat /proc/cpuinfo | grep "model name" | head -1 | cut -d':' -f2 | xargs)" - elif [[ "$(uname -s)" == "Darwin" ]]; then - echo "CPU: $(sysctl -n machdep.cpu.brand_string)" +# Compile and run the AVX detection program +echo -n "AVX Support: " +if gcc -O2 check_avx.c -o check_avx 2>/dev/null; then + if [[ -n "$SDE_FLAG" ]]; then + sde64 -$SDE_FLAG -- ./check_avx else - echo "CPU: Unknown (unsupported OS)" + ./check_avx fi -fi - -###### AVX Support ##### -echo -n "AVX Support: " -gcc -O2 check_avx.c -o check_avx - -if [[ -n "$SDE_FLAG" ]]; then - sde64 -$SDE_FLAG -- ./check_avx else - ./check_avx + echo "Detection failed with compiler error" fi -rm check_avx check_avx.c \ No newline at end of file +# Clean up +rm -f check_avx check_avx.c \ No newline at end of file From 687a5fd1bb8dc88216fe5ff5005be32ed9dc88e6 Mon Sep 17 00:00:00 2001 From: yuejiaointel Date: Wed, 7 May 2025 17:38:20 -0700 Subject: [PATCH 45/65] fix: format --- .github/scripts/print_cpu_info.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/scripts/print_cpu_info.sh b/.github/scripts/print_cpu_info.sh index 92f52354..5fb00c9a 100644 --- a/.github/scripts/print_cpu_info.sh +++ b/.github/scripts/print_cpu_info.sh @@ -1,4 +1,18 @@ #!/bin/bash +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Exit immediately if error set -e From aa5bf2bc4ee7b187a3a16539630b4af8d0ce76f7 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 8 May 2025 03:34:26 -0700 Subject: [PATCH 46/65] Add microarch_info example --- examples/cpp/CMakeLists.txt | 1 + examples/cpp/microarch_info.cpp | 49 +++++++++++++++++++++++++++++++++ include/svs/lib/cpuid.h | 17 ++++++++++-- 3 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 examples/cpp/microarch_info.cpp diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 4f18c342..6110f4d9 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -37,6 +37,7 @@ endfunction() create_simple_example(saveload test_saveload saveload.cpp) create_simple_example(types test_types types.cpp) create_simple_example(vamana_iterator test_vamana_iterator vamana_iterator.cpp) +create_simple_example(microarch_info test_microarch_info microarch_info.cpp) ## More complicated examples involving more extensive setup. diff --git a/examples/cpp/microarch_info.cpp b/examples/cpp/microarch_info.cpp new file mode 100644 index 00000000..b70954bf --- /dev/null +++ b/examples/cpp/microarch_info.cpp @@ -0,0 +1,49 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "svs/lib/arch.h" +#include "svs/lib/cpuid.h" +#include + +int main() { + std::ostream& out = std::cout; + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + + // Print support status for all ISA extensions + svs::arch::write_extensions_status(out); + + // Print current microarchitecture + auto current_arch = arch_env.get_microarch(); + out << "\nCurrent µarch: " << svs::arch::microarch_to_string(current_arch) << std::endl; + + // Print all supported microarchitectures + const auto& supported_archs = arch_env.get_supported_microarchs(); + out << "\nSupported µarchs: "; + for (const auto& arch : supported_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; + + // Print all compiled microarchitectures + const auto& compiled_archs = arch_env.get_compiled_microarchs(); + out << "\nCompiled µarchs: "; + for (const auto& arch : compiled_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; + + return 0; +} diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h index 6d3d5c15..0ed67b0c 100644 --- a/include/svs/lib/cpuid.h +++ b/include/svs/lib/cpuid.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -308,9 +309,7 @@ inline const std::unordered_map& get_isa_ext_info() { #endif #endif -inline bool check_extension(ISAExt ext) { - return get_isa_ext_info().at(ext).get_value(); -} +inline bool check_extension(ISAExt ext) { return get_isa_ext_info().at(ext).get_value(); } inline bool check_extensions(std::vector exts) { for (const auto& ext : exts) { @@ -321,4 +320,16 @@ inline bool check_extensions(std::vector exts) { return true; } +template inline void write_extensions_status(StreamType& stream) { + const auto& ext_info = get_isa_ext_info(); + + stream << "CPU Extensions Support Status:" << std::endl; + stream << "-----------------------------" << std::endl; + + for (const auto& [ext, info] : ext_info) { + stream << info.name << ": " + << (check_extension(ext) ? "Supported" : "Not supported") << std::endl; + } +} + } // namespace svs::arch From bd4bcef59e1aa26993d83d70164c98fa32465405 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 8 May 2025 05:35:09 -0700 Subject: [PATCH 47/65] Merge dispatcher testing with SDE into build-linux pipeline --- .github/scripts/get_cpu_info.sh | 34 +++++++++++++++++++++++++++ .github/scripts/install_sde.sh | 19 +++++++++++++++ .github/workflows/build-linux-arm.yml | 5 ++-- .github/workflows/build-linux.yml | 26 ++++++++++++++++---- .github/workflows/build-macos.yaml | 5 ++-- .github/workflows/cibuildwheel.yml | 5 ++-- 6 files changed, 81 insertions(+), 13 deletions(-) create mode 100755 .github/scripts/get_cpu_info.sh create mode 100755 .github/scripts/install_sde.sh diff --git a/.github/scripts/get_cpu_info.sh b/.github/scripts/get_cpu_info.sh new file mode 100755 index 00000000..82c46b84 --- /dev/null +++ b/.github/scripts/get_cpu_info.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Script to get CPU information using platform-agnostic python packages + +# Install python packages if not present in the environment +if ! python -m pip show archspec > /dev/null 2>&1; then + python -m pip install archspec +fi + +if ! python -m pip show py-cpuinfo > /dev/null 2>&1; then + python -m pip install py-cpuinfo +fi + +# Print host microarchitecture +python -c "import archspec.cpu; \ + print('Host Microarchitecture[archspec]:', archspec.cpu.host().name)" + +# Print full CPU information +python -c "import pprint, cpuinfo; \ + print('CPU info[py-cpuinfo]:'); \ + pprint.pprint(cpuinfo.get_cpu_info(), indent=4, compact=True)" diff --git a/.github/scripts/install_sde.sh b/.github/scripts/install_sde.sh new file mode 100755 index 00000000..25310c5a --- /dev/null +++ b/.github/scripts/install_sde.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget --content-disposition "https://downloadmirror.intel.com/850782/sde-external-9.53.0-2025-03-16-lin.tar.xz" +tar -xf sde-external-*-lin.tar.xz +cd sde-external-*/ +echo "$PWD" >> $GITHUB_PATH diff --git a/.github/workflows/build-linux-arm.yml b/.github/workflows/build-linux-arm.yml index 5c073eed..9fe6a648 100644 --- a/.github/workflows/build-linux-arm.yml +++ b/.github/workflows/build-linux-arm.yml @@ -47,10 +47,9 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install archspec and Get Host Microarch + - name: Get CPU info run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" + bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh - name: Configure build working-directory: ${{ runner.temp }} diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index de143064..76679167 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -56,10 +56,11 @@ jobs: source /opt/intel/oneapi/setvars.sh printenv >> $GITHUB_ENV - - name: Install archspec and Get Host Microarch - run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" + - name: Install Intel(R) SDE + - run: source ${GITHUB_WORKSPACE}/.github/scripts/install_sde.sh + + - name: Get CPU info + run: bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh - name: Configure build working-directory: ${{ runner.temp }} @@ -91,3 +92,20 @@ jobs: CTEST_OUTPUT_ON_FAILURE: 1 working-directory: ${{ runner.temp }}/build/examples/cpp run: ctest -C RelWithDebugInfo + + - name: Build Python Bindings + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} + run: | + cd bindings/python + python -m pip install . + + - name: Run Python Microarch Test with SDE + run: | + for flag in nhm hsw skx clx icl spr; do + echo "SDE emulation: $flag" + export SDE_FLAG=$flag + sde64 -$flag -- bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh + sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . + done diff --git a/.github/workflows/build-macos.yaml b/.github/workflows/build-macos.yaml index 24d5a89e..96a7b439 100644 --- a/.github/workflows/build-macos.yaml +++ b/.github/workflows/build-macos.yaml @@ -46,10 +46,9 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Install archspec and Get Host Microarch + - name: Get CPU info run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" + bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh - name: Install Compiler run: | diff --git a/.github/workflows/cibuildwheel.yml b/.github/workflows/cibuildwheel.yml index 39cd64f3..8ca5dc92 100644 --- a/.github/workflows/cibuildwheel.yml +++ b/.github/workflows/cibuildwheel.yml @@ -43,10 +43,9 @@ jobs: - name: Install cibuildwheel run: python -m pip install cibuildwheel - - name: Install archspec and Get Host Microarch + - name: Get CPU info run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" + bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh # Install inside the temporary working directory. - name: Build Wheel From 658c770d062f10272551123e499277732557ceb5 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 8 May 2025 05:38:02 -0700 Subject: [PATCH 48/65] Fix typo in pipeline --- .github/workflows/build-linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index 76679167..230f95cb 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -57,7 +57,7 @@ jobs: printenv >> $GITHUB_ENV - name: Install Intel(R) SDE - - run: source ${GITHUB_WORKSPACE}/.github/scripts/install_sde.sh + run: source ${GITHUB_WORKSPACE}/.github/scripts/install_sde.sh - name: Get CPU info run: bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh From d520b37b5732f4e768e17575df65b95cba5f0725 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 8 May 2025 06:19:09 -0700 Subject: [PATCH 49/65] Add `describe` static function to `svs.microarch`and fix pipelines --- .github/workflows/build-linux-arm.yml | 14 +++++++++ .github/workflows/build-linux.yml | 3 +- .github/workflows/build-macos.yaml | 14 +++++++++ bindings/python/src/python_bindings.cpp | 38 +++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-linux-arm.yml b/.github/workflows/build-linux-arm.yml index 9fe6a648..7ea8c425 100644 --- a/.github/workflows/build-linux-arm.yml +++ b/.github/workflows/build-linux-arm.yml @@ -74,3 +74,17 @@ jobs: CTEST_OUTPUT_ON_FAILURE: 1 working-directory: ${{ runner.temp }}/build/tests run: ctest -C ${{ matrix.build_type }} + + - name: Build Python Bindings + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} + run: | + cd bindings/python + python -m pip install . + + - name: Run Python Microarch Test + run: | + cd bindings/python + python -c "import svs; svs.microarch.describe()" + python -m unittest discover -p "test_microarch.py" -s . diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index 230f95cb..94b1a28b 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -103,9 +103,10 @@ jobs: - name: Run Python Microarch Test with SDE run: | + cd bindings/python for flag in nhm hsw skx clx icl spr; do echo "SDE emulation: $flag" export SDE_FLAG=$flag - sde64 -$flag -- bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh + sde64 -$flag -- python -c "import svs; svs.microarch.describe()" sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . done diff --git a/.github/workflows/build-macos.yaml b/.github/workflows/build-macos.yaml index 96a7b439..6307bc94 100644 --- a/.github/workflows/build-macos.yaml +++ b/.github/workflows/build-macos.yaml @@ -87,3 +87,17 @@ jobs: CTEST_OUTPUT_ON_FAILURE: 1 working-directory: ${{ runner.temp }}/build/tests run: ctest -C ${{ matrix.build_type }} + + - name: Build Python Bindings + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} + run: | + cd bindings/python + python -m pip install . + + - name: Run Python Microarch Test + run: | + cd bindings/python + python -c "import svs; svs.microarch.describe()" + python -m unittest discover -p "test_microarch.py" -s . diff --git a/bindings/python/src/python_bindings.cpp b/bindings/python/src/python_bindings.cpp index 996e6f4c..dfbf3e88 100644 --- a/bindings/python/src/python_bindings.cpp +++ b/bindings/python/src/python_bindings.cpp @@ -44,6 +44,7 @@ // stl #include #include +#include namespace py = pybind11; @@ -188,6 +189,13 @@ Convert the `fvecs` file on disk with 32-bit floating point entries to a `fvecs` wrap_conversion(m); + m.def( + "_print_cpu_extensions_status", + []() { + svs::arch::write_extensions_status(std::cout); + } + ); + // Wrapper for svs::arch::MicroArchEnvironment py::class_(m, "microarch", "Microarchitecture management singleton") .def_static( @@ -233,6 +241,36 @@ Convert the `fvecs` file on disk with 32-bit floating point entries to a `fvecs` return result; }, "Returns a list of compiled microarchitectures." + ) + .def_static( + "describe", + []() { + std::ostream& out = std::cout; + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + + // Print support status for all ISA extensions + svs::arch::write_extensions_status(out); + + // Print current microarchitecture + auto current_arch = arch_env.get_microarch(); + out << "\nCurrent µarch: " << svs::arch::microarch_to_string(current_arch) << std::endl; + + // Print all supported microarchitectures + const auto& supported_archs = arch_env.get_supported_microarchs(); + out << "\nSupported µarchs: "; + for (const auto& arch : supported_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; + + // Print all compiled microarchitectures + const auto& compiled_archs = arch_env.get_compiled_microarchs(); + out << "\nCompiled µarchs: "; + for (const auto& arch : compiled_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; + } ); // Allocators From f3f076b1d4199ea85dd45725e3f2b97f5e6d25b8 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 8 May 2025 06:45:29 -0700 Subject: [PATCH 50/65] Remove merged dispatcher testing; macos pipeline fix; test_microarch.py corrections --- .github/scripts/print_cpu_info.sh | 72 ---------- .github/workflows/build-macos.yaml | 11 ++ .github/workflows/test-dispatcher.yml | 171 ------------------------ bindings/python/tests/test_microarch.py | 13 +- 4 files changed, 17 insertions(+), 250 deletions(-) delete mode 100644 .github/scripts/print_cpu_info.sh delete mode 100644 .github/workflows/test-dispatcher.yml diff --git a/.github/scripts/print_cpu_info.sh b/.github/scripts/print_cpu_info.sh deleted file mode 100644 index 5fb00c9a..00000000 --- a/.github/scripts/print_cpu_info.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -# Copyright 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Exit immediately if error -set -e - -##### CPU Name ##### -if [[ -n "$SDE_FLAG" ]]; then - echo "CPU: SDE Emulation ($SDE_FLAG)" -else - if [[ "$(uname -s)" == "Linux" ]]; then - echo "CPU: $(cat /proc/cpuinfo | grep "model name" | head -1 | cut -d':' -f2 | xargs)" - elif [[ "$(uname -s)" == "Darwin" ]]; then - echo "CPU: $(sysctl -n machdep.cpu.brand_string)" - else - echo "CPU: unsupported OS" - fi -fi - -##### AVX Support ##### -ARCH=$(uname -m) -# Only check AVX support on x86 architectures -if [[ "$ARCH" != "x86_64" && "$ARCH" != "i386" && "$ARCH" != "i686" ]]; then - echo "AVX Support: Not applicable" - exit 0 -fi - -# Check AVX support with compiler intrinsics -cat > check_avx.c <<'EOF' -#include -#include - -int main() { - if (__builtin_cpu_supports("avx512f")) { - printf("AVX512\n"); - } else if (__builtin_cpu_supports("avx2")) { - printf("AVX2\n"); - } else if (__builtin_cpu_supports("avx")) { - printf("AVX\n"); - } else { - printf("No AVX\n"); - } - return 0; -} -EOF - -# Compile and run the AVX detection program -echo -n "AVX Support: " -if gcc -O2 check_avx.c -o check_avx 2>/dev/null; then - if [[ -n "$SDE_FLAG" ]]; then - sde64 -$SDE_FLAG -- ./check_avx - else - ./check_avx - fi -else - echo "Detection failed with compiler error" -fi - -# Clean up -rm -f check_avx check_avx.c \ No newline at end of file diff --git a/.github/workflows/build-macos.yaml b/.github/workflows/build-macos.yaml index 6307bc94..9069fda1 100644 --- a/.github/workflows/build-macos.yaml +++ b/.github/workflows/build-macos.yaml @@ -93,6 +93,17 @@ jobs: CXX: ${{ matrix.cxx }} CC: ${{ matrix.cc }} run: | + if [[ "${{ matrix.needs_prefix }}" == "true" ]]; then + # For non-default packages like llvm@15, get the install prefix + COMPILER_PREFIX=$(brew --prefix ${{ matrix.package }}) + export CC="${COMPILER_PREFIX}/bin/${{ matrix.cc_name }}" + export CXX="${COMPILER_PREFIX}/bin/${{ matrix.cxx_name }}" + else + # For versioned GCC installs, the name is usually directly available + export CC="${{ matrix.cc_name }}" + export CXX="${{ matrix.cxx_name }}" + fi + cd bindings/python python -m pip install . diff --git a/.github/workflows/test-dispatcher.yml b/.github/workflows/test-dispatcher.yml deleted file mode 100644 index 5c1e8986..00000000 --- a/.github/workflows/test-dispatcher.yml +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: Test ISA dispatcher -on: - push: - branches: [ main ] - pull_request: - workflow_dispatch: - -# This allows a subsequently queued workflow run to interrupt previous runs -concurrency: - group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}' - cancel-in-progress: true - -jobs: - test-dispatcher-linux-x86: - name: Test Dispatcher (Linux x86) - runs-on: ubuntu-22.04 - strategy: - matrix: - cxx: [g++-11, g++-12, clang++-15] - include: - - cxx: g++-11 - cc: gcc-11 - - cxx: g++-12 - cc: gcc-12 - - cxx: clang++-15 - cc: clang-15 - env: - no_proxy: localhost,127.0.0.1 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install prerequisites - run: | - sudo apt-get update - sudo apt-get install -y wget tar grep - - - name: Download Intel SDE - run: | - wget --content-disposition "https://downloadmirror.intel.com/850782/sde-external-9.53.0-2025-03-16-lin.tar.xz" - tar -xf sde-external-*-lin.tar.xz - cd sde-external-*/ - export PATH="$PWD:$PATH" - echo "$PWD" >> $GITHUB_PATH - cd .. - - - name: Install archspec and Get Host Microarch - run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" - - - name: Check CPU Information - run: | - # Print host CPU info - chmod +x .github/scripts/print_cpu_info.sh - .github/scripts/print_cpu_info.sh - - - name: Validate dispatcher under SDE - env: - CXX: ${{ matrix.cxx }} - CC: ${{ matrix.cc }} - run: | - cd bindings/python - pip install --force-reinstall . - for flag in nhm hsw skx clx icl spr; do - export SDE_FLAG=$flag - # Print emulator CPU info - ../../.github/scripts/print_cpu_info.sh - sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . - done - - test-dispatcher-linux-arm: - name: Test Dispatcher (Linux ARM) - runs-on: ubuntu-22.04-arm - strategy: - matrix: - cxx: [g++-11, g++-12, clang++-15] - include: - - cxx: g++-11 - cc: gcc-11 - - cxx: g++-12 - cc: gcc-12 - - cxx: clang++-15 - cc: clang-15 - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install archspec and Get Host Microarch - run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" - - - name: Check CPU Information - run: | - chmod +x .github/scripts/print_cpu_info.sh - .github/scripts/print_cpu_info.sh - - - name: Build and Test Python Bindings - env: - CXX: ${{ matrix.cxx }} - CC: ${{ matrix.cc }} - run: | - cd bindings/python - pip install --force-reinstall . - python -m unittest discover -p "test_microarch.py" -s . - - test-dispatcher-macos: - name: Test Dispatcher (macOS) - runs-on: macos-latest - strategy: - matrix: - cxx: [clang++-15] - include: - - cxx: clang++-15 - package: llvm@15 - cc_name: clang - cxx_name: clang++ - needs_prefix: true - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install archspec and Get Host Microarch - run: | - python -m pip install archspec - python -c "import archspec.cpu; print(archspec.cpu.host().name)" - - - name: Check CPU Information - run: | - chmod +x .github/scripts/print_cpu_info.sh - .github/scripts/print_cpu_info.sh - - - name: Install Compiler - run: | - echo "Installing ${{ matrix.package }}..." - brew install ${{ matrix.package }} - - - name: Build and Test Python Bindings - run: | - if [[ "${{ matrix.needs_prefix }}" == "true" ]]; then - # For non-default packages like llvm@15, get the install prefix - COMPILER_PREFIX=$(brew --prefix ${{ matrix.package }}) - export CC="${COMPILER_PREFIX}/bin/${{ matrix.cc_name }}" - export CXX="${COMPILER_PREFIX}/bin/${{ matrix.cxx_name }}" - else - # For versioned GCC installs, the name is usually directly available - export CC="${{ matrix.cc_name }}" - export CXX="${{ matrix.cxx_name }}" - fi - - cd bindings/python - pip install --force-reinstall . - python -m unittest discover -p "test_microarch.py" -s . diff --git a/bindings/python/tests/test_microarch.py b/bindings/python/tests/test_microarch.py index 16d4267f..496cf7cd 100644 --- a/bindings/python/tests/test_microarch.py +++ b/bindings/python/tests/test_microarch.py @@ -17,13 +17,12 @@ import archspec.cpu as cpu import os + class MicroarchTester(unittest.TestCase): def test_microarch(self): supported_microarchs = svs.microarch.supported - # Will be set in dispatcher pipeline - archspec_host_name = os.environ.get("SDE_FLAG") - if not archspec_host_name: - archspec_host_name = cpu.host().name + # Get emulated microarch from SDE_FLAG or use the host CPU + host_microarch = os.environ.get("SDE_FLAG", cpu.host().name) mapping = { "nhm": "nehalem", "hsw": "haswell", @@ -33,7 +32,7 @@ def test_microarch(self): "icelake": "icelake_client", "spr": "sapphirerapids", } - archspec_host_name = mapping.get(archspec_host_name, archspec_host_name) + host_microarch = mapping.get(host_microarch, host_microarch) - if archspec_host_name in supported_microarchs: - self.assertTrue(archspec_host_name == svs.microarch.current) + if host_microarch in supported_microarchs: + self.assertTrue(host_microarch == svs.microarch.current) From d07feadcfc38680c3a861826c7bf735ea8a046c0 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 8 May 2025 07:39:30 -0700 Subject: [PATCH 51/65] Fix for SDE --- .github/workflows/build-linux.yml | 2 +- bindings/python/tests/test_microarch.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index 94b1a28b..23d9a90a 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -104,7 +104,7 @@ jobs: - name: Run Python Microarch Test with SDE run: | cd bindings/python - for flag in nhm hsw skx clx icl spr; do + for flag in nhm hsw skx clx icl; do echo "SDE emulation: $flag" export SDE_FLAG=$flag sde64 -$flag -- python -c "import svs; svs.microarch.describe()" diff --git a/bindings/python/tests/test_microarch.py b/bindings/python/tests/test_microarch.py index 496cf7cd..670ae779 100644 --- a/bindings/python/tests/test_microarch.py +++ b/bindings/python/tests/test_microarch.py @@ -20,7 +20,6 @@ class MicroarchTester(unittest.TestCase): def test_microarch(self): - supported_microarchs = svs.microarch.supported # Get emulated microarch from SDE_FLAG or use the host CPU host_microarch = os.environ.get("SDE_FLAG", cpu.host().name) mapping = { @@ -34,5 +33,5 @@ def test_microarch(self): } host_microarch = mapping.get(host_microarch, host_microarch) - if host_microarch in supported_microarchs: + if host_microarch in svs.microarch.compiled: self.assertTrue(host_microarch == svs.microarch.current) From e9ee015c5ea0ae5def5ac958cfdc3b0cb7d9f26a Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 12 May 2025 08:23:40 -0700 Subject: [PATCH 52/65] Replace x86_64 base uarch (nehalem) with x86_64_v2 --- cmake/microarch_targets_x86_64 | 2 +- include/svs/lib/arch.h | 13 +++++++++++++ include/svs/lib/arch_defines.h | 9 +++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 index bac14630..418bf777 100644 --- a/cmake/microarch_targets_x86_64 +++ b/cmake/microarch_targets_x86_64 @@ -1,4 +1,4 @@ -nehalem +x86_64_v2 haswell x86_64_v4 skylake_avx512 diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 81cab3e0..a2753fbd 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -29,6 +29,7 @@ enum class MicroArch { #if defined(__x86_64__) // Refer to the GCC docs for the list of targeted architectures: // https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html + x86_64_v2, nehalem, westmere, sandybridge, @@ -67,6 +68,16 @@ struct MicroArchInfo { inline const std::unordered_map& get_microarch_info_map() { static const std::unordered_map microarch_info = { #if defined(__x86_64__) + {MicroArch::x86_64_v2, + {std::nullopt, + {ISAExt::SSE3, + ISAExt::SSSE3, + ISAExt::SSE4_1, + ISAExt::SSE4_2, + ISAExt::POPCNT, + ISAExt::CX16, + ISAExt::SAHF}, + "x86_64_v2"}}, {MicroArch::nehalem, {std::nullopt, {ISAExt::MMX, @@ -248,6 +259,7 @@ class MicroArchEnvironment { MicroArchEnvironment() { const std::vector compiled_archs = { #if defined(__x86_64__) + SVS_MICROARCH_COMPILED_x86_64_v2 SVS_MICROARCH_COMPILED_nehalem SVS_MICROARCH_COMPILED_westmere SVS_MICROARCH_COMPILED_sandybridge @@ -297,6 +309,7 @@ class MicroArchEnvironment { svs::arch::MicroArch cpu_arch = \ svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v2(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, SVS_PACK_ARGS(args)) \ diff --git a/include/svs/lib/arch_defines.h b/include/svs/lib/arch_defines.h index 8c4eda6a..aa0e5c63 100644 --- a/include/svs/lib/arch_defines.h +++ b/include/svs/lib/arch_defines.h @@ -24,6 +24,15 @@ // TODO: autogenerate this list #if defined(__x86_64__) +#if defined(SVS_MICROARCH_SUPPORT_x86_64_v2) +#define SVS_MICROARCH_COMPILED_x86_64_v2 MicroArch::x86_64_v2, +#define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v2(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(x86_64_v2, cls, method, SVS_PACK_ARGS(args)) +#else +#define SVS_MICROARCH_COMPILED_x86_64_v2 +#define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v2(cls, method, args) +#endif + #if defined(SVS_MICROARCH_SUPPORT_nehalem) #define SVS_MICROARCH_COMPILED_nehalem MicroArch::nehalem, #define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) \ From ee2309c2704f7ae1d6d0fbd6d886a10b12fec72a Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 12 May 2025 09:31:15 -0700 Subject: [PATCH 53/65] Fix for base uarch flags --- cmake/microarch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/microarch.cmake b/cmake/microarch.cmake index 9d67735a..71b1b848 100644 --- a/cmake/microarch.cmake +++ b/cmake/microarch.cmake @@ -95,7 +95,7 @@ list(POP_FRONT OPTIMIZATION_FLAGS BASE_OPT_FLAGS) string(REPLACE "," ";" BASE_OPT_FLAGS ${BASE_OPT_FLAGS}) message("Opt.flags[base=${BASE_MICROARCH}]: ${BASE_OPT_FLAGS}") -target_compile_options(svs_microarch_options_base INTERFACE ${BASE_OPT_FLAGS} -DSVS_MICROARCH_SUPPORT_${BASE_MICROARCH}) +target_compile_options(svs_microarch_options_base INTERFACE ${BASE_OPT_FLAGS} -DSVS_MICROARCH_SUPPORT_${BASE_MICROARCH} -DSVS_TUNE_TARGET=${BASE_MICROARCH}) foreach(MICROARCH OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) # Tell the microarch dispatcher to include this microarch branch From 11c6c096593ce11c45ca64d40ed1db90a53f5ea6 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 12 May 2025 09:32:46 -0700 Subject: [PATCH 54/65] Exp.: change base uarch to sandybridge --- cmake/microarch_targets_x86_64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 index 418bf777..9606b7a1 100644 --- a/cmake/microarch_targets_x86_64 +++ b/cmake/microarch_targets_x86_64 @@ -1,4 +1,4 @@ -x86_64_v2 +sandybridge haswell x86_64_v4 skylake_avx512 From 90108198fe50fc7d955396eb7d4ebcdda2823d5f Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 13 May 2025 01:51:15 -0700 Subject: [PATCH 55/65] Exp.: change base uarch to haswell --- cmake/microarch_targets_x86_64 | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 index 9606b7a1..74e49edc 100644 --- a/cmake/microarch_targets_x86_64 +++ b/cmake/microarch_targets_x86_64 @@ -1,4 +1,3 @@ -sandybridge haswell x86_64_v4 skylake_avx512 From 5d639a59745de945675435a89ac4cd5f44f028db Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 13 May 2025 02:13:25 -0700 Subject: [PATCH 56/65] Revert x86_64 base uarch changes --- cmake/microarch_targets_x86_64 | 1 + include/svs/lib/arch.h | 13 ------------- include/svs/lib/arch_defines.h | 9 --------- 3 files changed, 1 insertion(+), 22 deletions(-) diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 index 74e49edc..bac14630 100644 --- a/cmake/microarch_targets_x86_64 +++ b/cmake/microarch_targets_x86_64 @@ -1,3 +1,4 @@ +nehalem haswell x86_64_v4 skylake_avx512 diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index a2753fbd..81cab3e0 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -29,7 +29,6 @@ enum class MicroArch { #if defined(__x86_64__) // Refer to the GCC docs for the list of targeted architectures: // https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html - x86_64_v2, nehalem, westmere, sandybridge, @@ -68,16 +67,6 @@ struct MicroArchInfo { inline const std::unordered_map& get_microarch_info_map() { static const std::unordered_map microarch_info = { #if defined(__x86_64__) - {MicroArch::x86_64_v2, - {std::nullopt, - {ISAExt::SSE3, - ISAExt::SSSE3, - ISAExt::SSE4_1, - ISAExt::SSE4_2, - ISAExt::POPCNT, - ISAExt::CX16, - ISAExt::SAHF}, - "x86_64_v2"}}, {MicroArch::nehalem, {std::nullopt, {ISAExt::MMX, @@ -259,7 +248,6 @@ class MicroArchEnvironment { MicroArchEnvironment() { const std::vector compiled_archs = { #if defined(__x86_64__) - SVS_MICROARCH_COMPILED_x86_64_v2 SVS_MICROARCH_COMPILED_nehalem SVS_MICROARCH_COMPILED_westmere SVS_MICROARCH_COMPILED_sandybridge @@ -309,7 +297,6 @@ class MicroArchEnvironment { svs::arch::MicroArch cpu_arch = \ svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ switch (cpu_arch) { \ - SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v2(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, SVS_PACK_ARGS(args)) \ SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, SVS_PACK_ARGS(args)) \ diff --git a/include/svs/lib/arch_defines.h b/include/svs/lib/arch_defines.h index aa0e5c63..8c4eda6a 100644 --- a/include/svs/lib/arch_defines.h +++ b/include/svs/lib/arch_defines.h @@ -24,15 +24,6 @@ // TODO: autogenerate this list #if defined(__x86_64__) -#if defined(SVS_MICROARCH_SUPPORT_x86_64_v2) -#define SVS_MICROARCH_COMPILED_x86_64_v2 MicroArch::x86_64_v2, -#define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v2(cls, method, args) \ - SVS_CLASS_METHOD_MICROARCH_CASE(x86_64_v2, cls, method, SVS_PACK_ARGS(args)) -#else -#define SVS_MICROARCH_COMPILED_x86_64_v2 -#define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v2(cls, method, args) -#endif - #if defined(SVS_MICROARCH_SUPPORT_nehalem) #define SVS_MICROARCH_COMPILED_nehalem MicroArch::nehalem, #define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) \ From ce487829b714dad09b8de012b6c9cf1dd2b39ca8 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Tue, 13 May 2025 10:06:42 -0700 Subject: [PATCH 57/65] Change template args for DistanceImpl --- include/svs/core/distance/cosine.h | 16 ++++++------- include/svs/core/distance/euclidean.h | 28 +++++++++++------------ include/svs/core/distance/inner_product.h | 28 +++++++++++------------ 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 76eb26f1..ee8d0bb6 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -235,7 +235,7 @@ template <> struct CosineFloatOp<16> : public svs::simd::ConvertToFloat<16> { // Small Integers SVS_VALIDATE_BOOL_ENV(SVS_AVX512_VNNI) #if SVS_AVX512_VNNI -template struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -261,7 +261,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -289,7 +289,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -297,7 +297,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -305,7 +305,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -313,7 +313,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -321,7 +321,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -329,7 +329,7 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); diff --git a/include/svs/core/distance/euclidean.h b/include/svs/core/distance/euclidean.h index 93b82dfe..d5b01a65 100644 --- a/include/svs/core/distance/euclidean.h +++ b/include/svs/core/distance/euclidean.h @@ -257,14 +257,14 @@ template <> struct L2VNNIOp : public svs::simd::ConvertForVNNI struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2VNNIOp(), a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2VNNIOp(), a, b, length); @@ -274,42 +274,42 @@ template struct L2Impl { #endif // Floating and Mixed Types -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); }; }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); }; }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); @@ -325,7 +325,7 @@ template struct L2Impl { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -345,7 +345,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -367,7 +367,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -388,7 +388,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -412,7 +412,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -439,7 +439,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; diff --git a/include/svs/core/distance/inner_product.h b/include/svs/core/distance/inner_product.h index 9acc4a86..166342e8 100644 --- a/include/svs/core/distance/inner_product.h +++ b/include/svs/core/distance/inner_product.h @@ -212,14 +212,14 @@ template <> struct IPVNNIOp : public svs::simd::ConvertForVNNI struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(IPVNNIOp(), a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(IPVNNIOp(), a, b, length); @@ -229,42 +229,42 @@ template struct IPImpl { #endif // Floating and Mixed Types -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); }; }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); }; }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); @@ -279,7 +279,7 @@ template struct IPImpl { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -298,7 +298,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -319,7 +319,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -339,7 +339,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -362,7 +362,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -388,7 +388,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; From b1a51c582cb7d5150afb4bc7e854684da8170fbf Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 14 May 2025 08:08:38 -0700 Subject: [PATCH 58/65] Extend macros to correct linking --- examples/cpp/CMakeLists.txt | 2 +- include/svs/core/distance/cosine.h | 18 +++++- include/svs/core/distance/euclidean.h | 18 +++++- include/svs/core/distance/inner_product.h | 18 +++++- include/svs/lib/arch.h | 75 +++++++++++++++++++++++ include/svs/lib/arch_defines.h | 65 ++++++++++++++++++++ 6 files changed, 189 insertions(+), 7 deletions(-) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 6110f4d9..6040e156 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -63,7 +63,7 @@ add_test( ) # The custom thread pool executable. -add_executable(custom_thread_pool custom_thread_pool.cpp) +add_executable(custom_thread_pool custom_thread_pool.cpp ${MICROARCH_OBJECT_FILES}) target_include_directories(custom_thread_pool PRIVATE ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(custom_thread_pool ${SVS_LIB} svs_compile_options svs_microarch_options_base) add_test( diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index ee8d0bb6..0a9e4bb2 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -46,14 +46,14 @@ struct CosineSimilarityImpl; template class CosineSimilarity { public: template - static constexpr float compute(const Ea* a, const Eb* b, float a_norm, size_t N) { + SVS_NOINLINE static float compute(const Ea* a, const Eb* b, float a_norm, size_t N) { return CosineSimilarityImpl::compute( a, b, a_norm, lib::MaybeStatic(N) ); } template - static constexpr float compute(const Ea* a, const Eb* b, float a_norm) { + SVS_NOINLINE static float compute(const Ea* a, const Eb* b, float a_norm) { return CosineSimilarityImpl::compute( a, b, a_norm, lib::MaybeStatic() ); @@ -367,4 +367,18 @@ template struct CosineSimilarityImpl struct template class L2 { public: template - static constexpr float compute(const Ea* a, const Eb* b, size_t N) { + SVS_NOINLINE static float compute(const Ea* a, const Eb* b, size_t N) { return L2Impl::compute(a, b, lib::MaybeStatic(N)); } template - static constexpr float compute(const Ea* a, const Eb* b) { + SVS_NOINLINE static float compute(const Ea* a, const Eb* b) { return L2Impl::compute(a, b, lib::MaybeStatic()); } }; @@ -481,4 +481,18 @@ template struct L2Impl struct template class IP { public: template - static constexpr float compute(const Ea* a, const Eb* b, size_t N) { + SVS_NOINLINE static float compute(const Ea* a, const Eb* b, size_t N) { return IPImpl::compute(a, b, lib::MaybeStatic(N)); } template - static constexpr float compute(const Ea* a, const Eb* b) { + SVS_NOINLINE static float compute(const Ea* a, const Eb* b) { return IPImpl::compute(a, b, lib::MaybeStatic()); } }; @@ -429,4 +429,18 @@ template struct IPImpl::method(args); + // TODO: autogenerate this list #if defined(__x86_64__) @@ -28,144 +33,192 @@ #define SVS_MICROARCH_COMPILED_nehalem MicroArch::nehalem, #define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_nehalem(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(nehalem, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_nehalem #define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_nehalem(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_westmere) #define SVS_MICROARCH_COMPILED_westmere MicroArch::westmere, #define SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(westmere, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_westmere(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(westmere, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_westmere #define SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_westmere(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_sandybridge) #define SVS_MICROARCH_COMPILED_sandybridge MicroArch::sandybridge, #define SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(sandybridge, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_sandybridge(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(sandybridge, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_sandybridge #define SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_sandybridge(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_ivybridge) #define SVS_MICROARCH_COMPILED_ivybridge MicroArch::ivybridge, #define SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(ivybridge, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_ivybridge(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(ivybridge, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_ivybridge #define SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_ivybridge(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_haswell) #define SVS_MICROARCH_COMPILED_haswell MicroArch::haswell, #define SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_haswell(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(haswell, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_haswell #define SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_haswell(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_broadwell) #define SVS_MICROARCH_COMPILED_broadwell MicroArch::broadwell, #define SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_broadwell(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(broadwell, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_broadwell #define SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_broadwell(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_skylake) #define SVS_MICROARCH_COMPILED_skylake MicroArch::skylake, #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_skylake(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(skylake, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_skylake #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_skylake(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_x86_64_v4) #define SVS_MICROARCH_COMPILED_x86_64_v4 MicroArch::x86_64_v4, #define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(x86_64_v4, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_x86_64_v4(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(x86_64_v4, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_x86_64_v4 #define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_x86_64_v4(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_skylake_avx512) #define SVS_MICROARCH_COMPILED_skylake_avx512 MicroArch::skylake_avx512, #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_skylake_avx512(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(skylake_avx512, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_skylake_avx512 #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_skylake_avx512(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_cascadelake) #define SVS_MICROARCH_COMPILED_cascadelake MicroArch::cascadelake, #define SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_cascadelake(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(cascadelake, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_cascadelake #define SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_cascadelake(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_cooperlake) #define SVS_MICROARCH_COMPILED_cooperlake MicroArch::cooperlake, #define SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(cooperlake, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_cooperlake(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(cooperlake, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_cooperlake #define SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_cooperlake(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_icelake_client) #define SVS_MICROARCH_COMPILED_icelake_client MicroArch::icelake_client, #define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(icelake_client, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_icelake_client(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(icelake_client, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_icelake_client #define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_icelake_client(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_icelake_server) #define SVS_MICROARCH_COMPILED_icelake_server MicroArch::icelake_server, #define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(icelake_server, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_icelake_server(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(icelake_server, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_icelake_server #define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_icelake_server(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_sapphirerapids) #define SVS_MICROARCH_COMPILED_sapphirerapids MicroArch::sapphirerapids, #define SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_sapphirerapids(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(sapphirerapids, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_sapphirerapids #define SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_sapphirerapids(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_graniterapids) #define SVS_MICROARCH_COMPILED_graniterapids MicroArch::graniterapids, #define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(graniterapids, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_graniterapids(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(graniterapids, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_graniterapids #define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_graniterapids(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_graniterapids_d) #define SVS_MICROARCH_COMPILED_graniterapids_d MicroArch::graniterapids_d, #define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(graniterapids_d, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_graniterapids_d(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(graniterapids_d, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_graniterapids_d #define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_graniterapids_d(return_type, cls, method, template_args, args) #endif #elif defined(__aarch64__) @@ -176,18 +229,24 @@ #define SVS_MICROARCH_COMPILED_m1 MicroArch::m1, #define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_m1(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(m1, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_m1 #define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_m1(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_m2) #define SVS_MICROARCH_COMPILED_m2 MicroArch::m2, #define SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_m2(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(m2, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_m2 #define SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_m2(return_type, cls, method, template_args, args) #endif #else @@ -196,18 +255,24 @@ #define SVS_MICROARCH_COMPILED_neoverse_v1 MicroArch::neoverse_v1, #define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_neoverse_v1(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(neoverse_v1, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_neoverse_v1 #define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_neoverse_v1(return_type, cls, method, template_args, args) #endif #if defined(SVS_MICROARCH_SUPPORT_neoverse_n2) #define SVS_MICROARCH_COMPILED_neoverse_n2 MicroArch::neoverse_n2, #define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_neoverse_n2(return_type, cls, method, template_args, args) \ + SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(neoverse_n2, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) #else #define SVS_MICROARCH_COMPILED_neoverse_n2 #define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_neoverse_n2(return_type, cls, method, template_args, args) #endif #endif From 74dc321dda5d18752ec7895fe53bf9842fa13229 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 14 May 2025 09:42:17 -0700 Subject: [PATCH 59/65] Linting --- bindings/python/src/python_bindings.cpp | 69 ++++--- include/svs/core/distance/cosine.h | 48 +++-- include/svs/core/distance/euclidean.h | 4 +- include/svs/core/distance/inner_product.h | 4 +- include/svs/lib/arch_defines.h | 223 ++++++++++++++++++---- 5 files changed, 253 insertions(+), 95 deletions(-) diff --git a/bindings/python/src/python_bindings.cpp b/bindings/python/src/python_bindings.cpp index dfbf3e88..cf155362 100644 --- a/bindings/python/src/python_bindings.cpp +++ b/bindings/python/src/python_bindings.cpp @@ -43,8 +43,8 @@ // stl #include -#include #include +#include namespace py = pybind11; @@ -189,15 +189,14 @@ Convert the `fvecs` file on disk with 32-bit floating point entries to a `fvecs` wrap_conversion(m); - m.def( - "_print_cpu_extensions_status", - []() { - svs::arch::write_extensions_status(std::cout); - } - ); + m.def("_print_cpu_extensions_status", []() { + svs::arch::write_extensions_status(std::cout); + }); // Wrapper for svs::arch::MicroArchEnvironment - py::class_(m, "microarch", "Microarchitecture management singleton") + py::class_( + m, "microarch", "Microarchitecture management singleton" + ) .def_static( "get", []() -> svs::arch::MicroArchEnvironment& { @@ -242,36 +241,34 @@ Convert the `fvecs` file on disk with 32-bit floating point entries to a `fvecs` }, "Returns a list of compiled microarchitectures." ) - .def_static( - "describe", - []() { - std::ostream& out = std::cout; - auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); - - // Print support status for all ISA extensions - svs::arch::write_extensions_status(out); - - // Print current microarchitecture - auto current_arch = arch_env.get_microarch(); - out << "\nCurrent µarch: " << svs::arch::microarch_to_string(current_arch) << std::endl; - - // Print all supported microarchitectures - const auto& supported_archs = arch_env.get_supported_microarchs(); - out << "\nSupported µarchs: "; - for (const auto& arch : supported_archs) { - out << svs::arch::microarch_to_string(arch) << " "; - } - out << std::endl; + .def_static("describe", []() { + std::ostream& out = std::cout; + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + + // Print support status for all ISA extensions + svs::arch::write_extensions_status(out); + + // Print current microarchitecture + auto current_arch = arch_env.get_microarch(); + out << "\nCurrent µarch: " << svs::arch::microarch_to_string(current_arch) + << std::endl; + + // Print all supported microarchitectures + const auto& supported_archs = arch_env.get_supported_microarchs(); + out << "\nSupported µarchs: "; + for (const auto& arch : supported_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; - // Print all compiled microarchitectures - const auto& compiled_archs = arch_env.get_compiled_microarchs(); - out << "\nCompiled µarchs: "; - for (const auto& arch : compiled_archs) { - out << svs::arch::microarch_to_string(arch) << " "; - } - out << std::endl; + // Print all compiled microarchitectures + const auto& compiled_archs = arch_env.get_compiled_microarchs(); + out << "\nCompiled µarchs: "; + for (const auto& arch : compiled_archs) { + out << svs::arch::microarch_to_string(arch) << " "; } - ); + out << std::endl; + }); // Allocators svs::python::allocators::wrap(m); diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 0a9e4bb2..4f6e56e9 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -235,7 +235,8 @@ template <> struct CosineFloatOp<16> : public svs::simd::ConvertToFloat<16> { // Small Integers SVS_VALIDATE_BOOL_ENV(SVS_AVX512_VNNI) #if SVS_AVX512_VNNI -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -261,7 +262,8 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -289,7 +291,8 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -297,7 +300,8 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -305,7 +309,8 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -313,7 +318,8 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -321,7 +327,8 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -329,7 +336,8 @@ template struct CosineSimilarityImpl struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -367,16 +375,20 @@ template struct CosineSimilarityImpl struct L2Impl struct IPImpl::method(args); + extern template return_type \ + cls::method(args); // TODO: autogenerate this list #if defined(__x86_64__) @@ -34,7 +35,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_nehalem(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(nehalem, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + nehalem, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_nehalem #define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) @@ -46,7 +54,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(westmere, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_westmere(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(westmere, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + westmere, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_westmere #define SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, args) @@ -58,7 +73,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(sandybridge, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_sandybridge(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(sandybridge, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + sandybridge, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_sandybridge #define SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, args) @@ -70,7 +92,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(ivybridge, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_ivybridge(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(ivybridge, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + ivybridge, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_ivybridge #define SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, args) @@ -82,7 +111,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_haswell(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(haswell, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + haswell, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_haswell #define SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, args) @@ -94,7 +130,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_broadwell(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(broadwell, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + broadwell, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_broadwell #define SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, args) @@ -106,7 +149,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_skylake(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(skylake, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + skylake, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_skylake #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) @@ -118,7 +168,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(x86_64_v4, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_x86_64_v4(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(x86_64_v4, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + x86_64_v4, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_x86_64_v4 #define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, args) @@ -129,12 +186,23 @@ #define SVS_MICROARCH_COMPILED_skylake_avx512 MicroArch::skylake_avx512, #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) -#define SVS_EXTERN_CLASS_METHOD_skylake_avx512(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(skylake_avx512, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_skylake_avx512( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + skylake_avx512, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_skylake_avx512 #define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) -#define SVS_EXTERN_CLASS_METHOD_skylake_avx512(return_type, cls, method, template_args, args) +#define SVS_EXTERN_CLASS_METHOD_skylake_avx512( \ + return_type, cls, method, template_args, args \ +) #endif #if defined(SVS_MICROARCH_SUPPORT_cascadelake) @@ -142,7 +210,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_cascadelake(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(cascadelake, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + cascadelake, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_cascadelake #define SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, args) @@ -154,7 +229,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(cooperlake, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_cooperlake(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(cooperlake, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + cooperlake, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_cooperlake #define SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, args) @@ -165,44 +247,86 @@ #define SVS_MICROARCH_COMPILED_icelake_client MicroArch::icelake_client, #define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(icelake_client, cls, method, SVS_PACK_ARGS(args)) -#define SVS_EXTERN_CLASS_METHOD_icelake_client(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(icelake_client, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_icelake_client( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + icelake_client, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_icelake_client #define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, args) -#define SVS_EXTERN_CLASS_METHOD_icelake_client(return_type, cls, method, template_args, args) +#define SVS_EXTERN_CLASS_METHOD_icelake_client( \ + return_type, cls, method, template_args, args \ +) #endif #if defined(SVS_MICROARCH_SUPPORT_icelake_server) #define SVS_MICROARCH_COMPILED_icelake_server MicroArch::icelake_server, #define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(icelake_server, cls, method, SVS_PACK_ARGS(args)) -#define SVS_EXTERN_CLASS_METHOD_icelake_server(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(icelake_server, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_icelake_server( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + icelake_server, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_icelake_server #define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, args) -#define SVS_EXTERN_CLASS_METHOD_icelake_server(return_type, cls, method, template_args, args) +#define SVS_EXTERN_CLASS_METHOD_icelake_server( \ + return_type, cls, method, template_args, args \ +) #endif #if defined(SVS_MICROARCH_SUPPORT_sapphirerapids) #define SVS_MICROARCH_COMPILED_sapphirerapids MicroArch::sapphirerapids, #define SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) -#define SVS_EXTERN_CLASS_METHOD_sapphirerapids(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(sapphirerapids, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_sapphirerapids( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + sapphirerapids, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_sapphirerapids #define SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, args) -#define SVS_EXTERN_CLASS_METHOD_sapphirerapids(return_type, cls, method, template_args, args) +#define SVS_EXTERN_CLASS_METHOD_sapphirerapids( \ + return_type, cls, method, template_args, args \ +) #endif #if defined(SVS_MICROARCH_SUPPORT_graniterapids) #define SVS_MICROARCH_COMPILED_graniterapids MicroArch::graniterapids, #define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(graniterapids, cls, method, SVS_PACK_ARGS(args)) -#define SVS_EXTERN_CLASS_METHOD_graniterapids(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(graniterapids, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_graniterapids( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + graniterapids, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_graniterapids #define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, args) @@ -213,12 +337,23 @@ #define SVS_MICROARCH_COMPILED_graniterapids_d MicroArch::graniterapids_d, #define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(graniterapids_d, cls, method, SVS_PACK_ARGS(args)) -#define SVS_EXTERN_CLASS_METHOD_graniterapids_d(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(graniterapids_d, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_graniterapids_d( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + graniterapids_d, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_graniterapids_d #define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, args) -#define SVS_EXTERN_CLASS_METHOD_graniterapids_d(return_type, cls, method, template_args, args) +#define SVS_EXTERN_CLASS_METHOD_graniterapids_d( \ + return_type, cls, method, template_args, args \ +) #endif #elif defined(__aarch64__) @@ -229,8 +364,10 @@ #define SVS_MICROARCH_COMPILED_m1 MicroArch::m1, #define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) -#define SVS_EXTERN_CLASS_METHOD_m1(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(m1, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_m1(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + m1, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_m1 #define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, args) @@ -241,8 +378,10 @@ #define SVS_MICROARCH_COMPILED_m2 MicroArch::m2, #define SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) -#define SVS_EXTERN_CLASS_METHOD_m2(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(m2, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_m2(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + m2, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_m2 #define SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, args) @@ -256,7 +395,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_neoverse_v1(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(neoverse_v1, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + neoverse_v1, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_neoverse_v1 #define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, args) @@ -268,7 +414,14 @@ #define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, args) \ SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) #define SVS_EXTERN_CLASS_METHOD_neoverse_n2(return_type, cls, method, template_args, args) \ - SVS_EXT_CLASS_METHOD_TMPL_BY_MICROARCH(neoverse_n2, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args)) + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + neoverse_n2, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) #else #define SVS_MICROARCH_COMPILED_neoverse_n2 #define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, args) From aea03905cf427a5618907c67094cec403edf188a Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 14 May 2025 09:47:16 -0700 Subject: [PATCH 60/65] Fix for ARM platforms and correct naming --- include/svs/core/distance/cosine.h | 16 ++++++++-------- include/svs/core/distance/euclidean.h | 16 ++++++++-------- include/svs/core/distance/inner_product.h | 16 ++++++++-------- include/svs/lib/arch.h | 14 ++++++-------- 4 files changed, 30 insertions(+), 32 deletions(-) diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 4f6e56e9..49281c9e 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -376,18 +376,18 @@ struct CosineSimilarityImpl { ) #define SVS_EXTERN_COSINE_DISTANCE \ - SVS_EXTENT_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, int8_t, int8_t) \ - SVS_EXTENT_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, uint8_t, uint8_t) \ - SVS_EXTENT_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, float) \ - SVS_EXTENT_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, uint8_t) \ - SVS_EXTENT_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, int8_t) \ - SVS_EXTENT_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, int8_t, int8_t) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, uint8_t, uint8_t) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, float) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, uint8_t) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, int8_t) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ CosineSimilarity, float, svs::float16::Float16 \ ) \ - SVS_EXTENT_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ CosineSimilarity, svs::float16::Float16, float \ ) \ - SVS_EXTENT_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ CosineSimilarity, svs::float16::Float16, svs::float16::Float16 \ ) diff --git a/include/svs/core/distance/euclidean.h b/include/svs/core/distance/euclidean.h index f96be955..6a612684 100644 --- a/include/svs/core/distance/euclidean.h +++ b/include/svs/core/distance/euclidean.h @@ -482,14 +482,14 @@ template struct L2Impl struct IPImpl Date: Wed, 14 May 2025 09:53:23 -0700 Subject: [PATCH 61/65] Fix for ARM platforms --- include/svs/lib/arch.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h index 345a5715..ec7d0217 100644 --- a/include/svs/lib/arch.h +++ b/include/svs/lib/arch.h @@ -362,7 +362,7 @@ class MicroArchEnvironment { SVS_PACK_ARGS(a_type, b_type), \ SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long) \ ) -// Generic distance extent macro +// Generic distance extern macro (required for external linking to uarch-specific implementations) #if defined(__x86_64__) #define SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ @@ -407,7 +407,7 @@ class MicroArchEnvironment { SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long) \ ) -// Cosine distance extent macro +// Cosine distance extern macro (required for external linking to uarch-specific implementations) #if defined(__x86_64__) #define SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ @@ -432,13 +432,11 @@ class MicroArchEnvironment { #if defined(__APPLE__) #define SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ - SVS_EXTERN_CLASS_METHOD_m1(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ SVS_EXTERN_CLASS_METHOD_m2(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) #else #define SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ - SVS_EXTERN_CLASS_METHOD_neoverse_v1(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ SVS_EXTERN_CLASS_METHOD_neoverse_n2(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) #endif From ec32357c01dca22572b9f8ca283e179094fed4bb Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 15 May 2025 12:00:13 -0700 Subject: [PATCH 62/65] Finalize tests for CosineSimilarity --- tests/svs/core/distances/cosine.cpp | 131 ++++++++++++++++------------ 1 file changed, 76 insertions(+), 55 deletions(-) diff --git a/tests/svs/core/distances/cosine.cpp b/tests/svs/core/distances/cosine.cpp index 24915d97..a2fad3ac 100644 --- a/tests/svs/core/distances/cosine.cpp +++ b/tests/svs/core/distances/cosine.cpp @@ -82,20 +82,41 @@ void test_types(T lo, T hi, size_t num_tests) { .epsilon(COSINE_EPSILON) .margin(COSINE_MARGIN); - // Statically Sized Computation - auto a_norm = svs::distance::norm(std::span{a.data(), a.size()}); - CATCH_REQUIRE( - // TODO: replace baseline with something else? - (svs::distance::CosineSimilarity::compute( - a.data(), b.data(), a_norm - ) == expected) - ); - // Dynamically Sized Computation - auto dist = - svs::distance::CosineSimilarity::compute( - a.data(), b.data(), a_norm, N + auto as_span = [](auto v) { return std::span{v.data(), v.size()}; }; + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + auto supported_archs = arch_env.get_supported_microarchs(); + + for (auto arch : supported_archs) { + arch_env.set_microarch(arch); + auto dist = svs::distance::DistanceCosineSimilarity{}; + dist.fix_argument(as_span(a)); + auto result = svs::distance::compute( + dist, as_span(a), as_span(b) + ); - CATCH_REQUIRE((dist == expected)); + CATCH_REQUIRE(result == expected); + } + + // Checking statically and dynamically sized computation requires a direct + // call to the compute function. We pick any MicroArch, since all available + // are already tested above. + auto a_norm = svs::distance::norm(std::span{a.data(), a.size()}); + { + // Statically Sized Computation + auto dist = + svs::distance::CosineSimilarity::compute( + a.data(), b.data(), a_norm + ); + CATCH_REQUIRE(dist == expected); + } + { + // Dynamically Sized Computation + auto dist = + svs::distance::CosineSimilarity::compute( + a.data(), b.data(), a_norm, N + ); + CATCH_REQUIRE(dist == expected); + } } } } // anonymous namespace @@ -214,46 +235,46 @@ CATCH_TEST_CASE( "Benchmark CosineSimilarity Distance", "[distance][cosinesimilarity_distance][benchmark_suite][!benchmark]" ) { - auto num_elements = 1000000; - // Types: `float` and `float` - run_benchmark(num_elements, -1.0f, 1.0f); - run_benchmark(num_elements, -1.0f, 1.0f, 128); - run_benchmark(num_elements, -1.0f, 1.0f); - run_benchmark(num_elements, -1.0f, 1.0f, 100); - - // Types: `float` and `svs::Float16` - run_benchmark(num_elements, -1.0f, 1.0f); - run_benchmark(num_elements, -1.0f, 1.0f, 128); - run_benchmark(num_elements, -1.0f, 1.0f); - run_benchmark(num_elements, -1.0f, 1.0f, 100); - - // Types: `svs::Float16` and `svs::Float16` - run_benchmark(num_elements, -1.0f, 1.0f); - run_benchmark(num_elements, -1.0f, 1.0f, 128); - run_benchmark(num_elements, -1.0f, 1.0f); - run_benchmark(num_elements, -1.0f, 1.0f, 100); - - // Types: `float` and `int8_t` - run_benchmark(num_elements, -128, 127); - run_benchmark(num_elements, -128, 127, 128); - run_benchmark(num_elements, -128, 127); - run_benchmark(num_elements, -128, 127, 100); - - // Types: `float` and `uint8_t` - run_benchmark(num_elements, 0, 255); - run_benchmark(num_elements, 0, 255, 128); - run_benchmark(num_elements, 0, 255); - run_benchmark(num_elements, 0, 255, 100); - - // Types: `uint8_t` and `uint8_t` - run_benchmark(num_elements, 0, 255); - run_benchmark(num_elements, 0, 255, 128); - run_benchmark(num_elements, 0, 255); - run_benchmark(num_elements, 0, 255, 100); - - // Types: `int8_t` and `int8_t` - run_benchmark(num_elements, -128, 127); - run_benchmark(num_elements, -128, 127, 128); - run_benchmark(num_elements, -128, 127); - run_benchmark(num_elements, -128, 127, 100); + // auto num_elements = 1000000; + // // Types: `float` and `float` + // run_benchmark(num_elements, -1.0f, 1.0f); + // run_benchmark(num_elements, -1.0f, 1.0f, 128); + // run_benchmark(num_elements, -1.0f, 1.0f); + // run_benchmark(num_elements, -1.0f, 1.0f, 100); + + // // Types: `float` and `svs::Float16` + // run_benchmark(num_elements, -1.0f, 1.0f); + // run_benchmark(num_elements, -1.0f, 1.0f, 128); + // run_benchmark(num_elements, -1.0f, 1.0f); + // run_benchmark(num_elements, -1.0f, 1.0f, 100); + + // // Types: `svs::Float16` and `svs::Float16` + // run_benchmark(num_elements, -1.0f, 1.0f); + // run_benchmark(num_elements, -1.0f, 1.0f, 128); + // run_benchmark(num_elements, -1.0f, 1.0f); + // run_benchmark(num_elements, -1.0f, 1.0f, 100); + + // // Types: `float` and `int8_t` + // run_benchmark(num_elements, -128, 127); + // run_benchmark(num_elements, -128, 127, 128); + // run_benchmark(num_elements, -128, 127); + // run_benchmark(num_elements, -128, 127, 100); + + // // Types: `float` and `uint8_t` + // run_benchmark(num_elements, 0, 255); + // run_benchmark(num_elements, 0, 255, 128); + // run_benchmark(num_elements, 0, 255); + // run_benchmark(num_elements, 0, 255, 100); + + // // Types: `uint8_t` and `uint8_t` + // run_benchmark(num_elements, 0, 255); + // run_benchmark(num_elements, 0, 255, 128); + // run_benchmark(num_elements, 0, 255); + // run_benchmark(num_elements, 0, 255, 100); + + // // Types: `int8_t` and `int8_t` + // run_benchmark(num_elements, -128, 127); + // run_benchmark(num_elements, -128, 127, 128); + // run_benchmark(num_elements, -128, 127); + // run_benchmark(num_elements, -128, 127, 100); } From 6bdc0ae54ee6be1c48aebb162c9b56a7fd580038 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 15 May 2025 12:07:42 -0700 Subject: [PATCH 63/65] Finalize standard distances --- tests/svs/core/distances/cosine.cpp | 90 +++++++++---------- .../svs/core/distances/distance_euclidean.cpp | 41 ++++++--- tests/svs/core/distances/inner_product.cpp | 41 ++++++--- 3 files changed, 102 insertions(+), 70 deletions(-) diff --git a/tests/svs/core/distances/cosine.cpp b/tests/svs/core/distances/cosine.cpp index a2fad3ac..145f59fd 100644 --- a/tests/svs/core/distances/cosine.cpp +++ b/tests/svs/core/distances/cosine.cpp @@ -86,14 +86,12 @@ void test_types(T lo, T hi, size_t num_tests) { auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); auto supported_archs = arch_env.get_supported_microarchs(); + // Check the distance computation for all supported architectures. for (auto arch : supported_archs) { arch_env.set_microarch(arch); auto dist = svs::distance::DistanceCosineSimilarity{}; dist.fix_argument(as_span(a)); - auto result = svs::distance::compute( - dist, as_span(a), as_span(b) - - ); + auto result = svs::distance::compute(dist, as_span(a), as_span(b)); CATCH_REQUIRE(result == expected); } @@ -235,46 +233,46 @@ CATCH_TEST_CASE( "Benchmark CosineSimilarity Distance", "[distance][cosinesimilarity_distance][benchmark_suite][!benchmark]" ) { - // auto num_elements = 1000000; - // // Types: `float` and `float` - // run_benchmark(num_elements, -1.0f, 1.0f); - // run_benchmark(num_elements, -1.0f, 1.0f, 128); - // run_benchmark(num_elements, -1.0f, 1.0f); - // run_benchmark(num_elements, -1.0f, 1.0f, 100); - - // // Types: `float` and `svs::Float16` - // run_benchmark(num_elements, -1.0f, 1.0f); - // run_benchmark(num_elements, -1.0f, 1.0f, 128); - // run_benchmark(num_elements, -1.0f, 1.0f); - // run_benchmark(num_elements, -1.0f, 1.0f, 100); - - // // Types: `svs::Float16` and `svs::Float16` - // run_benchmark(num_elements, -1.0f, 1.0f); - // run_benchmark(num_elements, -1.0f, 1.0f, 128); - // run_benchmark(num_elements, -1.0f, 1.0f); - // run_benchmark(num_elements, -1.0f, 1.0f, 100); - - // // Types: `float` and `int8_t` - // run_benchmark(num_elements, -128, 127); - // run_benchmark(num_elements, -128, 127, 128); - // run_benchmark(num_elements, -128, 127); - // run_benchmark(num_elements, -128, 127, 100); - - // // Types: `float` and `uint8_t` - // run_benchmark(num_elements, 0, 255); - // run_benchmark(num_elements, 0, 255, 128); - // run_benchmark(num_elements, 0, 255); - // run_benchmark(num_elements, 0, 255, 100); - - // // Types: `uint8_t` and `uint8_t` - // run_benchmark(num_elements, 0, 255); - // run_benchmark(num_elements, 0, 255, 128); - // run_benchmark(num_elements, 0, 255); - // run_benchmark(num_elements, 0, 255, 100); - - // // Types: `int8_t` and `int8_t` - // run_benchmark(num_elements, -128, 127); - // run_benchmark(num_elements, -128, 127, 128); - // run_benchmark(num_elements, -128, 127); - // run_benchmark(num_elements, -128, 127, 100); + auto num_elements = 1000000; + // Types: `float` and `float` + run_benchmark(num_elements, -1.0f, 1.0f); + run_benchmark(num_elements, -1.0f, 1.0f, 128); + run_benchmark(num_elements, -1.0f, 1.0f); + run_benchmark(num_elements, -1.0f, 1.0f, 100); + + // Types: `float` and `svs::Float16` + run_benchmark(num_elements, -1.0f, 1.0f); + run_benchmark(num_elements, -1.0f, 1.0f, 128); + run_benchmark(num_elements, -1.0f, 1.0f); + run_benchmark(num_elements, -1.0f, 1.0f, 100); + + // Types: `svs::Float16` and `svs::Float16` + run_benchmark(num_elements, -1.0f, 1.0f); + run_benchmark(num_elements, -1.0f, 1.0f, 128); + run_benchmark(num_elements, -1.0f, 1.0f); + run_benchmark(num_elements, -1.0f, 1.0f, 100); + + // Types: `float` and `int8_t` + run_benchmark(num_elements, -128, 127); + run_benchmark(num_elements, -128, 127, 128); + run_benchmark(num_elements, -128, 127); + run_benchmark(num_elements, -128, 127, 100); + + // Types: `float` and `uint8_t` + run_benchmark(num_elements, 0, 255); + run_benchmark(num_elements, 0, 255, 128); + run_benchmark(num_elements, 0, 255); + run_benchmark(num_elements, 0, 255, 100); + + // Types: `uint8_t` and `uint8_t` + run_benchmark(num_elements, 0, 255); + run_benchmark(num_elements, 0, 255, 128); + run_benchmark(num_elements, 0, 255); + run_benchmark(num_elements, 0, 255, 100); + + // Types: `int8_t` and `int8_t` + run_benchmark(num_elements, -128, 127); + run_benchmark(num_elements, -128, 127, 128); + run_benchmark(num_elements, -128, 127); + run_benchmark(num_elements, -128, 127, 100); } diff --git a/tests/svs/core/distances/distance_euclidean.cpp b/tests/svs/core/distances/distance_euclidean.cpp index 1e375b43..1a20a3de 100644 --- a/tests/svs/core/distances/distance_euclidean.cpp +++ b/tests/svs/core/distances/distance_euclidean.cpp @@ -67,18 +67,35 @@ void test_types(T lo, T hi, size_t num_tests) { svs_test::populate(b, generator_b, N); auto expected = Catch::Approx(euclidean_reference(a, b)); - // Statically Sized Computation - CATCH_REQUIRE( - (svs::distance::L2::compute( - a.data(), b.data() - ) == expected) - ); - // Dynamically Sized Computation - CATCH_REQUIRE( - (svs::distance::L2::compute( - a.data(), b.data(), N - ) == expected) - ); + auto as_span = [](auto v) { return std::span{v.data(), v.size()}; }; + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + auto supported_archs = arch_env.get_supported_microarchs(); + + // Check the distance computation for all supported architectures. + for (auto arch : supported_archs) { + arch_env.set_microarch(arch); + auto dist = svs::distance::DistanceL2{}; + auto result = svs::distance::compute(dist, as_span(a), as_span(b)); + CATCH_REQUIRE(result == expected); + } + + // Checking statically and dynamically sized computation requires a direct + // call to the compute function. We pick any MicroArch, since all available + // are already tested above. + { + // Statically Sized Computation + auto dist = svs::distance::L2::compute( + a.data(), b.data() + ); + CATCH_REQUIRE(dist == expected); + } + { + // Dynamically Sized Computation + auto dist = svs::distance::L2::compute( + a.data(), b.data(), N + ); + CATCH_REQUIRE(dist == expected); + } } } } // namespace diff --git a/tests/svs/core/distances/inner_product.cpp b/tests/svs/core/distances/inner_product.cpp index b5f0462e..a754e081 100644 --- a/tests/svs/core/distances/inner_product.cpp +++ b/tests/svs/core/distances/inner_product.cpp @@ -75,18 +75,35 @@ void test_types(T lo, T hi, size_t num_tests) { .epsilon(INNERPRODUCT_EPSILON) .margin(INNERPRODUCT_MARGIN); - // Statically Sized Computation - CATCH_REQUIRE( - (svs::distance::IP::compute( - a.data(), b.data() - ) == expected) - ); - // Dynamically Sized Computation - CATCH_REQUIRE( - (svs::distance::IP::compute( - a.data(), b.data(), N - ) == expected) - ); + auto as_span = [](auto v) { return std::span{v.data(), v.size()}; }; + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + auto supported_archs = arch_env.get_supported_microarchs(); + + // Check the distance computation for all supported architectures. + for (auto arch : supported_archs) { + arch_env.set_microarch(arch); + auto dist = svs::distance::DistanceIP{}; + auto result = svs::distance::compute(dist, as_span(a), as_span(b)); + CATCH_REQUIRE(result == expected); + } + + // Checking statically and dynamically sized computation requires a direct + // call to the compute function. We pick any MicroArch, since all available + // are already tested above. + { + // Statically Sized Computation + auto dist = svs::distance::IP::compute( + a.data(), b.data() + ); + CATCH_REQUIRE(dist == expected); + } + { + // Dynamically Sized Computation + auto dist = svs::distance::IP::compute( + a.data(), b.data(), N + ); + CATCH_REQUIRE(dist == expected); + } } } } // anonymous namespace From f039955e97bd9bc598a8b8dc49004b78fbd72a0d Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Thu, 15 May 2025 12:30:57 -0700 Subject: [PATCH 64/65] Add uarch tests to scalar.cpp --- tests/svs/quantization/scalar/scalar.cpp | 27 +++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/svs/quantization/scalar/scalar.cpp b/tests/svs/quantization/scalar/scalar.cpp index 6893bb53..a81052d4 100644 --- a/tests/svs/quantization/scalar/scalar.cpp +++ b/tests/svs/quantization/scalar/scalar.cpp @@ -141,16 +141,23 @@ template void test_distance_single(T l // A buffer into which we decompress the int8 values std::vector rhs(N); - for (size_t i = 0; i < num_tests; ++i) { - auto datum = compressed.get_datum(i); - std::transform(datum.begin(), datum.end(), rhs.begin(), decompress); - auto rhs_span = std::span(rhs); - float reference = svs::distance::compute(distance, query.get_datum(0), rhs_span); - auto expected = Catch::Approx(reference).epsilon(0.01).margin(0.01); - - // Calculate compressed distance and compare with reference - float result = compressed_distance.compute(compressed.get_datum(i)); - CATCH_REQUIRE(result == expected); + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + auto supported_archs = arch_env.get_supported_microarchs(); + + for (auto arch : supported_archs) { + for (size_t i = 0; i < num_tests; ++i) { + arch_env.set_microarch(arch); + auto datum = compressed.get_datum(i); + std::transform(datum.begin(), datum.end(), rhs.begin(), decompress); + auto rhs_span = std::span(rhs); + float reference = + svs::distance::compute(distance, query.get_datum(0), rhs_span); + auto expected = Catch::Approx(reference).epsilon(0.01).margin(0.01); + + // Calculate compressed distance and compare with reference + float result = compressed_distance.compute(compressed.get_datum(i)); + CATCH_REQUIRE(result == expected); + } } } From 36eb21cf21d422592eda854f1e1db3ec23138fe4 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Fri, 16 May 2025 02:55:52 -0700 Subject: [PATCH 65/65] Minor corrections for uarch-specific assertions --- tests/svs/core/distances/cosine.cpp | 9 +++++---- tests/svs/core/distances/distance_euclidean.cpp | 7 ++++--- tests/svs/core/distances/inner_product.cpp | 7 ++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/svs/core/distances/cosine.cpp b/tests/svs/core/distances/cosine.cpp index 145f59fd..a1855f59 100644 --- a/tests/svs/core/distances/cosine.cpp +++ b/tests/svs/core/distances/cosine.cpp @@ -89,10 +89,11 @@ void test_types(T lo, T hi, size_t num_tests) { // Check the distance computation for all supported architectures. for (auto arch : supported_archs) { arch_env.set_microarch(arch); - auto dist = svs::distance::DistanceCosineSimilarity{}; - dist.fix_argument(as_span(a)); - auto result = svs::distance::compute(dist, as_span(a), as_span(b)); - CATCH_REQUIRE(result == expected); + auto dist_type = svs::distance::DistanceCosineSimilarity{}; + dist_type.fix_argument(as_span(a)); + auto dist = svs::distance::compute(dist_type, as_span(a), as_span(b)); + CATCH_INFO("Testing architecture: " << svs::arch::microarch_to_string(arch)); + CATCH_REQUIRE(dist == expected); } // Checking statically and dynamically sized computation requires a direct diff --git a/tests/svs/core/distances/distance_euclidean.cpp b/tests/svs/core/distances/distance_euclidean.cpp index 1a20a3de..e6227a55 100644 --- a/tests/svs/core/distances/distance_euclidean.cpp +++ b/tests/svs/core/distances/distance_euclidean.cpp @@ -74,9 +74,10 @@ void test_types(T lo, T hi, size_t num_tests) { // Check the distance computation for all supported architectures. for (auto arch : supported_archs) { arch_env.set_microarch(arch); - auto dist = svs::distance::DistanceL2{}; - auto result = svs::distance::compute(dist, as_span(a), as_span(b)); - CATCH_REQUIRE(result == expected); + auto dist_type = svs::distance::DistanceL2{}; + auto dist = svs::distance::compute(dist_type, as_span(a), as_span(b)); + CATCH_INFO("Testing architecture: " << svs::arch::microarch_to_string(arch)); + CATCH_REQUIRE(dist == expected); } // Checking statically and dynamically sized computation requires a direct diff --git a/tests/svs/core/distances/inner_product.cpp b/tests/svs/core/distances/inner_product.cpp index a754e081..d27cfafd 100644 --- a/tests/svs/core/distances/inner_product.cpp +++ b/tests/svs/core/distances/inner_product.cpp @@ -82,9 +82,10 @@ void test_types(T lo, T hi, size_t num_tests) { // Check the distance computation for all supported architectures. for (auto arch : supported_archs) { arch_env.set_microarch(arch); - auto dist = svs::distance::DistanceIP{}; - auto result = svs::distance::compute(dist, as_span(a), as_span(b)); - CATCH_REQUIRE(result == expected); + auto dist_type = svs::distance::DistanceIP{}; + auto dist = svs::distance::compute(dist_type, as_span(a), as_span(b)); + CATCH_INFO("Testing architecture: " << svs::arch::microarch_to_string(arch)); + CATCH_REQUIRE(dist == expected); } // Checking statically and dynamically sized computation requires a direct