diff --git a/.github/scripts/get_cpu_info.sh b/.github/scripts/get_cpu_info.sh new file mode 100755 index 00000000..82c46b84 --- /dev/null +++ b/.github/scripts/get_cpu_info.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Script to get CPU information using platform-agnostic python packages + +# Install python packages if not present in the environment +if ! python -m pip show archspec > /dev/null 2>&1; then + python -m pip install archspec +fi + +if ! python -m pip show py-cpuinfo > /dev/null 2>&1; then + python -m pip install py-cpuinfo +fi + +# Print host microarchitecture +python -c "import archspec.cpu; \ + print('Host Microarchitecture[archspec]:', archspec.cpu.host().name)" + +# Print full CPU information +python -c "import pprint, cpuinfo; \ + print('CPU info[py-cpuinfo]:'); \ + pprint.pprint(cpuinfo.get_cpu_info(), indent=4, compact=True)" diff --git a/.github/scripts/install_sde.sh b/.github/scripts/install_sde.sh new file mode 100755 index 00000000..25310c5a --- /dev/null +++ b/.github/scripts/install_sde.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +wget --content-disposition "https://downloadmirror.intel.com/850782/sde-external-9.53.0-2025-03-16-lin.tar.xz" +tar -xf sde-external-*-lin.tar.xz +cd sde-external-*/ +echo "$PWD" >> $GITHUB_PATH diff --git a/.github/workflows/build-linux-arm.yml b/.github/workflows/build-linux-arm.yml index 07811f00..7ea8c425 100644 --- a/.github/workflows/build-linux-arm.yml +++ b/.github/workflows/build-linux-arm.yml @@ -47,6 +47,10 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Get CPU info + run: | + bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh + - name: Configure build working-directory: ${{ runner.temp }} env: @@ -70,3 +74,17 @@ jobs: CTEST_OUTPUT_ON_FAILURE: 1 working-directory: ${{ runner.temp }}/build/tests run: ctest -C ${{ matrix.build_type }} + + - name: Build Python Bindings + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} + run: | + cd bindings/python + python -m pip install . + + - name: Run Python Microarch Test + run: | + cd bindings/python + python -c "import svs; svs.microarch.describe()" + python -m unittest discover -p "test_microarch.py" -s . diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index a03bdd78..23d9a90a 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -56,6 +56,12 @@ jobs: source /opt/intel/oneapi/setvars.sh printenv >> $GITHUB_ENV + - name: Install Intel(R) SDE + run: source ${GITHUB_WORKSPACE}/.github/scripts/install_sde.sh + + - name: Get CPU info + run: bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh + - name: Configure build working-directory: ${{ runner.temp }} env: @@ -86,3 +92,21 @@ jobs: CTEST_OUTPUT_ON_FAILURE: 1 working-directory: ${{ runner.temp }}/build/examples/cpp run: ctest -C RelWithDebugInfo + + - name: Build Python Bindings + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} + run: | + cd bindings/python + python -m pip install . + + - name: Run Python Microarch Test with SDE + run: | + cd bindings/python + for flag in nhm hsw skx clx icl; do + echo "SDE emulation: $flag" + export SDE_FLAG=$flag + sde64 -$flag -- python -c "import svs; svs.microarch.describe()" + sde64 -$flag -- python -m unittest discover -p "test_microarch.py" -s . + done diff --git a/.github/workflows/build-macos.yaml b/.github/workflows/build-macos.yaml index a382d525..9069fda1 100644 --- a/.github/workflows/build-macos.yaml +++ b/.github/workflows/build-macos.yaml @@ -46,6 +46,10 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Get CPU info + run: | + bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh + - name: Install Compiler run: | echo "Installing ${{ matrix.package }}..." @@ -83,3 +87,28 @@ jobs: CTEST_OUTPUT_ON_FAILURE: 1 working-directory: ${{ runner.temp }}/build/tests run: ctest -C ${{ matrix.build_type }} + + - name: Build Python Bindings + env: + CXX: ${{ matrix.cxx }} + CC: ${{ matrix.cc }} + run: | + if [[ "${{ matrix.needs_prefix }}" == "true" ]]; then + # For non-default packages like llvm@15, get the install prefix + COMPILER_PREFIX=$(brew --prefix ${{ matrix.package }}) + export CC="${COMPILER_PREFIX}/bin/${{ matrix.cc_name }}" + export CXX="${COMPILER_PREFIX}/bin/${{ matrix.cxx_name }}" + else + # For versioned GCC installs, the name is usually directly available + export CC="${{ matrix.cc_name }}" + export CXX="${{ matrix.cxx_name }}" + fi + + cd bindings/python + python -m pip install . + + - name: Run Python Microarch Test + run: | + cd bindings/python + python -c "import svs; svs.microarch.describe()" + python -m unittest discover -p "test_microarch.py" -s . diff --git a/.github/workflows/cibuildwheel.yml b/.github/workflows/cibuildwheel.yml index 87198b98..8ca5dc92 100644 --- a/.github/workflows/cibuildwheel.yml +++ b/.github/workflows/cibuildwheel.yml @@ -43,6 +43,10 @@ jobs: - name: Install cibuildwheel run: python -m pip install cibuildwheel + - name: Get CPU info + run: | + bash ${GITHUB_WORKSPACE}/.github/scripts/get_cpu_info.sh + # Install inside the temporary working directory. - name: Build Wheel env: diff --git a/.licenserc.yaml b/.licenserc.yaml index 815de7ee..48f920b2 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -45,6 +45,9 @@ header: - 'THIRD-PARTY-PROGRAMS' - '.github/renovate.json' - 'cmake/mkl_functions' + - 'cmake/microarch_targets_aarch64' + - 'cmake/microarch_targets_aarch64_darwin' + - 'cmake/microarch_targets_x86_64' - 'cmake/patches/tomlplusplus_v330.patch' - 'docker/x86_64/manylinux2014/oneAPI.repo' - 'docs/cpp/index/loader-compatibility.csv' diff --git a/CMakeLists.txt b/CMakeLists.txt index 314a6b33..985f6c39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ target_compile_options( include("cmake/options.cmake") +include("cmake/microarch.cmake") include("cmake/clang-tidy.cmake") include("cmake/eve.cmake") include("cmake/pthread.cmake") @@ -80,6 +81,8 @@ include("cmake/toml.cmake") ##### Build Objects ##### +create_microarch_instantiations() + if(SVS_BUILD_BINARIES) add_subdirectory(utils) endif() @@ -112,7 +115,7 @@ set(LIB_CONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/svs") # Install headers and target information. install( - TARGETS svs_devel svs_compile_options svs_native_options + TARGETS svs_devel svs_compile_options svs_microarch_options_base EXPORT svs-targets INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 5c042c29..840c9c36 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -48,7 +48,7 @@ set(SHARED_LIBRARY_FILES src/inverted/memory/executables/memory_test.cpp ) -add_library(svs_benchmark_library SHARED ${SHARED_LIBRARY_FILES}) +add_library(svs_benchmark_library SHARED ${SHARED_LIBRARY_FILES} ${MICROARCH_OBJECT_FILES}) target_include_directories(svs_benchmark_library PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include) # Minimal @@ -104,7 +104,7 @@ target_link_libraries( PUBLIC ${SVS_LIB} svs_compile_options - svs_native_options + svs_microarch_options_base fmt::fmt ) diff --git a/bindings/python/CMakeLists.txt b/bindings/python/CMakeLists.txt index 495eec2c..6bc333c6 100644 --- a/bindings/python/CMakeLists.txt +++ b/bindings/python/CMakeLists.txt @@ -24,68 +24,9 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(pybind11) -# Try to find the Python executable. -# -# If it's given as part of the Cmake arguments given by "scikit build", then use that. -# Otherwise, fall back to using plain old "python". -# If *THAT* doesn't work, give up. -if(DEFINED PYTHON_EXECUTABLE) - set(SVS_PYTHON_EXECUTABLE "${PYTHON_EXECUTABLE}") -else() - set(SVS_PYTHON_EXECUTABLE "python") -endif() - -# The micro architectures to compile for. -if(NOT DEFINED SVS_MICROARCHS) - set(SVS_MICROARCHS native) -endif() - # Include the SVS library directly. add_subdirectory("../.." "${CMAKE_CURRENT_BINARY_DIR}/svs") -# Run the python script to get optimization flags for the desired back-ends. -# -# FLAGS_SCRIPT - Path to the Python script that will take the compiler, compiler version, -# and list of desired microarchitectures and generate optimization flags for each -# microarchitecture. -# -# FLAGS_TEXT_FILE - List of optimization flags for each architecture. -# Expected format: -# -march=arch1,-mtune=arch1 -# -march=arch2,-mtune=arch2 -# ... -# -march=archN,-mtune=archN -# -# The number of lines should be equal to the number of microarchitectures. -# NOTE: The entries within each line are separated by a comma on purpose to allow CMake -# to read the whole file as a List and then use string replacement on the commas to turn -# each line into a list in its own right. -# -# TEMP_JSON - JSON Manifest file describing the generated binaries. This is meant to be -# included in the Python package to allow the Python code to reason about the packaged -# libraries and select the correct one for loading. -# -set(FLAGS_SCRIPT "${CMAKE_CURRENT_LIST_DIR}/microarch.py") -set(FLAGS_TEXT_FILE "${CMAKE_CURRENT_BINARY_DIR}/optimization_flags.txt") -set(FLAGS_MANIFEST_JSON "${CMAKE_CURRENT_BINARY_DIR}/flags_manifest.json") - -execute_process( - COMMAND - ${SVS_PYTHON_EXECUTABLE} - ${FLAGS_SCRIPT} - ${FLAGS_TEXT_FILE} - ${FLAGS_MANIFEST_JSON} - --compiler ${CMAKE_CXX_COMPILER_ID} - --compiler-version ${CMAKE_CXX_COMPILER_VERSION} - --microarchitectures ${SVS_MICROARCHS} - COMMAND_ERROR_IS_FATAL ANY -) - -file(STRINGS "${FLAGS_TEXT_FILE}" OPTIMIZATION_FLAGS) -message("Flags: ${OPTIMIZATION_FLAGS}") -list(LENGTH OPTIMIZATION_FLAGS OPT_FLAGS_LENGTH) -message("Length of flags: ${OPT_FLAGS_LENGTH}") - # C++ files makind up the python bindings. set(CPP_FILES src/allocator.cpp @@ -98,51 +39,33 @@ set(CPP_FILES src/svs_mkl.cpp ) -# Generate a shared library for each target microarchitecture. -foreach(MICRO OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) - set(LIB_NAME "_svs_${MICRO}") +set(LIB_NAME "_svs") +pybind11_add_module(${LIB_NAME} MODULE ${CPP_FILES} ${MICROARCH_OBJECT_FILES}) +target_link_libraries(${LIB_NAME} PRIVATE pybind11::module) +target_link_libraries(${LIB_NAME} PUBLIC svs::svs) +# Dependency "fmt::fmt" obtained from "svs" +target_link_libraries(${LIB_NAME} PRIVATE svs::compile_options fmt::fmt svs::microarch_options_base) +target_include_directories( + ${LIB_NAME} + PUBLIC $ +) - pybind11_add_module(${LIB_NAME} MODULE ${CPP_FILES}) - target_link_libraries(${LIB_NAME} PUBLIC svs::svs) - # Dependency "fmt::fmt" obtained from "svs" - target_link_libraries(${LIB_NAME} PRIVATE svs::compile_options fmt::fmt) +if(DEFINED SKBUILD) + install(TARGETS ${LIB_NAME} DESTINATION .) - string(REPLACE "," ";" OPT_FLAGS ${OPT_FLAGS}) - message("OPT Flags: ${OPT_FLAGS}") - target_compile_options(${LIB_NAME} PRIVATE ${OPT_FLAGS}) + # The extension module may need to load build or included libraries when loaded. - # Header files. - target_include_directories( + # Placing build depedencies in the package and using relative RPATHs that + # don't point outside of the package means that the built package is + # relocatable. This allows for safe binary redistribution. + set_target_properties( ${LIB_NAME} - PUBLIC $ + PROPERTIES + INSTALL_RPATH "$ORIGIN/${CMAKE_INSTALL_LIBDIR}" ) - - # Comunicate to the C++ library the desired name of the library - target_compile_options(${LIB_NAME} PRIVATE "-DSVS_MODULE_NAME=${LIB_NAME}") - - # If scikit build is running the compilation process, - if(DEFINED SKBUILD) - install(TARGETS ${LIB_NAME} DESTINATION .) - - # The extension module may need to load build or included libraries when loaded. - - # Placing build depedencies in the package and using relative RPATHs that - # don't point outside of the package means that the built package is - # relocatable. This allows for safe binary redistribution. - set_target_properties( - ${LIB_NAME} - PROPERTIES - INSTALL_RPATH "$ORIGIN/${CMAKE_INSTALL_LIBDIR}" - ) - endif() -endforeach() +endif() if(DEFINED SKBUILD) - # Install the manifest JSON file. - # This is kind of a hack to avoid the needing to explicitly move JSON file into the - # source folder of the python library. - install(FILES ${FLAGS_MANIFEST_JSON} DESTINATION .) - # Install header files. install( DIRECTORY "${CMAKE_CURRENT_LIST_DIR}/include/svs" diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 5d310749..7e8aa254 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -13,7 +13,6 @@ # limitations under the License. from skbuild import setup -import archspec.cpu as cpu import os # If building in a cibuildwheel context, compile multiple versions of the library for @@ -25,27 +24,6 @@ "-DCMAKE_EXPORT_COMPILE_COMMANDS=YES", ] -# Utility to convert micro-architecture strings to -def target(arch): - return cpu.TARGETS[arch] - -# N.B.: cibuildwheel must configure the multi-arch environment variable. -# Also, the micro-architectures defined below should be in order of preference. -if os.environ.get("SVS_MULTIARCH", None) is not None: - svs_microarchs = [ - "cascadelake", - "x86_64_v3", # conservative base CPU for x86 CPUs. - ] - - # Add the current host to the list of micro-architecture if it doesn't already exist. - last_target = target(svs_microarchs[-1]) - host_name = cpu.host().name - if host_name not in svs_microarchs and target(host_name) < last_target: - svs_microarchs.append(host_name) - - cmake_array = ";".join(svs_microarchs) - cmake_args.append(f"-DSVS_MICROARCHS={cmake_array}") - # Determine the root of the repository base_dir = os.path.relpath(os.path.join(os.path.dirname(__file__), '..', '..')) diff --git a/bindings/python/src/python_bindings.cpp b/bindings/python/src/python_bindings.cpp index e1ac92b6..cf155362 100644 --- a/bindings/python/src/python_bindings.cpp +++ b/bindings/python/src/python_bindings.cpp @@ -26,6 +26,7 @@ // SVS dependencies #include "svs/core/distance.h" #include "svs/core/io.h" +#include "svs/lib/arch.h" #include "svs/lib/array.h" #include "svs/lib/datatype.h" #include "svs/lib/float16.h" @@ -42,17 +43,9 @@ // stl #include +#include #include -// Get the expected name of the library -// Make sure CMake stays up to date with defining this parameter. -// -// The variable allows us to customize the name of the python module to support -// micro-architecture versioning. -#if !defined(SVS_MODULE_NAME) -#define SVS_MODULE_NAME _svs_native -#endif - namespace py = pybind11; namespace { @@ -144,7 +137,7 @@ class ScopedModuleNameOverride { } // namespace -PYBIND11_MODULE(SVS_MODULE_NAME, m) { +PYBIND11_MODULE(_svs, m) { // Internall, the top level `__init__.py` imports everything from the C++ module named // `_svs`. // @@ -196,6 +189,87 @@ Convert the `fvecs` file on disk with 32-bit floating point entries to a `fvecs` wrap_conversion(m); + m.def("_print_cpu_extensions_status", []() { + svs::arch::write_extensions_status(std::cout); + }); + + // Wrapper for svs::arch::MicroArchEnvironment + py::class_( + m, "microarch", "Microarchitecture management singleton" + ) + .def_static( + "get", + []() -> svs::arch::MicroArchEnvironment& { + return svs::arch::MicroArchEnvironment::get_instance(); + }, + py::return_value_policy::reference + ) + .def_property_static( + "current", + [](py::object) { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + return svs::arch::microarch_to_string(env.get_microarch()); + }, + [](py::object, const std::string& arch_name) { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + auto arch = svs::arch::string_to_microarch(arch_name); + env.set_microarch(arch); + }, + "Gets or sets the current microarchitecture." + ) + .def_property_readonly_static( + "supported", + [](py::object) { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + std::vector result; + for (const auto& arch : env.get_supported_microarchs()) { + result.push_back(svs::arch::microarch_to_string(arch)); + } + return result; + }, + "Returns a list of supported microarchitectures." + ) + .def_property_readonly_static( + "compiled", + [](py::object) { + auto& env = svs::arch::MicroArchEnvironment::get_instance(); + std::vector result; + for (const auto& arch : env.get_compiled_microarchs()) { + result.push_back(svs::arch::microarch_to_string(arch)); + } + return result; + }, + "Returns a list of compiled microarchitectures." + ) + .def_static("describe", []() { + std::ostream& out = std::cout; + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + + // Print support status for all ISA extensions + svs::arch::write_extensions_status(out); + + // Print current microarchitecture + auto current_arch = arch_env.get_microarch(); + out << "\nCurrent µarch: " << svs::arch::microarch_to_string(current_arch) + << std::endl; + + // Print all supported microarchitectures + const auto& supported_archs = arch_env.get_supported_microarchs(); + out << "\nSupported µarchs: "; + for (const auto& arch : supported_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; + + // Print all compiled microarchitectures + const auto& compiled_archs = arch_env.get_compiled_microarchs(); + out << "\nCompiled µarchs: "; + for (const auto& arch : compiled_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; + }); + // Allocators svs::python::allocators::wrap(m); diff --git a/bindings/python/src/svs/__init__.py b/bindings/python/src/svs/__init__.py index dd9948e7..6379826b 100644 --- a/bindings/python/src/svs/__init__.py +++ b/bindings/python/src/svs/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. # Dynamic loading logic. -from .loader import library, current_backend, available_backends +from .loader import library # Reexport all public functions and structs from the inner module. lib = library() diff --git a/bindings/python/src/svs/loader.py b/bindings/python/src/svs/loader.py index 1390cf79..06d057c6 100644 --- a/bindings/python/src/svs/loader.py +++ b/bindings/python/src/svs/loader.py @@ -12,163 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -# dep pre-coms -import archspec.cpu as cpu - -# standard library -import json import importlib -import os -from pathlib import Path - -# Get environment variables for configuring warnings and overriding backend selection. -def _is_quiet(): - """ - Return whether or not backend loading should be "quiet". - In this context, "quiet" means not warning for older architectures. - """ - return os.environ.get("SVS_QUIET", False) - -def _override_backend(): - """ - Return a manual override for the backend. - If no override is set, return `None`. - """ - return os.environ.get("SVS_OVERRIDE_BACKEND", None) - - -# The name of the manifest file. -FLAGS_MANIFEST = "flags_manifest.json" # Keep in-sync with CMakeLists.txt - -def _library_from_suffix(suffix): - return f"._svs_{suffix}" - -def _message_prehook(spec, host = cpu.host()): - """ - Emit any special messages for the given microarchitecture spec. - """ - if _is_quiet(): - return - - if isinstance(spec, str): - spec = cpu.TARGETS[spec] - - import warnings - if spec <= cpu.TARGETS["skylake_avx512"]: - message = f""" - Loading library for an older CPU architecture ({spec}). - Performance may be degraded. - """ - warnings.warn(message, RuntimeWarning) - - if host < spec: - message = """ - Override backend is target for a newer CPU than the one you're currently using. - Application may crash. - """ - warnings.warn(message, RuntimeWarning) - - -# The backend being used for this session -__CURRENT_BACKEND__ = None -def current_backend(): - """ - Return the name of the current backend. - """ - return __CURRENT_BACKEND__ - -def __set_backend_once__(suffix: str, spec): - global __CURRENT_BACKEND__ - if __CURRENT_BACKEND__ == None: - _message_prehook(spec) - __CURRENT_BACKEND__ = str(suffix) - - return current_backend() -# The dynamically loaded module. -__LIBRARY__ = None - -def _load_manifest(): - """ - Determine which shared library to load to supply the C++ extentions. - """ - json_file = Path(__file__).parent / FLAGS_MANIFEST - json_file_alternate = Path(__file__).parent.parent / FLAGS_MANIFEST - - # Try to give a somewhat helpful error message if the JSON manifest file was not - # generated properly by Scikit-build/CMake - if json_file.exists(): - with open(json_file, "r") as io: - return json.load(io) - elif json_file_alternate.exists(): - with open(json_file_alternate, "r") as io: - return json.load(io) - else: - print(Path(str(json_file).replace("ai.similarity-search.gss/", ""))) - raise RuntimeError(f""" - Expected a file {FLAGS_MANIFEST} to exist in the source directory to describe the - attributes of the libraries bundled with this application. - - No such file was found. - - Please report this to the project maintainer! - """) - -def available_backends(): - """ - Return a list of the available backends that where compiled when this module was built. - - Each backend in the list may be used to initialize ``SVS_OVERRIDE_BACKEND`` - environment variable prior to application start to override the default loading logic. - """ - return list(_load_manifest()["libraries"].keys()) - -def _find_library(): - """ - Find the appropriate library to load for this micro architecture. - """ - - # Get the current CPU and the manifest of compiled libraries that ship with this - # library. - host = cpu.host() - manifest = _load_manifest() - - # Respect override requests. - # Down stream loading will fail if the given option doesn't exist. - # - # However, if an override is explicitly given, then we can assume that the use knows - # what they're doing and can respond to a loading failure correctly. - override = _override_backend() - if override is not None: - spec = cpu.TARGETS[manifest["libraries"][override]] - return __set_backend_once__(override, spec) - - # Assume architectures in the manifest are place in order of preference. - # TODO: Revisit this assumption. - for (suffix, microarch) in manifest["libraries"].items(): - # Are we compatible with this micro architecture? - spec = cpu.TARGETS[microarch] - if spec <= host: - return __set_backend_once__(suffix, spec) - - raise RuntimeError(f""" - Could not find a suitable backend for your machine ({host}). - Please contact the project maintainers! - """) - -def __load_module_once__(): - global __LIBRARY__ - if __LIBRARY__ is None: - library_name = _library_from_suffix(_find_library()) - __LIBRARY__ = importlib.import_module(library_name, package = "svs") def library(): - """ - Return the library backend as a module. Dynamically loads the library when first called. - - Dynamically loading the library may trigger warnings related to correctness or - performance. If you really **really** don't want these warnings, they can be suppressed - by defining the environemtn variable ``SVS_QUIET=YES`` prior to application start. - """ - __load_module_once__() - return __LIBRARY__ + return importlib.import_module("._svs", package = "svs") diff --git a/bindings/python/tests/test_loader.py b/bindings/python/tests/test_loader.py deleted file mode 100644 index c9abb886..00000000 --- a/bindings/python/tests/test_loader.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Test the dynamic loading logic -import archspec.cpu as cpu -import unittest -import os -import warnings - -import svs.loader as loader - -def set_quiet(): - os.environ["SVS_QUIET"] = "YES" - -def clear_quiet(): - os.environ.pop("SVS_QUIET", None) - -def set_override(override: str): - os.environ["SVS_OVERRIDE_BACKEND"] = override - -def clear_override(): - os.environ.pop("SVS_OVERRIDE_BACKEND", None) - -class LoadingTester(unittest.TestCase): - def __unset_environment_variables__(self): - clear_quiet() - clear_override() - - def tearDown(self): - self.__unset_environment_variables__() - - def test_environment_variables(self): - # Clear the environment variables in question. - self.__unset_environment_variables__() - - # Make sure "is_quiet" behaves correctly. - self.assertFalse(loader._is_quiet()) - set_quiet() - self.assertTrue(loader._is_quiet()) - self.__unset_environment_variables__() - self.assertFalse(loader._is_quiet()) - - # Now, check that "override_backend" works. - self.assertEqual(loader._override_backend(), None) - set_override("hello") - self.assertEqual(loader._override_backend(), "hello") - set_override("north") - self.assertEqual(loader._override_backend(), "north") - clear_override() - self.assertEqual(loader._override_backend(), None) - self.__unset_environment_variables__() - - def test_suffix(self): - self.assertEqual(loader._library_from_suffix("native"), "._svs_native") - self.assertEqual(loader._library_from_suffix("cascadelake"), "._svs_cascadelake") - - def test_available_backends(self): - self.assertGreaterEqual(len(loader.available_backends()), 1) - - def test_manifest(self): - manifest = loader._load_manifest() - self.assertTrue("toolchain" in manifest) - self.assertTrue("libraries" in manifest) - - toolchain = manifest["toolchain"] - self.assertTrue("compiler" in toolchain) - self.assertTrue("compiler_version" in toolchain) - - libraries = manifest["libraries"] - self.assertGreaterEqual(len(libraries), 1) - - def test_message_prehook(self): - # Cause all warnings to always be triggered. - warnings.simplefilter("always") - - # Refer to - # https://docs.python.org/3/library/warnings.html#testing-warnings - # for how to test warnings. - - # Warning for the host being greater than the spec. - spec = cpu.TARGETS["icelake"] - host = cpu.TARGETS["skylake"] - with warnings.catch_warnings(record = True) as w: - loader._message_prehook(spec, host) - self.assertTrue(len(w) == 1) - self.assertTrue(issubclass(w[-1].category, RuntimeWarning)) - self.assertTrue("Override" in str(w[-1].message)) - - # Running again with "quiet" enabled should suppress the warning - set_quiet() - with warnings.catch_warnings(record = True) as w: - loader._message_prehook(spec, host) - self.assertTrue(len(w) == 0) - - # Warning for using an old architecture. - clear_quiet() - archs = ["haswell", "skylake", "skylake_avx512"] - for arch in archs: - with warnings.catch_warnings(record = True) as w: - loader._message_prehook(arch) - # Number of warnings can exceed 1 if running on an older CPU. - # In this latter case, we get a "newer CPU" warning as well. - self.assertTrue(len(w) >= 1) - self.assertTrue(issubclass(w[0].category, RuntimeWarning)) - self.assertTrue("older CPU" in str(w[0].message)) - - def test_loaded(self): - libraries = loader._load_manifest()["libraries"] - self.assertTrue(loader.current_backend() in libraries) - self.assertNotEqual(loader.library(), None) diff --git a/bindings/python/tests/test_microarch.py b/bindings/python/tests/test_microarch.py new file mode 100644 index 00000000..670ae779 --- /dev/null +++ b/bindings/python/tests/test_microarch.py @@ -0,0 +1,37 @@ +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import svs +import archspec.cpu as cpu +import os + + +class MicroarchTester(unittest.TestCase): + def test_microarch(self): + # Get emulated microarch from SDE_FLAG or use the host CPU + host_microarch = os.environ.get("SDE_FLAG", cpu.host().name) + mapping = { + "nhm": "nehalem", + "hsw": "haswell", + "skx": "skylake_avx512", + "clx": "cascadelake", + "icl": "icelake_client", + "icelake": "icelake_client", + "spr": "sapphirerapids", + } + host_microarch = mapping.get(host_microarch, host_microarch) + + if host_microarch in svs.microarch.compiled: + self.assertTrue(host_microarch == svs.microarch.current) diff --git a/cmake/microarch.cmake b/cmake/microarch.cmake new file mode 100644 index 00000000..71b1b848 --- /dev/null +++ b/cmake/microarch.cmake @@ -0,0 +1,128 @@ +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(svs_microarch_cmake_included) + return() +endif() +set(svs_microarch_cmake_included true) + +# N.B.: first microarch listed in targets file is treated as "base" microarch +# which is used to build base object files, shared libs and executables +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_x86_64" SVS_MICROARCHS) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" OR CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + if(APPLE) + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_aarch64_darwin" SVS_MICROARCHS) + else() + file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/microarch_targets_aarch64" SVS_MICROARCHS) + endif() +else() + message(FATAL_ERROR "Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") +endif() + + +# Try to find the Python executable. +# +# If it's given as part of the Cmake arguments given by "scikit build", then use that. +# Otherwise, fall back to using plain old "python". +# If *THAT* doesn't work, give up. +if(DEFINED PYTHON_EXECUTABLE) + set(SVS_PYTHON_EXECUTABLE "${PYTHON_EXECUTABLE}") +else() + set(SVS_PYTHON_EXECUTABLE "python") +endif() + +# Run the python script to get optimization flags for the desired back-ends. +# +# FLAGS_SCRIPT - Path to the Python script that will take the compiler, compiler version, +# and list of desired microarchitectures and generate optimization flags for each +# microarchitecture. +# +# FLAGS_TEXT_FILE - List of optimization flags for each architecture. +# Expected format: +# -march=arch1,-mtune=arch1 +# -march=arch2,-mtune=arch2 +# ... +# -march=archN,-mtune=archN +# +# The number of lines should be equal to the number of microarchitectures. +# NOTE: The entries within each line are separated by a comma on purpose to allow CMake +# to read the whole file as a List and then use string replacement on the commas to turn +# each line into a list in its own right. +# +set(FLAGS_SCRIPT "${CMAKE_CURRENT_LIST_DIR}/microarch.py") +set(FLAGS_TEXT_FILE "${CMAKE_CURRENT_BINARY_DIR}/optimization_flags.txt") + +execute_process( + COMMAND + ${SVS_PYTHON_EXECUTABLE} + ${FLAGS_SCRIPT} + ${FLAGS_TEXT_FILE} + --compiler ${CMAKE_CXX_COMPILER_ID} + --compiler-version ${CMAKE_CXX_COMPILER_VERSION} + --microarchitectures ${SVS_MICROARCHS} + COMMAND_ERROR_IS_FATAL ANY +) + +file(STRINGS "${FLAGS_TEXT_FILE}" OPTIMIZATION_FLAGS) +message("Flags: ${OPTIMIZATION_FLAGS}") +list(LENGTH OPTIMIZATION_FLAGS OPT_FLAGS_LENGTH) +message("Length of flags: ${OPT_FLAGS_LENGTH}") + +##### +##### Helper targets to support required microarchs and apply relevant compiler optimizations. +##### + +# Set up "base" target to include opt. flags for base microarch +# and flags to enable support of other microarchs in dispatcher +add_library(svs_microarch_options_base INTERFACE) +add_library(svs::microarch_options_base ALIAS svs_microarch_options_base) + +# Get opt. flags for base microarch +list(POP_FRONT SVS_MICROARCHS BASE_MICROARCH) +list(POP_FRONT OPTIMIZATION_FLAGS BASE_OPT_FLAGS) +string(REPLACE "," ";" BASE_OPT_FLAGS ${BASE_OPT_FLAGS}) +message("Opt.flags[base=${BASE_MICROARCH}]: ${BASE_OPT_FLAGS}") + +target_compile_options(svs_microarch_options_base INTERFACE ${BASE_OPT_FLAGS} -DSVS_MICROARCH_SUPPORT_${BASE_MICROARCH} -DSVS_TUNE_TARGET=${BASE_MICROARCH}) + +foreach(MICROARCH OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) + # Tell the microarch dispatcher to include this microarch branch + target_compile_options(svs_microarch_options_base INTERFACE -DSVS_MICROARCH_SUPPORT_${MICROARCH}) + + string(REPLACE "," ";" OPT_FLAGS ${OPT_FLAGS}) + message("Opt.flags[${MICROARCH}]: ${OPT_FLAGS}") + + # Create a new target for this microarch + add_library(svs_microarch_options_${MICROARCH} INTERFACE) + add_library(svs::microarch_options_${MICROARCH} ALIAS svs_microarch_options_${MICROARCH}) + target_compile_options(svs_microarch_options_${MICROARCH} INTERFACE ${OPT_FLAGS} -DSVS_TUNE_TARGET=${MICROARCH}) +endforeach() + +set(MICROARCH_CPP_FILES "${CMAKE_CURRENT_LIST_DIR}/microarch_instantiations.cpp") + +# function to create a set of object files with microarch instantiations +function(create_microarch_instantiations) + set(MICROARCH_OBJECT_FILES "") + foreach(MICROARCH OPT_FLAGS IN ZIP_LISTS SVS_MICROARCHS OPTIMIZATION_FLAGS) + set(OBJ_NAME "microarch_${MICROARCH}") + add_library(${OBJ_NAME} OBJECT ${MICROARCH_CPP_FILES}) + + target_link_libraries(${OBJ_NAME} PRIVATE ${SVS_LIB} svs::compile_options fmt::fmt svs_microarch_options_${MICROARCH}) + + list(APPEND MICROARCH_OBJECT_FILES $) + endforeach() + # Note: this specific way of setting the variable is required to make it available in all targeted scopes + set(MICROARCH_OBJECT_FILES "${MICROARCH_OBJECT_FILES}" CACHE INTERNAL "Microarchitecture-specific object files") +endfunction() diff --git a/bindings/python/microarch.py b/cmake/microarch.py similarity index 89% rename from bindings/python/microarch.py rename to cmake/microarch.py index 99a4ae36..6e6ce58c 100644 --- a/bindings/python/microarch.py +++ b/cmake/microarch.py @@ -17,21 +17,18 @@ # (1) A text file with compiler optimization flags for each microarchitecture formatted for # relatively easy consumption by CMake. # -# (2) A JSON manifest file describing the micreoarchitecture for each compiled library -# that the python library can use to select the correct shared library. -# import archspec import archspec.cpu as cpu import argparse import json + def build_parser(): parser = argparse.ArgumentParser() parser.add_argument( "cmake_flags_text_file", help = "file path to where CMake's text file will go." ) - parser.add_argument("python_output_json_file") parser.add_argument("--compiler", required = True) parser.add_argument("--compiler-version", required = True) parser.add_argument( @@ -48,6 +45,7 @@ def resolve_microarch(name: str): """ custom_aliases = { "native": cpu.host().name, + "icelake_client": "icelake", } # Allow the custom aliases to override the current name. # If an alias doesn't exist, juse pass the name straight through. @@ -85,6 +83,7 @@ def resolve_compiler(name: str): aliases = { "GNU": "gcc", "Clang": "clang", + "AppleClang": "clang", "IntelLLVM": "oneapi", } return aliases.get(name, name) @@ -96,7 +95,6 @@ def run(): # Extract elements from the parser architectures = args.microarchitectures output_text = args.cmake_flags_text_file - output_json = args.python_output_json_file compiler = resolve_compiler(args.compiler) compiler_version = args.compiler_version @@ -120,16 +118,10 @@ def run(): "toolchain": toolchain, "libraries": suffix_to_microarch, } - with open(output_json, "w") as file: - file.write(json.dumps(pre_json_dict, indent = 4)) # Safe flags to file dump_flags_for_cmake(optimization_flags, output_text) - # Print flags to stdout - for flags in optimization_flags: - print(flags) - ##### ##### Execute as script. ##### diff --git a/cmake/microarch_instantiations.cpp b/cmake/microarch_instantiations.cpp new file mode 100644 index 00000000..9f60fe32 --- /dev/null +++ b/cmake/microarch_instantiations.cpp @@ -0,0 +1,23 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "svs/core/distance/cosine.h" +#include "svs/core/distance/euclidean.h" +#include "svs/core/distance/inner_product.h" + +SVS_INSTANTIATE_COSINE_DISTANCE_BY_MICROARCH +SVS_INSTANTIATE_L2_DISTANCE_BY_MICROARCH +SVS_INSTANTIATE_IP_DISTANCE_BY_MICROARCH diff --git a/cmake/microarch_targets_aarch64 b/cmake/microarch_targets_aarch64 new file mode 100644 index 00000000..fe512ced --- /dev/null +++ b/cmake/microarch_targets_aarch64 @@ -0,0 +1,2 @@ +neoverse_v1 +neoverse_n2 diff --git a/cmake/microarch_targets_aarch64_darwin b/cmake/microarch_targets_aarch64_darwin new file mode 100644 index 00000000..b5692e52 --- /dev/null +++ b/cmake/microarch_targets_aarch64_darwin @@ -0,0 +1,2 @@ +m1 +m2 diff --git a/cmake/microarch_targets_x86_64 b/cmake/microarch_targets_x86_64 new file mode 100644 index 00000000..bac14630 --- /dev/null +++ b/cmake/microarch_targets_x86_64 @@ -0,0 +1,7 @@ +nehalem +haswell +x86_64_v4 +skylake_avx512 +cascadelake +icelake_client +sapphirerapids diff --git a/cmake/options.cmake b/cmake/options.cmake index e374a548..7bc7a7be 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -140,14 +140,6 @@ else() target_compile_options(${SVS_LIB} INTERFACE -DSVS_INITIALIZE_LOGGER=0) endif() -##### -##### Helper target to apply relevant compiler optimizations. -##### - -add_library(svs_native_options INTERFACE) -add_library(svs::native_options ALIAS svs_native_options) -target_compile_options(svs_native_options INTERFACE -march=native -mtune=native) - # Use an internal INTERFACE target to apply the same build options to both the # unit test and the compiled binaries. add_library(svs_compile_options INTERFACE) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index b9f1c98e..6040e156 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -24,10 +24,10 @@ endif() # # [1] A simple executable is one that takes no commandline arguments. function(create_simple_example exe test file) - add_executable(${exe} ${file}) + add_executable(${exe} ${file} ${MICROARCH_OBJECT_FILES}) target_include_directories(${exe} PRIVATE ${CMAKE_CURRENT_LIST_DIR}) # Link to our library - target_link_libraries(${exe} ${SVS_LIB} svs_compile_options svs_native_options) + target_link_libraries(${exe} ${SVS_LIB} svs_compile_options svs_microarch_options_base) # Create a test. # No-op if the `include(CTest)` line above is not executed. add_test(${test} ${exe}) @@ -37,6 +37,7 @@ endfunction() create_simple_example(saveload test_saveload saveload.cpp) create_simple_example(types test_types types.cpp) create_simple_example(vamana_iterator test_vamana_iterator vamana_iterator.cpp) +create_simple_example(microarch_info test_microarch_info microarch_info.cpp) ## More complicated examples involving more extensive setup. @@ -49,9 +50,9 @@ configure_file(../../data/test_dataset/queries_f32.fvecs . COPYONLY) configure_file(../../data/test_dataset/groundtruth_euclidean.ivecs . COPYONLY) # The vamana test executable. -add_executable(vamana vamana.cpp) +add_executable(vamana vamana.cpp ${MICROARCH_OBJECT_FILES}) target_include_directories(vamana PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_link_libraries(vamana ${SVS_LIB} svs_compile_options svs_native_options) +target_link_libraries(vamana ${SVS_LIB} svs_compile_options svs_microarch_options_base) add_test( NAME test_vamana COMMAND @@ -62,9 +63,9 @@ add_test( ) # The custom thread pool executable. -add_executable(custom_thread_pool custom_thread_pool.cpp) +add_executable(custom_thread_pool custom_thread_pool.cpp ${MICROARCH_OBJECT_FILES}) target_include_directories(custom_thread_pool PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_link_libraries(custom_thread_pool ${SVS_LIB} svs_compile_options svs_native_options) +target_link_libraries(custom_thread_pool ${SVS_LIB} svs_compile_options svs_microarch_options_base) add_test( NAME test_custom_thread_pool COMMAND @@ -79,9 +80,9 @@ add_test( ##### Dispatcher ##### -add_executable(dispatcher dispatcher.cpp) +add_executable(dispatcher dispatcher.cpp ${MICROARCH_OBJECT_FILES}) target_include_directories(dispatcher PRIVATE ${CMAKE_CURRENT_LIST_DIR}) -target_link_libraries(dispatcher ${SVS_LIB} svs_compile_options svs_native_options) +target_link_libraries(dispatcher ${SVS_LIB} svs_compile_options svs_microarch_options_base) # Here we go. add_test( diff --git a/examples/cpp/microarch_info.cpp b/examples/cpp/microarch_info.cpp new file mode 100644 index 00000000..b70954bf --- /dev/null +++ b/examples/cpp/microarch_info.cpp @@ -0,0 +1,49 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "svs/lib/arch.h" +#include "svs/lib/cpuid.h" +#include + +int main() { + std::ostream& out = std::cout; + auto& arch_env = svs::arch::MicroArchEnvironment::get_instance(); + + // Print support status for all ISA extensions + svs::arch::write_extensions_status(out); + + // Print current microarchitecture + auto current_arch = arch_env.get_microarch(); + out << "\nCurrent µarch: " << svs::arch::microarch_to_string(current_arch) << std::endl; + + // Print all supported microarchitectures + const auto& supported_archs = arch_env.get_supported_microarchs(); + out << "\nSupported µarchs: "; + for (const auto& arch : supported_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; + + // Print all compiled microarchitectures + const auto& compiled_archs = arch_env.get_compiled_microarchs(); + out << "\nCompiled µarchs: "; + for (const auto& arch : compiled_archs) { + out << svs::arch::microarch_to_string(arch) << " "; + } + out << std::endl; + + return 0; +} diff --git a/include/svs/core/distance/cosine.h b/include/svs/core/distance/cosine.h index 9738e881..49281c9e 100644 --- a/include/svs/core/distance/cosine.h +++ b/include/svs/core/distance/cosine.h @@ -19,6 +19,7 @@ // svs #include "svs/core/distance/distance_core.h" #include "svs/core/distance/simd_utils.h" +#include "svs/lib/arch.h" #include "svs/lib/saveload.h" #include "svs/lib/static.h" @@ -32,7 +33,8 @@ namespace svs::distance { // Forward declare implementation to allow entry point to be near the top. -template struct CosineSimilarityImpl; +template +struct CosineSimilarityImpl; // Generic Entry Point // Call as one of either: @@ -41,18 +43,18 @@ template struct CosineSimilarityImpl; // (2) CosineSimilarity::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -class CosineSimilarity { +template class CosineSimilarity { public: template - static constexpr float compute(const Ea* a, const Eb* b, float a_norm, size_t N) { - return CosineSimilarityImpl::compute( + SVS_NOINLINE static float compute(const Ea* a, const Eb* b, float a_norm, size_t N) { + return CosineSimilarityImpl::compute( a, b, a_norm, lib::MaybeStatic(N) ); } template - static constexpr float compute(const Ea* a, const Eb* b, float a_norm) { - return CosineSimilarityImpl::compute( + SVS_NOINLINE static float compute(const Ea* a, const Eb* b, float a_norm) { + return CosineSimilarityImpl::compute( a, b, a_norm, lib::MaybeStatic() ); } @@ -139,9 +141,17 @@ float compute(DistanceCosineSimilarity distance, std::span a, std::span< assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - return CosineSimilarity::compute(a.data(), b.data(), distance.norm_, a.size()); + SVS_DISPATCH_CLASS_BY_MICROARCH( + CosineSimilarity, + compute, + SVS_PACK_ARGS(a.data(), b.data(), distance.norm_, a.size()) + ); } else { - return CosineSimilarity::compute(a.data(), b.data(), distance.norm_); + SVS_DISPATCH_CLASS_BY_MICROARCH( + CosineSimilarity, + compute, + SVS_PACK_ARGS(a.data(), b.data(), distance.norm_) + ); } } @@ -166,7 +176,8 @@ float generic_cosine_similarity( return result / (a_norm * std::sqrt(accum)); }; -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { static float compute( const Ea* a, const Eb* b, @@ -224,7 +235,8 @@ template <> struct CosineFloatOp<16> : public svs::simd::ConvertToFloat<16> { // Small Integers SVS_VALIDATE_BOOL_ENV(SVS_AVX512_VNNI) #if SVS_AVX512_VNNI -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -250,7 +262,8 @@ template struct CosineSimilarityImpl { } }; -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto sum = _mm512_setzero_epi32(); @@ -278,7 +291,8 @@ template struct CosineSimilarityImpl { #endif // Floating and Mixed Types -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -286,7 +300,8 @@ template struct CosineSimilarityImpl { } }; -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -294,7 +309,8 @@ template struct CosineSimilarityImpl { }; }; -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>(), a, b, length); @@ -302,7 +318,8 @@ template struct CosineSimilarityImpl { }; }; -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -310,7 +327,8 @@ template struct CosineSimilarityImpl { } }; -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -318,7 +336,8 @@ template struct CosineSimilarityImpl { } }; -template struct CosineSimilarityImpl { +template +struct CosineSimilarityImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, float a_norm, lib::MaybeStatic length) { auto [sum, norm] = simd::generic_simd_op(CosineFloatOp<16>{}, a, b, length); @@ -327,4 +346,51 @@ template struct CosineSimilarityImpl { }; #endif + +// NOTE: dispatching doesn't work for other CosineSimilarity instances than the listed +// below. +#define SVS_INSTANTIATE_COSINE_DISTANCE_BY_MICROARCH \ + SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + CosineSimilarity, signed char, signed char \ + ) \ + SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + CosineSimilarity, unsigned char, unsigned char \ + ) \ + SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + CosineSimilarity, float, float \ + ) \ + SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + CosineSimilarity, float, unsigned char \ + ) \ + SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + CosineSimilarity, float, signed char \ + ) \ + SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + CosineSimilarity, float, svs::float16::Float16 \ + ) \ + SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + CosineSimilarity, svs::float16::Float16, float \ + ) \ + SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + CosineSimilarity, svs::float16::Float16, svs::float16::Float16 \ + ) + +#define SVS_EXTERN_COSINE_DISTANCE \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, int8_t, int8_t) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, uint8_t, uint8_t) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, float) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, uint8_t) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(CosineSimilarity, float, int8_t) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ + CosineSimilarity, float, svs::float16::Float16 \ + ) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ + CosineSimilarity, svs::float16::Float16, float \ + ) \ + SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES( \ + CosineSimilarity, svs::float16::Float16, svs::float16::Float16 \ + ) + +SVS_EXTERN_COSINE_DISTANCE + } // namespace svs::distance diff --git a/include/svs/core/distance/euclidean.h b/include/svs/core/distance/euclidean.h index 2fc86986..6a612684 100644 --- a/include/svs/core/distance/euclidean.h +++ b/include/svs/core/distance/euclidean.h @@ -19,6 +19,7 @@ // svs #include "svs/core/distance/distance_core.h" #include "svs/core/distance/simd_utils.h" +#include "svs/lib/arch.h" #include "svs/lib/float16.h" #include "svs/lib/preprocessor.h" #include "svs/lib/saveload.h" @@ -71,7 +72,7 @@ namespace svs::distance { // Forward declare implementation to allow entry point to be near the top. -template struct L2Impl; +template struct L2Impl; // Generic Entry Point // Call as one of either: @@ -80,16 +81,16 @@ template struct L2Impl; // (2) L2::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -class L2 { +template class L2 { public: template - static constexpr float compute(const Ea* a, const Eb* b, size_t N) { - return L2Impl::compute(a, b, lib::MaybeStatic(N)); + SVS_NOINLINE static float compute(const Ea* a, const Eb* b, size_t N) { + return L2Impl::compute(a, b, lib::MaybeStatic(N)); } template - static constexpr float compute(const Ea* a, const Eb* b) { - return L2Impl::compute(a, b, lib::MaybeStatic()); + SVS_NOINLINE static float compute(const Ea* a, const Eb* b) { + return L2Impl::compute(a, b, lib::MaybeStatic()); } }; @@ -155,9 +156,13 @@ float compute(DistanceL2 /*unused*/, std::span a, std::span b) { assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - return L2::compute(a.data(), b.data(), a.size()); + SVS_DISPATCH_CLASS_BY_MICROARCH( + L2, compute, SVS_PACK_ARGS(a.data(), b.data(), a.size()) + ); } else { - return L2::compute(a.data(), b.data()); + SVS_DISPATCH_CLASS_BY_MICROARCH( + L2, compute, SVS_PACK_ARGS(a.data(), b.data()) + ); } } @@ -177,7 +182,7 @@ float generic_l2( return result; } -template struct L2Impl { +template struct L2Impl { static constexpr float compute(const Ea* a, const Eb* b, lib::MaybeStatic length = lib::MaybeStatic()) { return generic_l2(a, b, length); @@ -252,14 +257,14 @@ template <> struct L2VNNIOp : public svs::simd::ConvertForVNNI struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2VNNIOp(), a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2VNNIOp(), a, b, length); @@ -269,42 +274,42 @@ template struct L2Impl { #endif // Floating and Mixed Types -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); }; }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); }; }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { return simd::generic_simd_op(L2FloatOp<16>{}, a, b, length); @@ -320,7 +325,7 @@ template struct L2Impl { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -340,7 +345,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -362,7 +367,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -383,7 +388,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -407,7 +412,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -434,7 +439,7 @@ template struct L2Impl { } }; -template struct L2Impl { +template struct L2Impl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -462,4 +467,30 @@ template struct L2Impl { }; #endif + +// NOTE: dispatching doesn't work for other L2 instances than the listed below. +#define SVS_INSTANTIATE_L2_DISTANCE_BY_MICROARCH \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, int8_t, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, uint8_t, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, float, float) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, float, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, float, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, float, svs::float16::Float16) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(L2, svs::float16::Float16, float) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + L2, svs::float16::Float16, svs::float16::Float16 \ + ) + +#define SVS_EXTERN_L2_DISTANCE \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(L2, int8_t, int8_t) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(L2, uint8_t, uint8_t) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(L2, float, float) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(L2, float, uint8_t) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(L2, float, int8_t) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(L2, float, svs::float16::Float16) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(L2, svs::float16::Float16, float) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(L2, svs::float16::Float16, svs::float16::Float16) + +SVS_EXTERN_L2_DISTANCE + } // namespace svs::distance diff --git a/include/svs/core/distance/inner_product.h b/include/svs/core/distance/inner_product.h index 2ad51e17..84322858 100644 --- a/include/svs/core/distance/inner_product.h +++ b/include/svs/core/distance/inner_product.h @@ -19,6 +19,7 @@ // svs #include "svs/core/distance/distance_core.h" #include "svs/core/distance/simd_utils.h" +#include "svs/lib/arch.h" #include "svs/lib/float16.h" #include "svs/lib/preprocessor.h" #include "svs/lib/saveload.h" @@ -32,7 +33,7 @@ namespace svs::distance { // Forward declare implementation to allow entry point to be near the top. -template struct IPImpl; +template struct IPImpl; // Generic Entry Point // Call as one of either: @@ -41,16 +42,16 @@ template struct IPImpl; // (2) IP::compute(a, b) // ``` // Where (2) is when length is known at compile time and (1) is when length is not. -class IP { +template class IP { public: template - static constexpr float compute(const Ea* a, const Eb* b, size_t N) { - return IPImpl::compute(a, b, lib::MaybeStatic(N)); + SVS_NOINLINE static float compute(const Ea* a, const Eb* b, size_t N) { + return IPImpl::compute(a, b, lib::MaybeStatic(N)); } template - static constexpr float compute(const Ea* a, const Eb* b) { - return IPImpl::compute(a, b, lib::MaybeStatic()); + SVS_NOINLINE static float compute(const Ea* a, const Eb* b) { + return IPImpl::compute(a, b, lib::MaybeStatic()); } }; @@ -117,9 +118,13 @@ float compute(DistanceIP /*unused*/, std::span a, std::span b) { assert(a.size() == b.size()); constexpr size_t extent = lib::extract_extent(Da, Db); if constexpr (extent == Dynamic) { - return IP::compute(a.data(), b.data(), a.size()); + SVS_DISPATCH_CLASS_BY_MICROARCH( + IP, compute, SVS_PACK_ARGS(a.data(), b.data(), a.size()) + ); } else { - return IP::compute(a.data(), b.data()); + SVS_DISPATCH_CLASS_BY_MICROARCH( + IP, compute, SVS_PACK_ARGS(a.data(), b.data()) + ); } } @@ -138,7 +143,7 @@ float generic_ip( return result; } -template struct IPImpl { +template struct IPImpl { static float compute(const Ea* a, const Eb* b, lib::MaybeStatic length = lib::MaybeStatic()) { return generic_ip(a, b, length); @@ -207,14 +212,14 @@ template <> struct IPVNNIOp : public svs::simd::ConvertForVNNI struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(IPVNNIOp(), a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { return simd::generic_simd_op(IPVNNIOp(), a, b, length); @@ -224,42 +229,42 @@ template struct IPImpl { #endif // Floating and Mixed Types -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const uint8_t* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); }; }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); }; }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const float* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { return svs::simd::generic_simd_op(IPFloatOp<16>{}, a, b, length); @@ -274,7 +279,7 @@ template struct IPImpl { SVS_VALIDATE_BOOL_ENV(SVS_AVX512_F) SVS_VALIDATE_BOOL_ENV(SVS_AVX2) #if !SVS_AVX512_F && SVS_AVX2 -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const float* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -293,7 +298,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const Float16* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -314,7 +319,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const Float16* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -334,7 +339,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const float* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -357,7 +362,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const int8_t* a, const int8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -383,7 +388,7 @@ template struct IPImpl { } }; -template struct IPImpl { +template struct IPImpl { SVS_NOINLINE static float compute(const uint8_t* a, const uint8_t* b, lib::MaybeStatic length) { constexpr size_t vector_size = 8; @@ -410,4 +415,30 @@ template struct IPImpl { }; #endif + +// NOTE: dispatching doesn't work for other IP instances than the listed below. +#define SVS_INSTANTIATE_IP_DISTANCE_BY_MICROARCH \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, int8_t, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, uint8_t, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, float, float) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, float, uint8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, float, int8_t) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, float, svs::float16::Float16) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(IP, svs::float16::Float16, float) \ + SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES( \ + IP, svs::float16::Float16, svs::float16::Float16 \ + ) + +#define SVS_EXTERN_IP_DISTANCE \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(IP, int8_t, int8_t) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(IP, uint8_t, uint8_t) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(IP, float, float) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(IP, float, uint8_t) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(IP, float, int8_t) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(IP, float, svs::float16::Float16) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(IP, svs::float16::Float16, float) \ + SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(IP, svs::float16::Float16, svs::float16::Float16) + +SVS_EXTERN_IP_DISTANCE + } // namespace svs::distance diff --git a/include/svs/lib/arch.h b/include/svs/lib/arch.h new file mode 100644 index 00000000..ec7d0217 --- /dev/null +++ b/include/svs/lib/arch.h @@ -0,0 +1,446 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "svs/lib/arch_defines.h" +#include "svs/lib/cpuid.h" +#include +#include +#include +#include + +namespace svs::arch { + +enum class MicroArch { +#if defined(__x86_64__) + // Refer to the GCC docs for the list of targeted architectures: + // https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html + nehalem, + westmere, + sandybridge, + ivybridge, + haswell, + broadwell, + skylake, + x86_64_v4, + skylake_avx512, + cascadelake, + cooperlake, + icelake_client, + icelake_server, + sapphirerapids, + graniterapids, + graniterapids_d, +#elif defined(__aarch64__) +#if defined(__APPLE__) + m1, + m2, +#else + neoverse_v1, + neoverse_n2, +#endif +#endif + baseline = 0, +}; + +struct MicroArchInfo { + std::optional parent; + std::vector extensions; + std::string name; +}; + +// Unordered map with MicroArch to MicroArchInfo mapping +inline const std::unordered_map& get_microarch_info_map() { + static const std::unordered_map microarch_info = { +#if defined(__x86_64__) + {MicroArch::nehalem, + {std::nullopt, + {ISAExt::MMX, + ISAExt::SSE, + ISAExt::SSE2, + ISAExt::SSE3, + ISAExt::SSSE3, + ISAExt::SSE4_1, + ISAExt::SSE4_2, + ISAExt::POPCNT, + ISAExt::CX16, + ISAExt::SAHF, + ISAExt::FXSR}, + "nehalem"}}, + {MicroArch::westmere, {MicroArch::nehalem, {ISAExt::PCLMUL}, "westmere"}}, + {MicroArch::sandybridge, + {MicroArch::westmere, {ISAExt::AVX, ISAExt::XSAVE}, "sandybridge"}}, + {MicroArch::ivybridge, + {MicroArch::sandybridge, + {ISAExt::FSGSBASE, ISAExt::RDRND, ISAExt::F16C}, + "ivybridge"}}, + {MicroArch::haswell, + {MicroArch::sandybridge, + {ISAExt::AVX2, + ISAExt::BMI, + ISAExt::BMI2, + ISAExt::LZCNT, + ISAExt::FMA, + ISAExt::MOVBE}, + "haswell"}}, + {MicroArch::broadwell, + {MicroArch::haswell, + {ISAExt::RDSEED, ISAExt::ADCX, ISAExt::PREFETCHW}, + "broadwell"}}, + {MicroArch::skylake, + {MicroArch::broadwell, + {ISAExt::AES, ISAExt::CLFLUSHOPT, ISAExt::XSAVEC, ISAExt::XSAVES, ISAExt::SGX}, + "skylake"}}, + {MicroArch::x86_64_v4, + {std::nullopt, + {ISAExt::AVX512_F, + ISAExt::AVX512_VL, + ISAExt::AVX512_BW, + ISAExt::AVX512_DQ, + ISAExt::AVX512_CD}, + "x86_64_v4"}}, + {MicroArch::skylake_avx512, + {MicroArch::skylake, + {ISAExt::AVX512_F, + ISAExt::CLWB, + ISAExt::AVX512_VL, + ISAExt::AVX512_BW, + ISAExt::AVX512_DQ, + ISAExt::AVX512_CD}, + "skylake_avx512"}}, + {MicroArch::cascadelake, + {MicroArch::skylake_avx512, {ISAExt::AVX512_VNNI}, "cascadelake"}}, + {MicroArch::cooperlake, + {MicroArch::cascadelake, {ISAExt::AVX512_BF16}, "cooperlake"}}, + {MicroArch::icelake_client, + {MicroArch::cascadelake, + {ISAExt::PKU, + ISAExt::AVX512_VBMI, + ISAExt::AVX512_IFMA, + ISAExt::SHA, + ISAExt::GFNI, + ISAExt::VAES, + ISAExt::AVX512_VBMI2, + ISAExt::VPCLMULQDQ, + ISAExt::AVX512_BITALG, + ISAExt::RDPID, + ISAExt::AVX512_VPOPCNTDQ}, + "icelake_client"}}, + {MicroArch::icelake_server, + {MicroArch::icelake_client, + {ISAExt::PCONFIG, ISAExt::WBNOINVD, ISAExt::CLWB}, + "icelake_server"}}, + {MicroArch::sapphirerapids, + {MicroArch::icelake_server, + {ISAExt::MOVDIRI, + ISAExt::MOVDIR64B, + ISAExt::ENQCMD, + ISAExt::CLDEMOTE, + ISAExt::PTWRITE, + ISAExt::WAITPKG, + ISAExt::SERIALIZE, + ISAExt::TSXLDTRK, + ISAExt::UINTR, + ISAExt::AMX_BF16, + ISAExt::AMX_TILE, + ISAExt::AMX_INT8, + ISAExt::AVX_VNNI, + ISAExt::AVX512_FP16, + ISAExt::AVX512_BF16}, + "sapphirerapids"}}, + {MicroArch::graniterapids, + {MicroArch::sapphirerapids, + {ISAExt::AMX_FP16, ISAExt::PREFETCHI}, + "graniterapids"}}, + {MicroArch::graniterapids_d, + {MicroArch::graniterapids, {ISAExt::AMX_COMPLEX}, "graniterapids_d"}}, +#elif defined(__aarch64__) +#if defined(__APPLE__) + {MicroArch::m1, {std::nullopt, {ISAExt::M1}, "m1"}}, + {MicroArch::m2, {std::nullopt, {ISAExt::M2}, "m2"}}, +#else + {MicroArch::neoverse_v1, {std::nullopt, {ISAExt::SVE}, "neoverse_v1"}}, + {MicroArch::neoverse_n2, {MicroArch::neoverse_v1, {ISAExt::SVE2}, "neoverse_n2"}}, +#endif +#endif + {MicroArch::baseline, {std::nullopt, {}, "baseline"}} + }; + return microarch_info; +} + +inline bool arch_is_supported(MicroArch arch) { + const auto& info_map = get_microarch_info_map(); + auto it = info_map.find(arch); + if (it == info_map.end()) { + return false; + } + + const auto& info = it->second; + + // First check if parent architecture is supported + if (info.parent.has_value() && !arch_is_supported(info.parent.value())) { + return false; + } + + // Then check additional extensions + return check_extensions(info.extensions); +} + +inline std::string microarch_to_string(MicroArch arch) { + const auto& info_map = get_microarch_info_map(); + auto it = info_map.find(arch); + if (it != info_map.end()) { + return it->second.name; + } + return "unknown"; +} + +inline MicroArch string_to_microarch(const std::string& arch_name) { + const auto& info_map = get_microarch_info_map(); + for (const auto& [arch, info] : info_map) { + if (info.name == arch_name) { + return arch; + } + } + throw std::invalid_argument("Unknown microarchitecture name: " + arch_name); +} + +class MicroArchEnvironment { + public: + static MicroArchEnvironment& get_instance() { + // TODO: ensure thread safety + static MicroArchEnvironment instance; + return instance; + } + MicroArch get_microarch() const { return max_arch_; } + + void set_microarch(MicroArch arch) { + if (arch_is_supported(arch)) { + max_arch_ = arch; + } else { + throw std::invalid_argument("Unsupported microarchitecture"); + } + } + + const std::vector& get_supported_microarchs() const { + return supported_archs_; + } + + const std::vector& get_compiled_microarchs() const { + return compiled_archs_; + } + + private: + MicroArchEnvironment() { + const std::vector compiled_archs = { +#if defined(__x86_64__) + SVS_MICROARCH_COMPILED_nehalem + SVS_MICROARCH_COMPILED_westmere + SVS_MICROARCH_COMPILED_sandybridge + SVS_MICROARCH_COMPILED_ivybridge + SVS_MICROARCH_COMPILED_haswell + SVS_MICROARCH_COMPILED_broadwell + SVS_MICROARCH_COMPILED_skylake + SVS_MICROARCH_COMPILED_x86_64_v4 + SVS_MICROARCH_COMPILED_skylake_avx512 + SVS_MICROARCH_COMPILED_cascadelake + SVS_MICROARCH_COMPILED_cooperlake + SVS_MICROARCH_COMPILED_icelake_client + SVS_MICROARCH_COMPILED_icelake_server + SVS_MICROARCH_COMPILED_sapphirerapids + SVS_MICROARCH_COMPILED_graniterapids + SVS_MICROARCH_COMPILED_graniterapids_d +#elif defined(__aarch64__) +#if defined(__APPLE__) + SVS_MICROARCH_COMPILED_m1 + SVS_MICROARCH_COMPILED_m2 +#else + SVS_MICROARCH_COMPILED_neoverse_v1 + SVS_MICROARCH_COMPILED_neoverse_n2 +#endif +#endif + }; + compiled_archs_ = compiled_archs; + max_arch_ = MicroArch::baseline; + for (const auto& arch : compiled_archs_) { + if (arch_is_supported(arch)) { + supported_archs_.push_back(arch); + if (static_cast(arch) > static_cast(max_arch_)) { + max_arch_ = arch; + } + } + } + } + + std::vector compiled_archs_; + std::vector supported_archs_; + MicroArch max_arch_; +}; + +#if defined(__x86_64__) + +#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ + svs::arch::MicroArch cpu_arch = \ + svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ + } +#elif defined(__aarch64__) + +#if defined(__APPLE__) + +#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ + svs::arch::MicroArch cpu_arch = \ + svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ + } + +#else + +#define SVS_DISPATCH_CLASS_BY_MICROARCH(cls, method, args) \ + svs::arch::MicroArch cpu_arch = \ + svs::arch::MicroArchEnvironment::get_instance().get_microarch(); \ + switch (cpu_arch) { \ + SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, SVS_PACK_ARGS(args)) \ + SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, SVS_PACK_ARGS(args)) \ + default: \ + return cls::method(args); \ + break; \ + } + +#endif + +#endif + +#define SVS_INST_CLASS_METHOD_TMPL_BY_MICROARCH( \ + return_type, cls, method, template_args, args \ +) \ + template return_type cls::method(args); +// Generic distance dispatching macro +#define SVS_INST_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(cls, a_type, b_type) \ + SVS_INST_CLASS_METHOD_TMPL_BY_MICROARCH( \ + float, \ + svs::distance::cls, \ + compute, \ + SVS_PACK_ARGS(a_type, b_type), \ + SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long) \ + ) +// Generic distance extern macro (required for external linking to uarch-specific implementations) +#if defined(__x86_64__) + +#define SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ + SVS_EXTERN_CLASS_METHOD_westmere(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_sandybridge(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_ivybridge(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_haswell(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_broadwell(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_skylake(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_x86_64_v4(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_skylake_avx512(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_cascadelake(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_cooperlake(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_icelake_client(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_icelake_server(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_sapphirerapids(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_graniterapids(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_graniterapids_d(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) + +#elif defined(__aarch64__) + +#if defined(__APPLE__) + +#define SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ + SVS_EXTERN_CLASS_METHOD_m2(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) + +#else + +#define SVS_EXTERN_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ + SVS_EXTERN_CLASS_METHOD_neoverse_n2(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, unsigned long)) + +#endif + +#endif +// Cosine distance dispatching macro +#define SVS_INST_COSINE_DISTANCE_CLASS_BY_MICROARCH_AND_TYPENAMES(cls, a_type, b_type) \ + SVS_INST_CLASS_METHOD_TMPL_BY_MICROARCH( \ + float, \ + svs::distance::cls, \ + compute, \ + SVS_PACK_ARGS(a_type, b_type), \ + SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long) \ + ) + +// Cosine distance extern macro (required for external linking to uarch-specific implementations) +#if defined(__x86_64__) + +#define SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ + SVS_EXTERN_CLASS_METHOD_westmere(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_sandybridge(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_ivybridge(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_haswell(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_broadwell(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_skylake(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_x86_64_v4(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_skylake_avx512(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_cascadelake(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_cooperlake(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_icelake_client(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_icelake_server(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_sapphirerapids(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_graniterapids(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) \ + SVS_EXTERN_CLASS_METHOD_graniterapids_d(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) + +#elif defined(__aarch64__) + +#if defined(__APPLE__) + +#define SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ + SVS_EXTERN_CLASS_METHOD_m2(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) + +#else + +#define SVS_EXTERN_COSINE_DISTANCE_CLASS_BY_TYPENAMES(cls, a_type, b_type) \ + SVS_EXTERN_CLASS_METHOD_neoverse_n2(float, svs::distance::cls, compute, SVS_PACK_ARGS(a_type, b_type), SVS_PACK_ARGS(a_type const*, b_type const*, float, unsigned long)) + +#endif + +#endif + +} // namespace svs::arch diff --git a/include/svs/lib/arch_defines.h b/include/svs/lib/arch_defines.h new file mode 100644 index 00000000..a9241c04 --- /dev/null +++ b/include/svs/lib/arch_defines.h @@ -0,0 +1,432 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define SVS_PACK_ARGS(...) __VA_ARGS__ +#define SVS_CLASS_METHOD_MICROARCH_CASE(microarch, cls, method, args) \ + case svs::arch::MicroArch::microarch: \ + return cls::method(args); \ + break; +#define SVS_TARGET_MICROARCH svs::arch::MicroArch::SVS_TUNE_TARGET + +#define SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + microarch, return_type, cls, method, template_args, args \ +) \ + extern template return_type \ + cls::method(args); + +// TODO: autogenerate this list +#if defined(__x86_64__) + +#if defined(SVS_MICROARCH_SUPPORT_nehalem) +#define SVS_MICROARCH_COMPILED_nehalem MicroArch::nehalem, +#define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(nehalem, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_nehalem(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + nehalem, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_nehalem +#define SVS_CLASS_METHOD_MICROARCH_CASE_nehalem(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_nehalem(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_westmere) +#define SVS_MICROARCH_COMPILED_westmere MicroArch::westmere, +#define SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(westmere, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_westmere(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + westmere, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_westmere +#define SVS_CLASS_METHOD_MICROARCH_CASE_westmere(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_westmere(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_sandybridge) +#define SVS_MICROARCH_COMPILED_sandybridge MicroArch::sandybridge, +#define SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(sandybridge, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_sandybridge(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + sandybridge, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_sandybridge +#define SVS_CLASS_METHOD_MICROARCH_CASE_sandybridge(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_sandybridge(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_ivybridge) +#define SVS_MICROARCH_COMPILED_ivybridge MicroArch::ivybridge, +#define SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(ivybridge, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_ivybridge(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + ivybridge, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_ivybridge +#define SVS_CLASS_METHOD_MICROARCH_CASE_ivybridge(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_ivybridge(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_haswell) +#define SVS_MICROARCH_COMPILED_haswell MicroArch::haswell, +#define SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(haswell, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_haswell(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + haswell, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_haswell +#define SVS_CLASS_METHOD_MICROARCH_CASE_haswell(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_haswell(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_broadwell) +#define SVS_MICROARCH_COMPILED_broadwell MicroArch::broadwell, +#define SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(broadwell, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_broadwell(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + broadwell, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_broadwell +#define SVS_CLASS_METHOD_MICROARCH_CASE_broadwell(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_broadwell(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_skylake) +#define SVS_MICROARCH_COMPILED_skylake MicroArch::skylake, +#define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(skylake, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_skylake(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + skylake, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_skylake +#define SVS_CLASS_METHOD_MICROARCH_CASE_skylake(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_skylake(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_x86_64_v4) +#define SVS_MICROARCH_COMPILED_x86_64_v4 MicroArch::x86_64_v4, +#define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(x86_64_v4, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_x86_64_v4(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + x86_64_v4, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_x86_64_v4 +#define SVS_CLASS_METHOD_MICROARCH_CASE_x86_64_v4(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_x86_64_v4(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_skylake_avx512) +#define SVS_MICROARCH_COMPILED_skylake_avx512 MicroArch::skylake_avx512, +#define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(skylake_avx512, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_skylake_avx512( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + skylake_avx512, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_skylake_avx512 +#define SVS_CLASS_METHOD_MICROARCH_CASE_skylake_avx512(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_skylake_avx512( \ + return_type, cls, method, template_args, args \ +) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_cascadelake) +#define SVS_MICROARCH_COMPILED_cascadelake MicroArch::cascadelake, +#define SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(cascadelake, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_cascadelake(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + cascadelake, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_cascadelake +#define SVS_CLASS_METHOD_MICROARCH_CASE_cascadelake(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_cascadelake(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_cooperlake) +#define SVS_MICROARCH_COMPILED_cooperlake MicroArch::cooperlake, +#define SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(cooperlake, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_cooperlake(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + cooperlake, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_cooperlake +#define SVS_CLASS_METHOD_MICROARCH_CASE_cooperlake(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_cooperlake(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_icelake_client) +#define SVS_MICROARCH_COMPILED_icelake_client MicroArch::icelake_client, +#define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(icelake_client, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_icelake_client( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + icelake_client, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_icelake_client +#define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_client(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_icelake_client( \ + return_type, cls, method, template_args, args \ +) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_icelake_server) +#define SVS_MICROARCH_COMPILED_icelake_server MicroArch::icelake_server, +#define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(icelake_server, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_icelake_server( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + icelake_server, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_icelake_server +#define SVS_CLASS_METHOD_MICROARCH_CASE_icelake_server(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_icelake_server( \ + return_type, cls, method, template_args, args \ +) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_sapphirerapids) +#define SVS_MICROARCH_COMPILED_sapphirerapids MicroArch::sapphirerapids, +#define SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(sapphirerapids, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_sapphirerapids( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + sapphirerapids, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_sapphirerapids +#define SVS_CLASS_METHOD_MICROARCH_CASE_sapphirerapids(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_sapphirerapids( \ + return_type, cls, method, template_args, args \ +) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_graniterapids) +#define SVS_MICROARCH_COMPILED_graniterapids MicroArch::graniterapids, +#define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(graniterapids, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_graniterapids( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + graniterapids, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_graniterapids +#define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_graniterapids(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_graniterapids_d) +#define SVS_MICROARCH_COMPILED_graniterapids_d MicroArch::graniterapids_d, +#define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(graniterapids_d, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_graniterapids_d( \ + return_type, cls, method, template_args, args \ +) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + graniterapids_d, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_graniterapids_d +#define SVS_CLASS_METHOD_MICROARCH_CASE_graniterapids_d(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_graniterapids_d( \ + return_type, cls, method, template_args, args \ +) +#endif + +#elif defined(__aarch64__) + +#if defined(__APPLE__) + +#if defined(SVS_MICROARCH_SUPPORT_m1) +#define SVS_MICROARCH_COMPILED_m1 MicroArch::m1, +#define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(m1, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_m1(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + m1, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_m1 +#define SVS_CLASS_METHOD_MICROARCH_CASE_m1(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_m1(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_m2) +#define SVS_MICROARCH_COMPILED_m2 MicroArch::m2, +#define SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(m2, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_m2(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + m2, return_type, cls, method, SVS_PACK_ARGS(template_args), SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_m2 +#define SVS_CLASS_METHOD_MICROARCH_CASE_m2(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_m2(return_type, cls, method, template_args, args) +#endif + +#else + +#if defined(SVS_MICROARCH_SUPPORT_neoverse_v1) +#define SVS_MICROARCH_COMPILED_neoverse_v1 MicroArch::neoverse_v1, +#define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_v1, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_neoverse_v1(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + neoverse_v1, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_neoverse_v1 +#define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_v1(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_neoverse_v1(return_type, cls, method, template_args, args) +#endif + +#if defined(SVS_MICROARCH_SUPPORT_neoverse_n2) +#define SVS_MICROARCH_COMPILED_neoverse_n2 MicroArch::neoverse_n2, +#define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, args) \ + SVS_CLASS_METHOD_MICROARCH_CASE(neoverse_n2, cls, method, SVS_PACK_ARGS(args)) +#define SVS_EXTERN_CLASS_METHOD_neoverse_n2(return_type, cls, method, template_args, args) \ + SVS_EXTERN_CLASS_METHOD_TMPL_BY_MICROARCH( \ + neoverse_n2, \ + return_type, \ + cls, \ + method, \ + SVS_PACK_ARGS(template_args), \ + SVS_PACK_ARGS(args) \ + ) +#else +#define SVS_MICROARCH_COMPILED_neoverse_n2 +#define SVS_CLASS_METHOD_MICROARCH_CASE_neoverse_n2(cls, method, args) +#define SVS_EXTERN_CLASS_METHOD_neoverse_n2(return_type, cls, method, template_args, args) +#endif + +#endif +#endif diff --git a/include/svs/lib/cpuid.h b/include/svs/lib/cpuid.h new file mode 100644 index 00000000..0ed67b0c --- /dev/null +++ b/include/svs/lib/cpuid.h @@ -0,0 +1,335 @@ +/* + * Copyright 2025 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#if defined(__x86_64__) +#include +#endif + +#if defined(__aarch64__) && defined(__APPLE__) +#include +#endif + +namespace svs::arch { + +#if defined(__x86_64__) + +enum class ISAExt { + // Common extensions + MMX, + SSE, + SSE2, + SSE3, + SSSE3, + SSE4_1, + SSE4_2, + POPCNT, + CX16, + SAHF, + FXSR, + AVX, + XSAVE, + PCLMUL, + FSGSBASE, + RDRND, + F16C, + AVX2, + BMI, + BMI2, + LZCNT, + FMA, + MOVBE, + RDSEED, + ADCX, + PREFETCHW, + AES, + CLFLUSHOPT, + XSAVEC, + XSAVES, + SGX, + CLWB, + PKU, + SHA, + GFNI, + VAES, + VPCLMULQDQ, + RDPID, + PCONFIG, + WBNOINVD, + MOVDIRI, + MOVDIR64B, + ENQCMD, + CLDEMOTE, + PTWRITE, + WAITPKG, + SERIALIZE, + TSXLDTRK, + UINTR, + PREFETCHI, + + // AVX family + AVX_VNNI, + + // AVX512 family + AVX512_F, + AVX512_VL, + AVX512_BW, + AVX512_DQ, + AVX512_CD, + AVX512_VBMI, + AVX512_IFMA, + AVX512_VNNI, + AVX512_VBMI2, + AVX512_BITALG, + AVX512_VPOPCNTDQ, + AVX512_BF16, + AVX512_FP16, + + // AMX family + AMX_BF16, + AMX_TILE, + AMX_INT8, + AMX_FP16, + AMX_COMPLEX +}; + +struct CPUIDFlag { + const uint32_t function; // EAX input for CPUID + const uint32_t subfunction; // ECX input for CPUID + const uint32_t reg; // Register index (0=EAX, 1=EBX, 2=ECX, 3=EDX) + const uint32_t bit; // Bit position in the register + const char* name; + + bool get_value() const { + std::array regs{}; + __cpuid_count(function, subfunction, regs[0], regs[1], regs[2], regs[3]); + return (regs[reg] & (1 << bit)) != 0; + } +}; + +inline const std::unordered_map& get_isa_ext_info() { + static const std::unordered_map isa_ext_info = { + // flags are sorted by function, subfunction, register and bit + {ISAExt::MMX, {1, 0, 3, 23, "MMX"}}, + {ISAExt::FXSR, {1, 0, 3, 24, "FXSR"}}, + {ISAExt::SSE, {1, 0, 3, 25, "SSE"}}, + {ISAExt::SSE2, {1, 0, 3, 26, "SSE2"}}, + {ISAExt::SSE3, {1, 0, 2, 0, "SSE3"}}, + {ISAExt::PCLMUL, {1, 0, 2, 1, "PCLMUL"}}, + {ISAExt::SSSE3, {1, 0, 2, 9, "SSSE3"}}, + {ISAExt::FMA, {1, 0, 2, 12, "FMA"}}, + {ISAExt::CX16, {1, 0, 2, 13, "CX16"}}, + {ISAExt::SSE4_1, {1, 0, 2, 19, "SSE4_1"}}, + {ISAExt::SSE4_2, {1, 0, 2, 20, "SSE4_2"}}, + {ISAExt::MOVBE, {1, 0, 2, 22, "MOVBE"}}, + {ISAExt::POPCNT, {1, 0, 2, 23, "POPCNT"}}, + {ISAExt::AES, {1, 0, 2, 25, "AES"}}, + {ISAExt::XSAVE, {1, 0, 2, 26, "XSAVE"}}, + {ISAExt::AVX, {1, 0, 2, 28, "AVX"}}, + {ISAExt::F16C, {1, 0, 2, 29, "F16C"}}, + {ISAExt::RDRND, {1, 0, 2, 30, "RDRND"}}, + {ISAExt::FSGSBASE, {7, 0, 1, 0, "FSGSBASE"}}, + {ISAExt::SGX, {7, 0, 1, 2, "SGX"}}, + {ISAExt::BMI, {7, 0, 1, 3, "BMI"}}, + {ISAExt::AVX2, {7, 0, 1, 5, "AVX2"}}, + {ISAExt::BMI2, {7, 0, 1, 8, "BMI2"}}, + {ISAExt::AVX512_F, {7, 0, 1, 16, "AVX512_F"}}, + {ISAExt::AVX512_DQ, {7, 0, 1, 17, "AVX512_DQ"}}, + {ISAExt::RDSEED, {7, 0, 1, 18, "RDSEED"}}, + {ISAExt::ADCX, {7, 0, 1, 19, "ADCX"}}, + {ISAExt::AVX512_IFMA, {7, 0, 1, 21, "AVX512_IFMA"}}, + {ISAExt::CLFLUSHOPT, {7, 0, 1, 23, "CLFLUSHOPT"}}, + {ISAExt::CLWB, {7, 0, 1, 24, "CLWB"}}, + {ISAExt::AVX512_CD, {7, 0, 1, 28, "AVX512_CD"}}, + {ISAExt::SHA, {7, 0, 1, 29, "SHA"}}, + {ISAExt::AVX512_BW, {7, 0, 1, 30, "AVX512_BW"}}, + {ISAExt::AVX512_VL, {7, 0, 1, 31, "AVX512_VL"}}, + {ISAExt::AVX512_VBMI, {7, 0, 2, 1, "AVX512_VBMI"}}, + {ISAExt::PKU, {7, 0, 2, 3, "PKU"}}, + {ISAExt::WAITPKG, {7, 0, 2, 5, "WAITPKG"}}, + {ISAExt::AVX512_VBMI2, {7, 0, 2, 6, "AVX512_VBMI2"}}, + {ISAExt::GFNI, {7, 0, 2, 8, "GFNI"}}, + {ISAExt::VAES, {7, 0, 2, 9, "VAES"}}, + {ISAExt::VPCLMULQDQ, {7, 0, 2, 10, "VPCLMULQDQ"}}, + {ISAExt::AVX512_VNNI, {7, 0, 2, 11, "AVX512_VNNI"}}, + {ISAExt::AVX512_BITALG, {7, 0, 2, 12, "AVX512_BITALG"}}, + {ISAExt::AVX512_VPOPCNTDQ, {7, 0, 2, 14, "AVX512_VPOPCNTDQ"}}, + {ISAExt::RDPID, {7, 0, 2, 22, "RDPID"}}, + {ISAExt::CLDEMOTE, {7, 0, 2, 25, "CLDEMOTE"}}, + {ISAExt::MOVDIRI, {7, 0, 2, 27, "MOVDIRI"}}, + {ISAExt::MOVDIR64B, {7, 0, 2, 28, "MOVDIR64B"}}, + {ISAExt::ENQCMD, {7, 0, 2, 29, "ENQCMD"}}, + {ISAExt::UINTR, {7, 0, 3, 5, "UINTR"}}, + {ISAExt::SERIALIZE, {7, 0, 3, 14, "SERIALIZE"}}, + {ISAExt::TSXLDTRK, {7, 0, 3, 16, "TSXLDTRK"}}, + {ISAExt::PCONFIG, {7, 0, 3, 18, "PCONFIG"}}, + {ISAExt::AMX_BF16, {7, 0, 3, 22, "AMX_BF16"}}, + {ISAExt::AVX512_FP16, {7, 0, 3, 23, "AVX512_FP16"}}, + {ISAExt::AMX_TILE, {7, 0, 3, 24, "AMX_TILE"}}, + {ISAExt::AMX_INT8, {7, 0, 3, 25, "AMX_INT8"}}, + {ISAExt::AVX_VNNI, {7, 1, 0, 4, "AVX_VNNI"}}, + {ISAExt::AVX512_BF16, {7, 1, 0, 5, "AVX512_BF16"}}, + {ISAExt::AMX_FP16, {7, 1, 0, 21, "AMX_FP16"}}, + {ISAExt::AMX_COMPLEX, {7, 1, 3, 8, "AMX_COMPLEX"}}, + {ISAExt::PREFETCHI, {7, 1, 3, 14, "PREFETCHI"}}, + {ISAExt::XSAVEC, {0xD, 1, 0, 1, "XSAVEC"}}, + {ISAExt::XSAVES, {0xD, 1, 0, 3, "XSAVES"}}, + {ISAExt::PTWRITE, {0x14, 0, 1, 4, "PTWRITE"}}, + {ISAExt::WBNOINVD, {0x80000008, 0, 1, 9, "WBNOINVD"}}, + {ISAExt::SAHF, {0x80000001, 0, 2, 0, "SAHF"}}, + {ISAExt::LZCNT, {0x80000001, 0, 2, 5, "LZCNT"}}, + {ISAExt::PREFETCHW, {0x80000001, 0, 2, 8, "PREFETCHW"}}, + }; + return isa_ext_info; +} + +#elif defined(__aarch64__) + +#if defined(__APPLE__) + +enum class ISAExt { + M1, + M2, +}; + +struct BrandInfo { + const char* name; + + bool get_value() const { + char buffer[256]; + size_t size = sizeof(buffer); + + if (sysctlbyname("machdep.cpu.brand_string", &buffer, &size, nullptr, 0) == 0) { + std::string brand(buffer); + return brand.find(name) != std::string::npos; + } + + return false; + } +}; + +inline const std::unordered_map& get_isa_ext_info() { + static const std::unordered_map isa_ext_info = { + {ISAExt::M1, {"M1"}}, + {ISAExt::M2, {"M2"}}, + }; + return isa_ext_info; +} + +#else + +enum class ISAExt { + SVE, + SVE2, +}; + +// Define register ID values for ARM features detection +#define ID_AA64PFR0_EL1 0 +#define ID_AA64ZFR0_EL1 1 + +// Helper template to read system registers with mrs instruction +template inline uint64_t read_system_reg() { + uint64_t val; + if constexpr (ID == ID_AA64PFR0_EL1) { + asm("mrs %0, id_aa64pfr0_el1" : "=r"(val)); + } else if constexpr (ID == ID_AA64ZFR0_EL1) { + asm("mrs %0, id_aa64zfr0_el1" : "=r"(val)); + } else { + val = 0; + } + return val; +} + +// Extract bits from register value +inline uint64_t extract_bits(uint64_t val, int pos, int len) { + return (val >> pos) & ((1ULL << len) - 1); +} + +struct MSRFlag { + unsigned int reg_id; // System register ID + int bit_pos; // Bit position in the register + int bit_len; // Number of bits to check + uint64_t expected_val; // Expected value for feature to be present + const char* name; // Feature name + + bool get_value() const { + uint64_t reg_val = 0; + + try { + switch (reg_id) { + case ID_AA64PFR0_EL1: + reg_val = read_system_reg(); + break; + case ID_AA64ZFR0_EL1: + if (extract_bits(read_system_reg(), 32, 4) != 0) { + reg_val = read_system_reg(); + } + break; + default: + return false; + } + + return extract_bits(reg_val, bit_pos, bit_len) == expected_val; + } catch (...) { + // If reading the register fails, the feature is not supported + return false; + } + } +}; + +inline const std::unordered_map& get_isa_ext_info() { + static const std::unordered_map isa_ext_info = { + {ISAExt::SVE, {ID_AA64PFR0_EL1, 32, 4, 1, "sve"}}, + {ISAExt::SVE2, {ID_AA64ZFR0_EL1, 0, 4, 1, "sve2"}}, + }; + return isa_ext_info; +} + +#endif +#endif + +inline bool check_extension(ISAExt ext) { return get_isa_ext_info().at(ext).get_value(); } + +inline bool check_extensions(std::vector exts) { + for (const auto& ext : exts) { + if (!check_extension(ext)) { + return false; + } + } + return true; +} + +template inline void write_extensions_status(StreamType& stream) { + const auto& ext_info = get_isa_ext_info(); + + stream << "CPU Extensions Support Status:" << std::endl; + stream << "-----------------------------" << std::endl; + + for (const auto& [ext, info] : ext_info) { + stream << info.name << ": " + << (check_extension(ext) ? "Supported" : "Not supported") << std::endl; + } +} + +} // namespace svs::arch diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 45054826..6a040384 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -186,7 +186,7 @@ if (SVS_EXPERIMENTAL_ENABLE_NUMA) list(APPEND TEST_SOURCES ${NUMA_TESTS}) endif() -add_executable(tests ${TEST_SOURCES}) +add_executable(tests ${TEST_SOURCES} ${MICROARCH_OBJECT_FILES}) # Path to the test dataset. set(DATA_DIRECTORY "${PROJECT_SOURCE_DIR}/data") @@ -196,7 +196,7 @@ target_compile_definitions(tests PRIVATE SVS_TEST_DATA_DIR="${DATA_DIRECTORY}") target_link_libraries(tests PRIVATE ${SVS_LIB}) target_link_libraries( - tests PRIVATE svs_compile_options svs_native_options svs_benchmark_library + tests PRIVATE svs_compile_options svs_microarch_options_base svs_benchmark_library ) target_link_libraries(tests PRIVATE Catch2::Catch2WithMain) diff --git a/tests/svs/core/distances/cosine.cpp b/tests/svs/core/distances/cosine.cpp index 8c68ebc1..24915d97 100644 --- a/tests/svs/core/distances/cosine.cpp +++ b/tests/svs/core/distances/cosine.cpp @@ -85,11 +85,16 @@ void test_types(T lo, T hi, size_t num_tests) { // Statically Sized Computation auto a_norm = svs::distance::norm(std::span{a.data(), a.size()}); CATCH_REQUIRE( - (svs::distance::CosineSimilarity::compute(a.data(), b.data(), a_norm) == - expected) + // TODO: replace baseline with something else? + (svs::distance::CosineSimilarity::compute( + a.data(), b.data(), a_norm + ) == expected) ); // Dynamically Sized Computation - auto dist = svs::distance::CosineSimilarity::compute(a.data(), b.data(), a_norm, N); + auto dist = + svs::distance::CosineSimilarity::compute( + a.data(), b.data(), a_norm, N + ); CATCH_REQUIRE((dist == expected)); } } diff --git a/tests/svs/core/distances/distance_euclidean.cpp b/tests/svs/core/distances/distance_euclidean.cpp index 88c23fe4..1e375b43 100644 --- a/tests/svs/core/distances/distance_euclidean.cpp +++ b/tests/svs/core/distances/distance_euclidean.cpp @@ -68,9 +68,17 @@ void test_types(T lo, T hi, size_t num_tests) { auto expected = Catch::Approx(euclidean_reference(a, b)); // Statically Sized Computation - CATCH_REQUIRE((svs::distance::L2::compute(a.data(), b.data()) == expected)); + CATCH_REQUIRE( + (svs::distance::L2::compute( + a.data(), b.data() + ) == expected) + ); // Dynamically Sized Computation - CATCH_REQUIRE((svs::distance::L2::compute(a.data(), b.data(), N) == expected)); + CATCH_REQUIRE( + (svs::distance::L2::compute( + a.data(), b.data(), N + ) == expected) + ); } } } // namespace diff --git a/tests/svs/core/distances/inner_product.cpp b/tests/svs/core/distances/inner_product.cpp index a074a058..b5f0462e 100644 --- a/tests/svs/core/distances/inner_product.cpp +++ b/tests/svs/core/distances/inner_product.cpp @@ -76,9 +76,17 @@ void test_types(T lo, T hi, size_t num_tests) { .margin(INNERPRODUCT_MARGIN); // Statically Sized Computation - CATCH_REQUIRE((svs::distance::IP::compute(a.data(), b.data()) == expected)); + CATCH_REQUIRE( + (svs::distance::IP::compute( + a.data(), b.data() + ) == expected) + ); // Dynamically Sized Computation - CATCH_REQUIRE((svs::distance::IP::compute(a.data(), b.data(), N) == expected)); + CATCH_REQUIRE( + (svs::distance::IP::compute( + a.data(), b.data(), N + ) == expected) + ); } } } // anonymous namespace diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 85c6b316..d4935abb 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -13,7 +13,7 @@ # limitations under the License. function(create_utility exe file) - add_executable(${exe} ${file}) + add_executable(${exe} ${file} ${MICROARCH_OBJECT_FILES}) target_include_directories( ${exe} PRIVATE ${CMAKE_CURRENT_LIST_DIR} @@ -22,7 +22,7 @@ function(create_utility exe file) target_link_libraries(${exe} PRIVATE ${SVS_LIB}) # Get common compiler options with the unit tests. - target_link_libraries(${exe} PRIVATE svs_compile_options svs_native_options) + target_link_libraries(${exe} PRIVATE svs_compile_options svs_microarch_options_base) # Link with third-party executables. target_link_libraries(${exe} PRIVATE fmt::fmt)