From 0818a58363b4908b4892d6088c5ad11eccb3ad55 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 2 Sep 2021 16:25:06 -0700 Subject: [PATCH 1/5] universial binary --- cmake/CMakeLists.txt | 108 +++-- cmake/onnxruntime.cmake | 2 +- cmake/onnxruntime_common.cmake | 33 +- cmake/onnxruntime_java.cmake | 19 +- cmake/onnxruntime_mlas.cmake | 640 ++++++++++++++++-------------- cmake/onnxruntime_python.cmake | 5 +- cmake/onnxruntime_training.cmake | 4 +- cmake/onnxruntime_unittests.cmake | 12 +- 8 files changed, 469 insertions(+), 354 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index bbed3461632a0..35bfe9d91103e 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -168,6 +168,14 @@ set(ONNX_CUSTOM_PROTOC_EXECUTABLE "" CACHE STRING "Specify custom protoc executa # pre-build python path option(onnxruntime_PREBUILT_PYTORCH_PATH "Path to pytorch installation dir") +if(APPLE) + if(NOT CMAKE_OSX_ARCHITECTURES) + message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}") + endif() +elseif(NOT WIN32 AND NOT APPLE) + message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}") +endif() + # Single output director for all binaries set (RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") @@ -507,6 +515,30 @@ endmacro() #Set global compile flags for all the source code(including third_party code like protobuf) #This section must be before any add_subdirectory, otherwise build may fail because /MD,/MT mismatch if (MSVC) + enable_language(ASM_MASM) + if(CMAKE_GENERATOR_PLATFORM) + # Multi-platform generator + set(onnxruntime_target_platform ${CMAKE_GENERATOR_PLATFORM}) + else() + set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR}) + endif() + if(onnxruntime_target_platform STREQUAL "ARM64") + set(onnxruntime_target_platform "ARM64") + elseif(onnxruntime_target_platform STREQUAL "ARM64EC") + set(onnxruntime_target_platform "ARM64EC") + elseif(onnxruntime_target_platform STREQUAL 
"ARM" OR CMAKE_GENERATOR MATCHES "ARM") + set(onnxruntime_target_platform "ARM") + elseif(onnxruntime_target_platform STREQUAL "x64" OR onnxruntime_target_platform STREQUAL "x86_64" OR onnxruntime_target_platform STREQUAL "AMD64" OR CMAKE_GENERATOR MATCHES "Win64") + set(onnxruntime_target_platform "x64") + elseif(onnxruntime_target_platform STREQUAL "Win32" OR onnxruntime_target_platform STREQUAL "x86" OR onnxruntime_target_platform STREQUAL "i386" OR onnxruntime_target_platform STREQUAL "i686") + set(onnxruntime_target_platform "x86") + if(NOT onnxruntime_BUILD_WEBASSEMBLY) + message("Enabling SAFESEH for x86 build") + set(CMAKE_ASM_MASM_FLAGS "${CMAKE_ASM_MASM_FLAGS} /safeseh") + endif() + endif() + + #Always enable exception handling, even for Windows ARM if(NOT onnxruntime_DISABLE_EXCEPTIONS) string(APPEND CMAKE_CXX_FLAGS " /EHsc") @@ -555,6 +587,9 @@ if (MSVC) SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /guard:cf") endif() else() + if(NOT APPLE) + set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR}) + endif() if(onnxruntime_BUILD_FOR_NATIVE_MACHINE) string(APPEND CMAKE_CXX_FLAGS " -march=native -mtune=native") string(APPEND CMAKE_C_FLAGS " -march=native -mtune=native") @@ -736,6 +771,17 @@ else() endif() if(TARGET protoc) set_target_properties(protoc PROPERTIES FOLDER "External/Protobuf") + get_target_property(PROTOC_OSX_ARCH protoc OSX_ARCHITECTURES) + if(PROTOC_OSX_ARCH) + if (${CMAKE_HOST_SYSTEM_PROCESSOR} IN_LIST PROTOC_OSX_ARCH) + message("protoc can run") + else() + list(APPEND PROTOC_OSX_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR}) + set_target_properties(protoc PROPERTIES OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}") + set_target_properties(libprotoc PROPERTIES OSX_ARCHITECTURES "${PROTOC_OSX_ARCH}") + set_target_properties(libprotobuf PROPERTIES OSX_ARCHITECTURES "${PROTOC_OSX_ARCH}") + endif() + endif() endif() if (onnxruntime_USE_FULL_PROTOBUF) set(PROTOBUF_LIB libprotobuf) @@ -810,31 +856,35 @@ if(NOT TARGET re2::re2) 
set(RE2_INCLUDE_DIR ${REPO_ROOT}/cmake/external/re2) endif() + # Adding pytorch CPU info library # TODO!! need a better way to find out the supported architectures -set(TARGET_ARCH "${CMAKE_SYSTEM_PROCESSOR}") -if (NOT DEFINED TARGET_ARCH OR "${TARGET_ARCH}" STREQUAL "") - set(TARGET_ARCH "${CMAKE_OSX_ARCHITECTURES}") -endif() - -set(CPUINFO_SUPPORTED TRUE) -if (NOT DEFINED TARGET_ARCH OR "${TARGET_ARCH}" STREQUAL "") - message(WARNING - "Target processor architecture is not specified. cpuinfo not included") - set(CPUINFO_SUPPORTED FALSE) -elseif(NOT TARGET_ARCH MATCHES "^(i[3-6]86|AMD64|x86(_64)?|armv[5-8].*|aarch64|arm64)$") - message(WARNING - "Target processor architecture \"${CPUINFO_TARGET_PROCESSOR}\" is not supported in cpuinfo. " - "cpuinfo not included.") - set(CPUINFO_SUPPORTED FALSE) -elseif(MSVC AND (( CMAKE_SYSTEM_PROCESSOR MATCHES "^(ARM.*|arm.*)$" ) OR (CMAKE_GENERATOR_PLATFORM MATCHES "^(ARM.*|arm.*)$" ) )) - message(WARNING - "Cpuinfo not included for compilation problems with Windows ARM.") - set(CPUINFO_SUPPORTED FALSE) -elseif(WINDOWS_STORE) - message(WARNING - "Cpuinfo not included in Windows Store builds") - set(CPUINFO_SUPPORTED FALSE) +list(LENGTH CMAKE_OSX_ARCHITECTURES CMAKE_OSX_ARCHITECTURES_LEN) +if(APPLE) + if(CMAKE_OSX_ARCHITECTURES_LEN LESS_EQUAL 1) + set(CPUINFO_SUPPORTED TRUE) + endif() +else() + if(onnxruntime_BUILD_WEBASSEMBLY) + set(CPUINFO_SUPPORTED FALSE) + else() + set(CPUINFO_SUPPORTED TRUE) + endif() + if(WIN32) + # Exclude Windows ARM build and Windows Store + if(${onnxruntime_target_platform} MATCHES "^(ARM.*|arm.*)$" ) + message(WARNING "Cpuinfo not included for compilation problems with Windows ARM.") + set(CPUINFO_SUPPORTED FALSE) + elseif(WINDOWS_STORE) + message(WARNING "Cpuinfo not included in Windows Store builds") + set(CPUINFO_SUPPORTED FALSE) + endif() + elseif(NOT ${onnxruntime_target_platform} MATCHES "^(i[3-6]86|AMD64|x86(_64)?|armv[5-8].*|aarch64|arm64)$") + message(WARNING + "Target processor architecture 
\"${onnxruntime_target_platform}\" is not supported in cpuinfo. " + "cpuinfo not included.") + set(CPUINFO_SUPPORTED FALSE) + endif() endif() # TODO do we have to add target_include_directories to each project that uses this? @@ -844,8 +894,8 @@ if(CPUINFO_SUPPORTED) set(CPUINFO_BUILD_TOOLS OFF CACHE INTERNAL "") set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE INTERNAL "") set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE INTERNAL "") - set(CPUINFO_BUILD_BENCHMARKS OFF CACHE INTERNAL "") - + set(CPUINFO_BUILD_BENCHMARKS OFF CACHE INTERNAL "") + if (CMAKE_SYSTEM_NAME STREQUAL "iOS") set(IOS ON CACHE INTERNAL "") set(IOS_ARCH "${CMAKE_OSX_ARCHITECTURES}" CACHE INTERNAL "") @@ -1162,6 +1212,9 @@ function(onnxruntime_add_shared_library_module target_name) endif() onnxruntime_configure_target(${target_name}) + if (onnxruntime_target_platform STREQUAL "x86" AND NOT onnxruntime_BUILD_WEBASSEMBLY) + target_link_options(${target_name} PRIVATE /SAFESEH) + endif() endfunction() function(onnxruntime_add_executable target_name) @@ -1170,6 +1223,9 @@ function(onnxruntime_add_executable target_name) endif() add_executable(${target_name} ${ARGN}) onnxruntime_configure_target(${target_name}) + if (onnxruntime_target_platform STREQUAL "x86" AND NOT onnxruntime_BUILD_WEBASSEMBLY) + target_link_options(${target_name} PRIVATE /SAFESEH) + endif() endfunction() function(onnxruntime_add_include_to_target dst_target) @@ -1813,7 +1869,7 @@ if (WINDOWS_STORE) target_link_options(onnxruntime PRIVATE /DYNAMICBASE /NXCOMPAT /APPCONTAINER) target_link_options(winml_dll PRIVATE /DYNAMICBASE /NXCOMPAT /APPCONTAINER) - if (onnxruntime_target_platform STREQUAL "x86") + if (onnxruntime_target_platform STREQUAL "x86" AND NOT onnxruntime_BUILD_WEBASSEMBLY) target_link_options(onnxruntime PRIVATE /SAFESEH) target_link_options(winml_dll PRIVATE /SAFESEH) endif() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 559ada81294ab..be50c0d88cfe2 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake 
@@ -174,8 +174,8 @@ set(onnxruntime_INTERNAL_LIBRARIES ${onnxruntime_tvm_libs} onnxruntime_framework onnxruntime_graph + ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common - onnxruntime_mlas onnxruntime_flatbuffers ) diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index 0b457bf861cda..060d240066a6a 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -65,24 +65,6 @@ else() endif() endif() -if(CMAKE_GENERATOR_PLATFORM) - # Multi-platform generator - set(onnxruntime_target_platform ${CMAKE_GENERATOR_PLATFORM}) -else() - set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR}) -endif() -if(onnxruntime_target_platform STREQUAL "ARM64") - set(onnxruntime_target_platform "ARM64") -elseif(onnxruntime_target_platform STREQUAL "ARM64EC") - set(onnxruntime_target_platform "ARM64EC") -elseif(onnxruntime_target_platform STREQUAL "ARM" OR CMAKE_GENERATOR MATCHES "ARM") - set(onnxruntime_target_platform "ARM") -elseif(onnxruntime_target_platform STREQUAL "x64" OR onnxruntime_target_platform STREQUAL "x86_64" OR onnxruntime_target_platform STREQUAL "AMD64" OR CMAKE_GENERATOR MATCHES "Win64") - set(onnxruntime_target_platform "x64") -elseif(onnxruntime_target_platform STREQUAL "Win32" OR onnxruntime_target_platform STREQUAL "x86" OR onnxruntime_target_platform STREQUAL "i386" OR onnxruntime_target_platform STREQUAL "i686") - set(onnxruntime_target_platform "x86") -endif() - if(onnxruntime_target_platform STREQUAL "ARM64EC") if (MSVC) link_directories("$ENV{VCINSTALLDIR}/Tools/MSVC/$ENV{VCToolsVersion}/lib/ARM64EC") @@ -185,18 +167,11 @@ if(MSVC) elseif(onnxruntime_target_platform STREQUAL "x86") set(X86 TRUE) endif() -elseif(NOT onnxruntime_BUILD_WEBASSEMBLY) - if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64") - set(ARM64 TRUE) - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64e") - set(ARM64 TRUE) - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "arm") - set(ARM TRUE) - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") - set(X86_64 
TRUE) - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "i386") - set(X86 TRUE) +elseif(APPLE) + if(CMAKE_OSX_ARCHITECTURES_LEN LESS_EQUAL 1) + set(X64 TRUE) endif() +elseif(NOT onnxruntime_BUILD_WEBASSEMBLY) if (CMAKE_SYSTEM_NAME STREQUAL "Android") if (CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") set(ARM TRUE) diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake index afc8ed1bf1564..8057fa3ada7d1 100644 --- a/cmake/onnxruntime_java.cmake +++ b/cmake/onnxruntime_java.cmake @@ -79,7 +79,24 @@ endif() # Set platform and arch for packaging # Checks the names set by MLAS on non-Windows platforms first -if (CMAKE_SYSTEM_NAME STREQUAL "Android") +if(APPLE) + get_target_property(ONNXRUNTIME4J_OSX_ARCH onnxruntime4j_jni OSX_ARCHITECTURES) + list(LENGTH ONNXRUNTIME4J_OSX_ARCH ONNXRUNTIME4J_OSX_ARCH_LEN) + if(ONNXRUNTIME4J_OSX_ARCH) + if(ONNXRUNTIME4J_OSX_ARCH_LEN LESS_EQUAL 1) + list(GET ONNXRUNTIME4J_OSX_ARCH 0 JNI_ARCH) + message("Set Java ARCH TO macOS/iOS ${JNI_ARCH}") + else() + message(FATAL_ERROR "Java is currently not supported for macOS universal") + endif() + else() + set(JNI_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR}) + message("Set Java ARCH TO macOS/iOS ${JNI_ARCH}") + endif() + if(JNI_ARCH STREQUAL "x86_64") + set(JNI_ARCH x64) + endif() +elseif (CMAKE_SYSTEM_NAME STREQUAL "Android") set(JNI_ARCH ${ANDROID_ABI}) elseif (ARM64) set(JNI_ARCH aarch64) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 9807c9997dbb3..ec593cb8e1ffb 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -1,65 +1,62 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-set(mlas_common_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/platform.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/threading.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/sgemm.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qdwconv.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/convolve.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/pooling.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/transpose.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/reorder.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/snchwc.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/activate.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/logistic.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/tanh.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/erf.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/compute.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/quantize.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_default.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qladd.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qlmul.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qpostprocessor.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qlgavgpool.cpp +set(MLAS_SRC_DIR ${ONNXRUNTIME_ROOT}/core/mlas/lib) + +onnxruntime_add_static_library(onnxruntime_mlas + ${MLAS_SRC_DIR}/platform.cpp + ${MLAS_SRC_DIR}/threading.cpp + ${MLAS_SRC_DIR}/sgemm.cpp + ${MLAS_SRC_DIR}/qgemm.cpp + ${MLAS_SRC_DIR}/qdwconv.cpp + ${MLAS_SRC_DIR}/convolve.cpp + ${MLAS_SRC_DIR}/pooling.cpp + ${MLAS_SRC_DIR}/transpose.cpp + ${MLAS_SRC_DIR}/reorder.cpp + ${MLAS_SRC_DIR}/snchwc.cpp + ${MLAS_SRC_DIR}/activate.cpp + ${MLAS_SRC_DIR}/logistic.cpp + ${MLAS_SRC_DIR}/tanh.cpp + ${MLAS_SRC_DIR}/erf.cpp + ${MLAS_SRC_DIR}/compute.cpp + ${MLAS_SRC_DIR}/quantize.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_default.cpp + ${MLAS_SRC_DIR}/qladd.cpp + ${MLAS_SRC_DIR}/qlmul.cpp + ${MLAS_SRC_DIR}/qpostprocessor.cpp + ${MLAS_SRC_DIR}/qlgavgpool.cpp ) -if (onnxruntime_BUILD_WEBASSEMBLY) - if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) - file(GLOB_RECURSE mlas_platform_srcs - "${ONNXRUNTIME_ROOT}/core/mlas/lib/wasm_simd/*.cpp" - ) - else() - 
file(GLOB_RECURSE mlas_platform_srcs - "${ONNXRUNTIME_ROOT}/core/mlas/lib/wasm/*.cpp" - ) - endif() -elseif(MSVC) +set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas) + +#TODO: set MASM flags properly +function(setup_mlas_source_for_windows) + #The onnxruntime_target_platform variable was added by Windows AI team in onnxruntime_common.cmake + #Don't use it for other platforms. if((onnxruntime_target_platform STREQUAL "ARM64") OR (onnxruntime_target_platform STREQUAL "ARM64EC")) set(PREPROCESS_ARMASM_FLAGS "") set(ARMASM_FLAGS "") if(onnxruntime_target_platform STREQUAL "ARM64") - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_neon.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_udot.cpp + target_sources(onnxruntime_mlas PRIVATE + ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp ) set(mlas_platform_preprocess_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm64/QgemmU8X8KernelNeon.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm64/QgemmS8S8KernelNeon.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm64/QgemmU8X8KernelUdot.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm64/SgemmKernelNeon.asm + ${MLAS_SRC_DIR}/arm64/QgemmU8X8KernelNeon.asm + ${MLAS_SRC_DIR}/arm64/QgemmS8S8KernelNeon.asm + ${MLAS_SRC_DIR}/arm64/QgemmU8X8KernelUdot.asm + ${MLAS_SRC_DIR}/arm64/SgemmKernelNeon.asm ) else() - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_neon.cpp + target_sources(onnxruntime_mlas PRIVATE + ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp ) set(mlas_platform_preprocess_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm64ec/QgemmU8X8KernelNeon.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm64ec/SgemmKernelNeon.asm + ${MLAS_SRC_DIR}/arm64ec/QgemmU8X8KernelNeon.asm + ${MLAS_SRC_DIR}/arm64ec/SgemmKernelNeon.asm ) string(APPEND PREPROCESS_ARMASM_FLAGS " /arm64EC") @@ -88,288 +85,355 @@ elseif(MSVC) DEPENDS ${asm_filename} BYPRODUCTS ${preprocess_filename} ) - list(APPEND mlas_platform_srcs ${obj_filename}) + target_sources(onnxruntime_mlas 
PRIVATE ${obj_filename}) endforeach() elseif(onnxruntime_target_platform STREQUAL "ARM") - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm/sgemmc.cpp + target_sources(onnxruntime_mlas PRIVATE + ${MLAS_SRC_DIR}/arm/sgemmc.cpp ) elseif(onnxruntime_target_platform STREQUAL "x64") - enable_language(ASM_MASM) file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/mlas/lib/intrinsics/avx/*.cpp" + "${MLAS_SRC_DIR}/intrinsics/avx/*.cpp" ) set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "/arch:AVX") file(GLOB_RECURSE mlas_platform_srcs_avx2 CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/mlas/lib/intrinsics/avx2/*.cpp" + "${MLAS_SRC_DIR}/intrinsics/avx2/*.cpp" ) set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "/arch:AVX2") - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/dgemm.cpp + target_sources(onnxruntime_mlas PRIVATE + ${MLAS_SRC_DIR}/dgemm.cpp ${mlas_platform_srcs_avx} ${mlas_platform_srcs_avx2} - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_avx2.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_sse.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_sse41.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/intrinsics/avx512/quantize_avx512f.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8X8KernelAvx2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8X8KernelAvx512Core.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Vnni.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvxVnni.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelSse2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelAvx.asm - 
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelFma3.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelAvx512F.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SgemmKernelSse2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SgemmKernelAvx.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SgemmKernelM1Avx.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SgemmKernelFma3.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SgemmKernelAvx512F.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SconvKernelSse2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SconvKernelAvx.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SconvKernelFma3.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SconvKernelAvx512F.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SpoolKernelSse2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SpoolKernelAvx.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SpoolKernelAvx512F.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/sgemma.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/cvtfp16a.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/SoftmaxKernelAvx.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/TransKernelFma3.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/TransKernelAvx512F.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/LogisticKernelFma3.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/TanhKernelFma3.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/ErfKernelFma3.asm + ${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp + ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp + ${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAvx2.asm + ${MLAS_SRC_DIR}/amd64/QgemmU8U8KernelAvx2.asm + ${MLAS_SRC_DIR}/amd64/QgemmU8X8KernelAvx2.asm + ${MLAS_SRC_DIR}/amd64/QgemmU8X8KernelAvx512Core.asm + ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx2.asm + ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx512Core.asm + ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx512Vnni.asm + ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvxVnni.asm + ${MLAS_SRC_DIR}/amd64/DgemmKernelSse2.asm + 
${MLAS_SRC_DIR}/amd64/DgemmKernelAvx.asm + ${MLAS_SRC_DIR}/amd64/DgemmKernelFma3.asm + ${MLAS_SRC_DIR}/amd64/DgemmKernelAvx512F.asm + ${MLAS_SRC_DIR}/amd64/SgemmKernelSse2.asm + ${MLAS_SRC_DIR}/amd64/SgemmKernelAvx.asm + ${MLAS_SRC_DIR}/amd64/SgemmKernelM1Avx.asm + ${MLAS_SRC_DIR}/amd64/SgemmKernelFma3.asm + ${MLAS_SRC_DIR}/amd64/SgemmKernelAvx512F.asm + ${MLAS_SRC_DIR}/amd64/SconvKernelSse2.asm + ${MLAS_SRC_DIR}/amd64/SconvKernelAvx.asm + ${MLAS_SRC_DIR}/amd64/SconvKernelFma3.asm + ${MLAS_SRC_DIR}/amd64/SconvKernelAvx512F.asm + ${MLAS_SRC_DIR}/amd64/SpoolKernelSse2.asm + ${MLAS_SRC_DIR}/amd64/SpoolKernelAvx.asm + ${MLAS_SRC_DIR}/amd64/SpoolKernelAvx512F.asm + ${MLAS_SRC_DIR}/amd64/sgemma.asm + ${MLAS_SRC_DIR}/amd64/cvtfp16a.asm + ${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx.asm + ${MLAS_SRC_DIR}/amd64/TransKernelFma3.asm + ${MLAS_SRC_DIR}/amd64/TransKernelAvx512F.asm + ${MLAS_SRC_DIR}/amd64/LogisticKernelFma3.asm + ${MLAS_SRC_DIR}/amd64/TanhKernelFma3.asm + ${MLAS_SRC_DIR}/amd64/ErfKernelFma3.asm ) else() - enable_language(ASM_MASM) - - set(CMAKE_ASM_MASM_FLAGS "${CMAKE_ASM_MASM_FLAGS} /safeseh") - - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_sse.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_sse41.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/i386/SgemmKernelSse2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/i386/SgemmKernelAvx.asm + target_sources(onnxruntime_mlas PRIVATE + ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp + ${MLAS_SRC_DIR}/i386/SgemmKernelSse2.asm + ${MLAS_SRC_DIR}/i386/SgemmKernelAvx.asm ) endif() -else() - if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64") - set(ARM64 TRUE) - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64e") - set(ARM64 TRUE) - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "arm") - set(ARM TRUE) - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") - set(X86_64 TRUE) - elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "i386") - set(X86 TRUE) - endif() - if (CMAKE_SYSTEM_NAME STREQUAL "Android") - if 
(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") - set(ARM TRUE) - elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "arm64-v8a") - set(ARM64 TRUE) - elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86_64") - set(X86_64 TRUE) - elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86") - set(X86 TRUE) - endif() - elseif(CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "iOSCross") - set(IOS TRUE) +endfunction() + +if (onnxruntime_BUILD_WEBASSEMBLY) + if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD) + file(GLOB_RECURSE mlas_platform_srcs + "${MLAS_SRC_DIR}/wasm_simd/*.cpp" + ) else() - execute_process( - COMMAND ${CMAKE_C_COMPILER} -dumpmachine - OUTPUT_VARIABLE dumpmachine_output - ERROR_QUIET + file(GLOB_RECURSE mlas_platform_srcs + "${MLAS_SRC_DIR}/wasm/*.cpp" ) - if(dumpmachine_output MATCHES "^arm64.*") - set(ARM64 TRUE) - elseif(dumpmachine_output MATCHES "^arm.*") - set(ARM TRUE) - elseif(dumpmachine_output MATCHES "^aarch64.*") - set(ARM64 TRUE) - elseif(dumpmachine_output MATCHES "^(powerpc.*|ppc.*)") - set(POWER TRUE) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") - set(X86 TRUE) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") - set(X86_64 TRUE) - endif() endif() + target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs}) +elseif(MSVC) + setup_mlas_source_for_windows() +else() + + if(APPLE) + get_target_property(ONNXRUNTIME_MLAS_OSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES) + + if(NOT ONNXRUNTIME_MLAS_OSX_ARCH) + set(ONNXRUNTIME_MLAS_OSX_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR}) + endif() + foreach(OSX_ARCH ${ONNXRUNTIME_MLAS_OSX_ARCH}) + if (OSX_ARCH STREQUAL "arm64") + set(ARM64 TRUE) + elseif (OSX_ARCH STREQUAL "arm64e") + set(ARM64 TRUE) + elseif (OSX_ARCH STREQUAL "arm") + set(ARM TRUE) + elseif (OSX_ARCH STREQUAL "x86_64") + set(X86_64 TRUE) + elseif (OSX_ARCH STREQUAL "i386") + set(X86 TRUE) + endif() + endforeach() + elseif(ANDROID) + if (CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") + set(ARM TRUE) + elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL 
"arm64-v8a") + set(ARM64 TRUE) + elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86_64") + set(X86_64 TRUE) + elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86") + set(X86 TRUE) + endif() + else() + #Linux/FreeBSD/PowerPC/... + #The value of CMAKE_SYSTEM_PROCESSOR should be from `uname -m` + #Example values: + #arm64v8/ubuntu -> aarch64 + #arm32v6/alpine -> armv7l + #arm32v7/centos -> armv7l + #ppc64le/debian -> ppc64le + #s390x/ubuntu -> s390x + #ppc64le/busybox -> ppc64le + #arm64v8/ubuntu -> aarch64 + #Android: armv7-a aarch64 i686 x86_64 + #chasun: I don't think anyone uses 'arm64' + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm64.*") + set(ARM64 TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm.*") + set(ARM TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*") + set(ARM64 TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc.*|ppc.*)") + set(POWER TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$") + set(X86 TRUE) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$") + set(X86_64 TRUE) + endif() + endif() - if(ARM) - enable_language(ASM) + if(APPLE) + get_target_property(ONNXRUNTIME_MLAS_MACOSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES) + endif() + list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH) + if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGH GREATER 1) + set(ONNXRUNTIME_MLAS_MULTI_ARCH TRUE) + endif() + #If ONNXRUNTIME_MLAS_MULTI_ARCH is true, we need to go through every if branch below + #and split MLAS to multiple static libraries. + #Otherwise, it works like if(...) elseif(...) elseif(...) 
endif() + set(MLAS_SOURCE_IS_NOT_SET 1) + if(ARM) + enable_language(ASM) - set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mfpu=neon") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") + set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mfpu=neon") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/aarch32/QgemmU8X8KernelNeon.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/arm/sgemmc.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_neon.cpp - ) - elseif(ARM64) - enable_language(ASM) - - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/aarch64/QgemmU8X8KernelNeon.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/aarch64/QgemmS8S8KernelNeon.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/aarch64/QgemmU8X8KernelUdot.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/aarch64/SgemmKernelNeon.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/aarch64/SgemvKernelNeon.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_neon.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_udot.cpp - ) - elseif(POWER) - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/power/SgemmKernelPower.cpp - ) - check_cxx_compiler_flag("-mcpu=power10" HAS_POWER10) - if(HAS_POWER10) - set(CMAKE_REQUIRED_FLAGS "-mcpu=power10") - check_cxx_source_compiles(" - #include - int main() { - __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); - return 0; - }" - COMPILES_P10 - ) - if(COMPILES_P10) - check_cxx_source_compiles(" - #include - int main() { - unsigned long hwcap2 = getauxval(AT_HWCAP2); - bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1)); - return 0; - }" - HAS_P10_RUNTIME + set(mlas_platform_srcs + ${MLAS_SRC_DIR}/aarch32/QgemmU8X8KernelNeon.S + ${MLAS_SRC_DIR}/arm/sgemmc.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp ) - if (HAS_P10_RUNTIME) - set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DPOWER10") + if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH) + set(MLAS_SOURCE_IS_NOT_SET 0) endif() - set(mlas_platform_srcs_power10 - 
${ONNXRUNTIME_ROOT}/core/mlas/lib/power/SgemmKernelPOWER10.cpp + endif() + if(ARM64 AND MLAS_SOURCE_IS_NOT_SET ) + enable_language(ASM) + set(mlas_platform_srcs + ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelNeon.S + ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelNeon.S + ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUdot.S + ${MLAS_SRC_DIR}/aarch64/SgemmKernelNeon.S + ${MLAS_SRC_DIR}/aarch64/SgemvKernelNeon.S + ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp ) - set_source_files_properties(${mlas_platform_srcs_power10} PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10") + if(ONNXRUNTIME_MLAS_MULTI_ARCH) + onnxruntime_add_static_library(onnxruntime_mlas_arm64 ${mlas_platform_srcs}) + set_target_properties(onnxruntime_mlas_arm64 PROPERTIES OSX_ARCHITECTURES "arm64") + list(APPEND ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas_arm64) + set(mlas_platform_srcs ) + else() + set(MLAS_SOURCE_IS_NOT_SET 0) + endif() + endif() + if(POWER AND MLAS_SOURCE_IS_NOT_SET) set(mlas_platform_srcs - ${mlas_platform_srcs} - ${mlas_platform_srcs_power10} + ${MLAS_SRC_DIR}/power/SgemmKernelPower.cpp ) - endif() + check_cxx_compiler_flag("-mcpu=power10" HAS_POWER10) + if(HAS_POWER10) + set(CMAKE_REQUIRED_FLAGS "-mcpu=power10") + check_cxx_source_compiles(" + #include + int main() { + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + return 0; + }" + COMPILES_P10 + ) + if(COMPILES_P10) + check_cxx_source_compiles(" + #include + int main() { + unsigned long hwcap2 = getauxval(AT_HWCAP2); + bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1)); + return 0; + }" + HAS_P10_RUNTIME + ) + if (HAS_P10_RUNTIME) + set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DPOWER10") + endif() + set(mlas_platform_srcs_power10 + ${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp + ) + set_source_files_properties(${mlas_platform_srcs_power10} PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10") + set(mlas_platform_srcs + ${mlas_platform_srcs} + 
${mlas_platform_srcs_power10} + ) + endif() endif() - elseif(X86) - enable_language(ASM) + if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH) + set(MLAS_SOURCE_IS_NOT_SET 0) + endif() + endif() + if(X86 AND MLAS_SOURCE_IS_NOT_SET) + enable_language(ASM) - set(mlas_platform_srcs_sse2 - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_sse.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelSse2.S - ) - set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2") + set(mlas_platform_srcs_sse2 + ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp + ${MLAS_SRC_DIR}/x86/SgemmKernelSse2.S + ) + set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2") - set(mlas_platform_srcs_avx - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelAvx.S - ) - set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx") + set(mlas_platform_srcs_avx + ${MLAS_SRC_DIR}/x86/SgemmKernelAvx.S + ) + set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx") - set(mlas_platform_srcs - ${mlas_platform_srcs_sse2} - ${mlas_platform_srcs_avx} - ) - elseif(X86_64) - enable_language(ASM) - - # Forward the flags for the minimum target platform version from the C - # compiler to the assembler. This works around CMakeASMCompiler.cmake.in - # not including the logic to set this flag for the assembler. - set(CMAKE_ASM${ASM_DIALECT}_OSX_DEPLOYMENT_TARGET_FLAG "${CMAKE_C_OSX_DEPLOYMENT_TARGET_FLAG}") - - # The LLVM assembler does not support the .arch directive to enable instruction - # set extensions and also doesn't support AVX-512F instructions without - # turning on support via command-line option. Group the sources by the - # instruction set extension and explicitly set the compiler flag as appropriate. 
- - set(mlas_platform_srcs_sse2 - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_sse.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/DgemmKernelSse2.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelSse2.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmTransposePackB16x4Sse2.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SconvKernelSse2.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SpoolKernelSse2.S - ) - set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2") - - set(mlas_platform_srcs_avx - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/DgemmKernelAvx.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelAvx.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelM1Avx.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelM1TransposeBAvx.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmTransposePackB16x4Avx.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SconvKernelAvx.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SpoolKernelAvx.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SoftmaxKernelAvx.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/intrinsics/avx/min_max_elements.cpp - ) - set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx") - - set(mlas_platform_srcs_avx2 - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx2.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx2.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx2.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvxVnni.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8X8KernelAvx2.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/DgemmKernelFma3.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelFma3.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SconvKernelFma3.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/TransKernelFma3.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/LogisticKernelFma3.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/TanhKernelFma3.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/ErfKernelFma3.S - 
${ONNXRUNTIME_ROOT}/core/mlas/lib/intrinsics/avx2/qladd_avx2.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/intrinsics/avx2/qdwconv_avx2.cpp - ) - set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma") - - set(mlas_platform_srcs_avx512f - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/DgemmKernelAvx512F.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelAvx512F.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SconvKernelAvx512F.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SpoolKernelAvx512F.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/TransKernelAvx512F.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/intrinsics/avx512/quantize_avx512f.cpp - ) - set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f") + set(mlas_platform_srcs + ${mlas_platform_srcs_sse2} + ${mlas_platform_srcs_avx} + ) + if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH) + set(MLAS_SOURCE_IS_NOT_SET 0) + endif() + endif() + if(X86_64 AND MLAS_SOURCE_IS_NOT_SET) + enable_language(ASM) - set(mlas_platform_srcs_avx512core - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Vnni.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8X8KernelAvx512Core.S - ) - set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl") + # Forward the flags for the minimum target platform version from the C + # compiler to the assembler. This works around CMakeASMCompiler.cmake.in + # not including the logic to set this flag for the assembler. 
+ set(CMAKE_ASM${ASM_DIALECT}_OSX_DEPLOYMENT_TARGET_FLAG "${CMAKE_C_OSX_DEPLOYMENT_TARGET_FLAG}") - set(mlas_platform_srcs - ${ONNXRUNTIME_ROOT}/core/mlas/lib/dgemm.cpp - ${ONNXRUNTIME_ROOT}/core/mlas/lib/qgemm_kernel_avx2.cpp - ${mlas_platform_srcs_sse2} - ${mlas_platform_srcs_avx} - ${mlas_platform_srcs_avx2} - ${mlas_platform_srcs_avx512f} - ${mlas_platform_srcs_avx512core} - ) - endif() + # The LLVM assembler does not support the .arch directive to enable instruction + # set extensions and also doesn't support AVX-512F instructions without + # turning on support via command-line option. Group the sources by the + # instruction set extension and explicitly set the compiler flag as appropriate. + + set(mlas_platform_srcs_sse2 + ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp + ${MLAS_SRC_DIR}/x86_64/DgemmKernelSse2.S + ${MLAS_SRC_DIR}/x86_64/SgemmKernelSse2.S + ${MLAS_SRC_DIR}/x86_64/SgemmTransposePackB16x4Sse2.S + ${MLAS_SRC_DIR}/x86_64/SconvKernelSse2.S + ${MLAS_SRC_DIR}/x86_64/SpoolKernelSse2.S + ) + set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2") + + set(mlas_platform_srcs_avx + ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx.S + ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx.S + ${MLAS_SRC_DIR}/x86_64/SgemmKernelM1Avx.S + ${MLAS_SRC_DIR}/x86_64/SgemmKernelM1TransposeBAvx.S + ${MLAS_SRC_DIR}/x86_64/SgemmTransposePackB16x4Avx.S + ${MLAS_SRC_DIR}/x86_64/SconvKernelAvx.S + ${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx.S + ${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx.S + ${MLAS_SRC_DIR}/intrinsics/avx/min_max_elements.cpp + ) + set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx") + + set(mlas_platform_srcs_avx2 + ${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAvx2.S + ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx2.S + ${MLAS_SRC_DIR}/x86_64/QgemmU8U8KernelAvx2.S + ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvxVnni.S + ${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx2.S + ${MLAS_SRC_DIR}/x86_64/DgemmKernelFma3.S + ${MLAS_SRC_DIR}/x86_64/SgemmKernelFma3.S 
+ ${MLAS_SRC_DIR}/x86_64/SconvKernelFma3.S + ${MLAS_SRC_DIR}/x86_64/TransKernelFma3.S + ${MLAS_SRC_DIR}/x86_64/LogisticKernelFma3.S + ${MLAS_SRC_DIR}/x86_64/TanhKernelFma3.S + ${MLAS_SRC_DIR}/x86_64/ErfKernelFma3.S + ${MLAS_SRC_DIR}/intrinsics/avx2/qladd_avx2.cpp + ${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp + ) + set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma") + + set(mlas_platform_srcs_avx512f + ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S + ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S + ${MLAS_SRC_DIR}/x86_64/SconvKernelAvx512F.S + ${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx512F.S + ${MLAS_SRC_DIR}/x86_64/TransKernelAvx512F.S + ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp + ) + set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f") + + set(mlas_platform_srcs_avx512core + ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Core.S + ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Vnni.S + ${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx512Core.S + ) + set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl") + + set(mlas_platform_srcs + ${MLAS_SRC_DIR}/dgemm.cpp + ${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp + ${mlas_platform_srcs_sse2} + ${mlas_platform_srcs_avx} + ${mlas_platform_srcs_avx2} + ${mlas_platform_srcs_avx512f} + ${mlas_platform_srcs_avx512core} + ) + + if(ONNXRUNTIME_MLAS_MULTI_ARCH) + onnxruntime_add_static_library(onnxruntime_mlas_x86_64 ${mlas_platform_srcs}) + set_target_properties(onnxruntime_mlas_x86_64 PROPERTIES OSX_ARCHITECTURES "x86_64") + list(APPEND ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas_x86_64) + set(mlas_platform_srcs ) + else() + set(MLAS_SOURCE_IS_NOT_SET 0) + endif() + + endif() + target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs}) endif() -onnxruntime_add_static_library(onnxruntime_mlas ${mlas_common_srcs} ${mlas_platform_srcs}) -target_include_directories(onnxruntime_mlas PRIVATE
${ONNXRUNTIME_ROOT}/core/mlas/inc ${ONNXRUNTIME_ROOT}/core/mlas/lib) +foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS}) + target_include_directories(${mlas_target} PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${MLAS_SRC_DIR}) +endforeach() set_target_properties(onnxruntime_mlas PROPERTIES FOLDER "ONNXRuntime") if (WIN32) target_compile_options(onnxruntime_mlas PRIVATE "/wd6385" "/wd4127") diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index cb41b8b86d967..f0ea0279b75b9 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -134,8 +134,8 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE onnxruntime_framework onnxruntime_util onnxruntime_graph + ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common - onnxruntime_mlas onnxruntime_flatbuffers ${pybind11_lib} ) @@ -384,6 +384,9 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy ${REPO_ROOT}/docs/Privacy.md $/onnxruntime/ + COMMAND ${CMAKE_COMMAND} -E copy + ${REPO_ROOT}/docs/python/README.rst + $/onnxruntime/ COMMAND ${CMAKE_COMMAND} -E copy ${REPO_ROOT}/LICENSE $/onnxruntime/ diff --git a/cmake/onnxruntime_training.cmake b/cmake/onnxruntime_training.cmake index acbc4818efd06..8bd7b219ec617 100644 --- a/cmake/onnxruntime_training.cmake +++ b/cmake/onnxruntime_training.cmake @@ -132,8 +132,8 @@ if (onnxruntime_BUILD_UNIT_TESTS) list(APPEND ONNXRUNTIME_LIBS onnxruntime_graph - onnxruntime_common - onnxruntime_mlas + ${ONNXRUNTIME_MLAS_LIBS} + onnxruntime_common onnxruntime_flatbuffers ) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index d5e64523f0e6b..7cd6aac8a3a59 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -410,8 +410,8 @@ set(onnxruntime_test_framework_libs onnxruntime_framework onnxruntime_util onnxruntime_graph + ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common - onnxruntime_mlas ) set(onnxruntime_test_server_libs @@ -504,8 +504,8 @@ set(ONNXRUNTIME_TEST_LIBS onnxruntime_framework 
onnxruntime_util onnxruntime_graph + ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common - onnxruntime_mlas onnxruntime_flatbuffers ) @@ -853,9 +853,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) file(GLOB_RECURSE MLAS_BENCH_SOURCE_FILES "${MLAS_BENCH_DIR}/*.cpp" "${MLAS_BENCH_DIR}/*.h") onnxruntime_add_executable(onnxruntime_mlas_benchmark ${MLAS_BENCH_SOURCE_FILES}) target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc) - target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util onnxruntime_framework onnxruntime_mlas onnxruntime_common ${CMAKE_DL_LIBS}) + target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util onnxruntime_framework ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS}) if(NOT WIN32) - target_link_libraries(onnxruntime_mlas_benchmark PRIVATE nsync_cpp) + target_link_libraries(onnxruntime_mlas_benchmark PRIVATE nsync_cpp ${CMAKE_DL_LIBS}) endif() set_target_properties(onnxruntime_mlas_benchmark PROPERTIES FOLDER "ONNXRuntimeTest") endif() @@ -896,8 +896,8 @@ if(onnxruntime_ENABLE_EAGER_MODE) onnxruntime_framework flatbuffers onnxruntime_graph + ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common - onnxruntime_mlas onnx onnx_proto ${PROTOBUF_LIB} @@ -1105,7 +1105,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) endif() target_include_directories(onnxruntime_mlas_test PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR}) - set(onnxruntime_mlas_test_libs GTest::gtest GTest::gmock onnxruntime_mlas onnxruntime_common) + set(onnxruntime_mlas_test_libs GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) if(NOT WIN32) list(APPEND onnxruntime_mlas_test_libs nsync_cpp ${CMAKE_DL_LIBS}) endif() From 73c67ca3f2d1503eb7bc0cfbe8ccebab494a14ce Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 2 Sep 2021 16:47:12 -0700 Subject: [PATCH 2/5] spaces --- cmake/CMakeLists.txt | 4 ++-- 
cmake/onnxruntime_training.cmake | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 35bfe9d91103e..29b43b47d3f9d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -894,8 +894,8 @@ if(CPUINFO_SUPPORTED) set(CPUINFO_BUILD_TOOLS OFF CACHE INTERNAL "") set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE INTERNAL "") set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE INTERNAL "") - set(CPUINFO_BUILD_BENCHMARKS OFF CACHE INTERNAL "") - + set(CPUINFO_BUILD_BENCHMARKS OFF CACHE INTERNAL "") + if (CMAKE_SYSTEM_NAME STREQUAL "iOS") set(IOS ON CACHE INTERNAL "") set(IOS_ARCH "${CMAKE_OSX_ARCHITECTURES}" CACHE INTERNAL "") diff --git a/cmake/onnxruntime_training.cmake b/cmake/onnxruntime_training.cmake index 8bd7b219ec617..56b67c2122796 100644 --- a/cmake/onnxruntime_training.cmake +++ b/cmake/onnxruntime_training.cmake @@ -133,7 +133,7 @@ if (onnxruntime_BUILD_UNIT_TESTS) list(APPEND ONNXRUNTIME_LIBS onnxruntime_graph ${ONNXRUNTIME_MLAS_LIBS} - onnxruntime_common + onnxruntime_common onnxruntime_flatbuffers ) From ab169dd9da7cb665236b19a566831589bfa3bcab Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 2 Sep 2021 17:14:38 -0700 Subject: [PATCH 3/5] revert --- cmake/onnxruntime_python.cmake | 3 --- 1 file changed, 3 deletions(-) diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index f0ea0279b75b9..ccdd86ca96f94 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -384,9 +384,6 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy ${REPO_ROOT}/docs/Privacy.md $/onnxruntime/ - COMMAND ${CMAKE_COMMAND} -E copy - ${REPO_ROOT}/docs/python/README.rst - $/onnxruntime/ COMMAND ${CMAKE_COMMAND} -E copy ${REPO_ROOT}/LICENSE $/onnxruntime/ From f139b018776cee402601c750efc3b07882a9cc0a Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 3 Sep 2021 19:07:27 -0700 Subject: [PATCH 4/5] update --- cmake/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff 
--git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 29b43b47d3f9d..de674c57111de 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -863,6 +863,10 @@ list(LENGTH CMAKE_OSX_ARCHITECTURES CMAKE_OSX_ARCHITECTURES_LEN) if(APPLE) if(CMAKE_OSX_ARCHITECTURES_LEN LESS_EQUAL 1) set(CPUINFO_SUPPORTED TRUE) + elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) + # We stitch multiple static libraries together when onnxruntime_BUILD_APPLE_FRAMEWORK is true, + # but that would not work for universal static libraries + message(FATAL_ERROR "universal binary is not supported for apple framework") endif() else() if(onnxruntime_BUILD_WEBASSEMBLY) From 4826a39e67b8e4a42f87be18c2e75b3710da1d16 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 3 Sep 2021 19:08:06 -0700 Subject: [PATCH 5/5] tab to space --- cmake/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index de674c57111de..7fc8f74c6908c 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -533,7 +533,7 @@ if (MSVC) elseif(onnxruntime_target_platform STREQUAL "Win32" OR onnxruntime_target_platform STREQUAL "x86" OR onnxruntime_target_platform STREQUAL "i386" OR onnxruntime_target_platform STREQUAL "i686") set(onnxruntime_target_platform "x86") if(NOT onnxruntime_BUILD_WEBASSEMBLY) - message("Enabling SAFESEH for x86 build") + message("Enabling SAFESEH for x86 build") set(CMAKE_ASM_MASM_FLAGS "${CMAKE_ASM_MASM_FLAGS} /safeseh") endif() endif() @@ -863,17 +863,17 @@ list(LENGTH CMAKE_OSX_ARCHITECTURES CMAKE_OSX_ARCHITECTURES_LEN) if(APPLE) if(CMAKE_OSX_ARCHITECTURES_LEN LESS_EQUAL 1) set(CPUINFO_SUPPORTED TRUE) - elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) + elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) # We stitch multiple static libraries together when onnxruntime_BUILD_APPLE_FRAMEWORK is true, - # but that would not work for universal static libraries - message(FATAL_ERROR "universal binary is not supported for apple 
framework") + # but that would not work for universal static libraries + message(FATAL_ERROR "universal binary is not supported for apple framework") endif() else() if(onnxruntime_BUILD_WEBASSEMBLY) set(CPUINFO_SUPPORTED FALSE) - else() + else() set(CPUINFO_SUPPORTED TRUE) - endif() + endif() if(WIN32) # Exclude Windows ARM build and Windows Store if(${onnxruntime_target_platform} MATCHES "^(ARM.*|arm.*)$" )