diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 4398ee1e67..f1e13b16b4 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -498,7 +498,8 @@ test_fbgemm_gpu_setup_and_pip_install () { ) elif [ "$variant_type" == "rocm" ]; then local variant_versions=( - 6.0.2 + 6.1.2 + 6.2.4 ) elif [ "$variant_type" == "cpu" ]; then local variant_versions=( diff --git a/.github/scripts/utils_pip.bash b/.github/scripts/utils_pip.bash index 2dbc92c400..98885cbd26 100644 --- a/.github/scripts/utils_pip.bash +++ b/.github/scripts/utils_pip.bash @@ -42,7 +42,7 @@ __export_package_variant_info () { local package_variant_type_version="$1" local FALLBACK_VERSION_CUDA="12.4.1" - local FALLBACK_VERSION_ROCM="6.0.2" + local FALLBACK_VERSION_ROCM="6.2.4" if [ "$package_variant_type_version" == "cuda" ]; then # If "cuda", default to latest CUDA @@ -205,7 +205,7 @@ install_from_pytorch_pip () { echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant, specific version from release channel" echo " ${FUNCNAME[0]} build_env torch release cpu # Install the CPU variant, latest version from release channel" echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.8.0 cuda/12.4.0 # Install the CUDA 12.4 variant, specific version from test channel" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/6.1 # Install the ROCM 6.1 variant, latest version from nightly channel" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/6.2 # Install the ROCM 6.2 variant, latest version from nightly channel" echo " ${FUNCNAME[0]} build_env pytorch_triton 1.11.0 # Install specific version from release channel" echo " ${FUNCNAME[0]} build_env pytorch_triton release # Install latest version from release channel" echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.8.0 # Install specific version from test channel" @@ -250,7 +250,7 @@ download_from_pytorch_pip () { echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant, specific version from release channel" echo " ${FUNCNAME[0]} build_env torch release cpu # Download the CPU variant, latest version from release channel" echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.8.0 cuda/12.4.0 # Download the CUDA 12.4 variant, specific version from test channel" - echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/6.1 # Download the ROCM 6.1 variant, latest version from nightly channel" + echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/6.2 # Download the ROCM 6.2 variant, latest version from nightly channel" return 1 else echo "################################################################################" diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash index a80cf7627b..f2ecba7b91 100644 --- a/.github/scripts/utils_pytorch.bash +++ b/.github/scripts/utils_pytorch.bash @@ -113,7 +113,7 @@ install_pytorch_pip () { echo " ${FUNCNAME[0]} build_env test/2.1.0 cpu # Install the CPU variant for a specific version" echo " ${FUNCNAME[0]} build_env release cpu # Install the CPU variant, latest release version" echo " ${FUNCNAME[0]} build_env test cuda/12.4.0 # Install the CUDA 12.4 variant, latest test version" - echo " ${FUNCNAME[0]} build_env nightly rocm/6.1 # Install the ROCM 6.1 variant, latest nightly version" + echo " ${FUNCNAME[0]} build_env nightly rocm/6.2 # Install the ROCM 6.2 variant, latest nightly version" return 1 else echo "################################################################################" diff --git a/.github/workflows/fbgemm_gpu_ci_rocm.yml b/.github/workflows/fbgemm_gpu_ci_rocm.yml index d30588e8f3..50aede6510 100644 --- a/.github/workflows/fbgemm_gpu_ci_rocm.yml +++ b/.github/workflows/fbgemm_gpu_ci_rocm.yml @@ -66,7 +66,7 @@ jobs: ] container-image: [ "ubuntu:22.04" ] python-version: [ "3.9", "3.10", "3.11", "3.12" ] - rocm-version: [ "6.1", "6.2" ] + rocm-version: [ "6.1.2", "6.2.4" ] compiler: [ "gcc", "clang" ] steps: @@ -147,7 +147,7 @@ jobs: ] # ROCm machines are limited, so we only test a subset of Python versions python-version: [ "3.12" ] - rocm-version: [ "6.2" ] + rocm-version: [ "6.2.4" ] compiler: [ "gcc", "clang" ] needs: build_artifact diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index e8e0fffb11..2e8a624da0 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -186,7 +186,7 @@ jobs: ] # ROCm machines are limited, so we only test a subset of Python versions python-version: [ "3.11", "3.12" ] - rocm-version: [ "6.2" ] + rocm-version: [ "6.1.2", "6.2.4" ] steps: - name: Setup Build Container diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake index ac9bf1228b..6afcfb9fc1 100644 --- a/cmake/modules/GpuCppLibrary.cmake +++ b/cmake/modules/GpuCppLibrary.cmake @@ -9,7 +9,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake) function(prepare_target_sources) # This function does the following: # 1. Take all the specified project sources for a target - # 1. Filter the files out based on CPU-only, CUDA, and HIP build modes + # 1. Filter files out based on CPU-only, CUDA, and HIP build modes # 1. Bucketize them into sets of CXX, CU, and HIP files # 1. Apply common source file properties for each bucket # 1. Merge the buckets back into a single list of sources @@ -36,7 +36,12 @@ function(prepare_target_sources) ############################################################################ # Add the CPU CXX sources - set(${args_PREFIX}_sources_cpp ${args_CPU_SRCS}) + LIST_FILTER( + INPUT ${args_CPU_SRCS} + OUTPUT cpu_sources_cpp + REGEX "^.+\.cpp$" + ) + set(${args_PREFIX}_sources_cpp ${cpu_sources_cpp}) # For GPU mode, add the CXX sources from GPU_SRCS if(NOT FBGEMM_CPU_ONLY) @@ -127,37 +132,6 @@ function(prepare_target_sources) set(${args_PREFIX}_sources ${${args_PREFIX}_sources_combined} PARENT_SCOPE) endfunction() -function(prepare_hipified_target_sources) - # This function does the following: - # 1. Take all the specified target sources - # 1. Look up their equivalent HIPified files if applicable (presumes that hipify() already been run) - # 1. Apply source file properties - # 1. Update the HIP include directories - - set(flags) - set(singleValueArgs PREFIX) - set(multiValueArgs SRCS INCLUDE_DIRS) - - cmake_parse_arguments( - args - "${flags}" "${singleValueArgs}" "${multiValueArgs}" - ${ARGN}) - - get_hipified_list("${args_SRCS}" args_SRCS) - - set_source_files_properties(${args_SRCS} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - # Add include directories - hip_include_directories("${args_INCLUDE_DIRS}") - - ############################################################################ - # Set the Output Variable(s) - ############################################################################ - - set(${args_PREFIX}_sources_hipified ${args_SRCS} PARENT_SCOPE) -endfunction() - function(gpu_cpp_library) # This function does the following: # 1. Take all the target sources and select relevant sources based on build type (CPU-only, CUDA, HIP) @@ -174,6 +148,7 @@ function(gpu_cpp_library) GPU_SRCS # Sources common to both CUDA and HIP builds. .CU files specified here will be HIPified when building a HIP target CUDA_SPECIFIC_SRCS # Sources available only for CUDA build HIP_SPECIFIC_SRCS # Sources available only for HIP build + OTHER_SRCS # Sources from third-party libraries GPU_FLAGS # Compile flags for GPU builds INCLUDE_DIRS # Include directories for compilation ) @@ -204,12 +179,16 @@ function(gpu_cpp_library) set(lib_name ${args_PREFIX}_py) if(USE_ROCM) - # Fetch the HIPified sources - prepare_hipified_target_sources( - PREFIX ${args_PREFIX} - SRCS ${lib_sources} - INCLUDE_DIRS ${args_INCLUDE_DIRS}) - set(lib_sources_hipified ${${args_PREFIX}_sources_hipified}) + # Fetch the equivalent HIPified sources if available. + # This presumes that hipify() has already been run. + get_hipified_list("${lib_sources}" lib_sources_hipified) + + # Set properties for the HIPified sources + set_source_files_properties(${lib_sources_hipified} + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # Set the include directories for HIP + hip_include_directories("${args_INCLUDE_DIRS}") # Create the HIP library hip_add_library(${lib_name} SHARED @@ -223,7 +202,8 @@ function(gpu_cpp_library) target_include_directories(${lib_name} PUBLIC ${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} - ${ROCM_SMI_INCLUDE}) + ${ROCM_SMI_INCLUDE} + ${args_INCLUDE_DIRS}) else() # Create the C++/CUDA library @@ -296,6 +276,9 @@ function(gpu_cpp_library) "HIP_SPECIFIC_SRCS" "${args_HIP_SPECIFIC_SRCS}" " " + "OTHER_SRCS:" + "${args_OTHER_SRCS}" + " " "GPU_FLAGS:" "${args_GPU_FLAGS}" " " diff --git a/cmake/modules/Utilities.cmake b/cmake/modules/Utilities.cmake index 193a5648be..c557b3a7e1 100644 --- a/cmake/modules/Utilities.cmake +++ b/cmake/modules/Utilities.cmake @@ -40,6 +40,26 @@ function(LIST_FILTER) set(${args_OUTPUT} ${${args_OUTPUT}} PARENT_SCOPE) endfunction() + +function(prepend_filepaths) + set(flags) + set(singleValueArgs PREFIX OUTPUT) + set(multiValueArgs INPUT) + + cmake_parse_arguments( + args + "${flags}" "${singleValueArgs}" "${multiValueArgs}" + ${ARGN}) + + set(${args_OUTPUT}) + + foreach(filepath ${args_INPUT}) + list(APPEND ${args_OUTPUT} "${args_PREFIX}/${filepath}") + endforeach() + + set(${args_OUTPUT} ${${args_OUTPUT}} PARENT_SCOPE) +endfunction() + function(add_to_package) set(flags) set(singleValueArgs DESTINATION) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 8db9bf6a5c..77a57a9641 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -108,6 +108,68 @@ set(fbgemm_sources_include_directories ${NCCL_INCLUDE_DIRS}) +################################################################################ +# TBE Code Generation +################################################################################ + +set(CMAKE_CODEGEN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/codegen) + +macro(RUN_GEN_SCRIPT SCRIPT) + if(USE_ROCM) + set(rocm_flag --is_rocm) + endif() + + BLOCK_PRINT( + "Running code generation script ..." + "${PYTHON_EXECUTABLE} ${SCRIPT} --opensource ${rocm_flag}" + ) + + execute_process( + COMMAND "${PYTHON_EXECUTABLE}" ${SCRIPT} "--opensource" ${rocm_flag}) +endmacro() + +foreach(script + "${CMAKE_CODEGEN_DIR}/genscript/generate_backward_split.py" + "${CMAKE_CODEGEN_DIR}/genscript/generate_embedding_optimizer.py" + "${CMAKE_CODEGEN_DIR}/genscript/generate_forward_quantized.py" + "${CMAKE_CODEGEN_DIR}/genscript/generate_forward_split.py" + "${CMAKE_CODEGEN_DIR}/genscript/generate_index_select.py") + RUN_GEN_SCRIPT(${script}) +endforeach() + + +# ################################################################################ +# HIP Code Generation +# ################################################################################ + +if(USE_ROCM) + set(include_dirs_for_hipification + # All directories need to be included for headers to be properly HIPified + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/experimental/gen_ai) + + # HIPify all .CU and .CUH sources under the current directory (`/fbgemm_gpu`) + # .H sources are not automatically HIPified, so they need #ifdef USE_ROCM guards + hipify( + CUDA_SOURCE_DIR + ${PROJECT_SOURCE_DIR} + HEADER_INCLUDE_DIR + ${include_dirs_for_hipification}) + + BLOCK_PRINT( + "HIPify Sources" + " " + "CUDA_SOURCE_DIR:" + "${PROJECT_SOURCE_DIR}" + " " + "HEADER_INCLUDE_DIR:" + "${include_dirs_for_hipification}" + ) +endif() + + ################################################################################ # Build FBGEMM_GPU (Main) Module ################################################################################ @@ -131,6 +193,7 @@ if(NOT FBGEMM_CPU_ONLY) endif() if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) - # TODO: Re-enable gen_ai for ROCm after enabling build support for ROCm 6.2 + # TODO: Re-enable gen_ai for ROCm once ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_ab_scale.hpp + # lands into latest ROCm add_subdirectory(experimental/gen_ai) endif() diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake index acda2aa675..74172ae3da 100644 --- a/fbgemm_gpu/FbgemmGpu.cmake +++ b/fbgemm_gpu/FbgemmGpu.cmake @@ -10,8 +10,6 @@ include(${CMAKEMODULES}/Utilities.cmake) -set(CMAKE_CODEGEN_DIR ${CMAKE_CURRENT_SOURCE_DIR}/codegen) - ################################################################################ # Third Party Sources @@ -85,35 +83,6 @@ set(WEIGHT_OPTIONS unweighted) -################################################################################ -# TBE Code Generation -################################################################################ - -macro(RUN_GEN_SCRIPT SCRIPT) - set(rocm_flag "") - if(USE_ROCM) - set(rocm_flag --is_rocm) - endif() - - BLOCK_PRINT( - "Running code generation script ..." - "${PYTHON_EXECUTABLE} ${SCRIPT} --opensource ${rocm_flag}" - ) - - execute_process( - COMMAND "${PYTHON_EXECUTABLE}" ${SCRIPT} "--opensource" ${rocm_flag}) -endmacro() - -foreach(script - "${CMAKE_CODEGEN_DIR}/genscript/generate_backward_split.py" - "${CMAKE_CODEGEN_DIR}/genscript/generate_embedding_optimizer.py" - "${CMAKE_CODEGEN_DIR}/genscript/generate_forward_quantized.py" - "${CMAKE_CODEGEN_DIR}/genscript/generate_forward_split.py" - "${CMAKE_CODEGEN_DIR}/genscript/generate_index_select.py") - RUN_GEN_SCRIPT(${script}) -endforeach() - - ################################################################################ # Optimizer Groups ################################################################################ @@ -443,7 +412,7 @@ set_source_files_properties(${fbgemm_sources} # FBGEMM_GPU Static Sources ################################################################################ -set(fbgemm_gpu_sources_static_cpu +set(fbgemm_gpu_sources_cpu_static codegen/training/forward/embedding_forward_split_cpu.cpp codegen/inference/embedding_forward_quantized_host_cpu.cpp codegen/training/backward/embedding_backward_dense_host_cpu.cpp @@ -479,7 +448,7 @@ set(fbgemm_gpu_sources_static_cpu codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp) if(NOT FBGEMM_CPU_ONLY) - list(APPEND fbgemm_gpu_sources_static_cpu + list(APPEND fbgemm_gpu_sources_cpu_static codegen/inference/embedding_forward_quantized_host.cpp codegen/utils/embedding_bounds_check_host.cpp src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp @@ -489,7 +458,6 @@ if(NOT FBGEMM_CPU_ONLY) src/quantize_ops/quantize_ops_gpu.cpp src/sparse_ops/sparse_ops_gpu.cpp src/split_embeddings_utils/split_embeddings_utils.cpp - src/split_embeddings_cache/split_embeddings_cache_ops.cu src/metric_ops/metric_ops_host.cpp src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp src/input_combine_ops/input_combine_gpu.cpp @@ -497,7 +465,7 @@ if(NOT FBGEMM_CPU_ONLY) if(NVML_LIB_PATH OR USE_ROCM) message(STATUS "Adding merge_pooled_embeddings sources") - list(APPEND fbgemm_gpu_sources_static_cpu + list(APPEND fbgemm_gpu_sources_cpu_static src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp src/topology_utils.cpp) else() @@ -505,18 +473,8 @@ if(NOT FBGEMM_CPU_ONLY) endif() endif() -if(CXX_AVX2_FOUND) - set_source_files_properties(${fbgemm_gpu_sources_static_cpu} - PROPERTIES COMPILE_OPTIONS - "${AVX2_FLAGS}") -else() - set_source_files_properties(${fbgemm_gpu_sources_static_cpu} - PROPERTIES COMPILE_OPTIONS - "-fopenmp") -endif() - if(NOT FBGEMM_CPU_ONLY) - set(fbgemm_gpu_sources_static_gpu + set(fbgemm_gpu_sources_gpu_static codegen/utils/embedding_bounds_check_v1.cu codegen/utils/embedding_bounds_check_v2.cu codegen/inference/embedding_forward_quantized_split_lookup.cu @@ -585,31 +543,11 @@ if(NOT FBGEMM_CPU_ONLY) src/split_embeddings_cache/lxu_cache.cu src/split_embeddings_cache/linearize_cache_indices.cu src/split_embeddings_cache/reset_weight_momentum.cu + src/split_embeddings_cache/split_embeddings_cache_ops.cu src/split_embeddings_utils/generate_vbe_metadata.cu src/split_embeddings_utils/get_infos_metadata.cu src/split_embeddings_utils/radix_sort_pairs.cu src/split_embeddings_utils/transpose_embedding_input.cu) - - set_source_files_properties(${fbgemm_gpu_sources_static_gpu} - PROPERTIES COMPILE_OPTIONS - "${TORCH_CUDA_OPTIONS}") - - set_source_files_properties(${fbgemm_gpu_sources_static_gpu} - PROPERTIES INCLUDE_DIRECTORIES - "${fbgemm_sources_include_directories}") -endif() - -set_source_files_properties(${fbgemm_gpu_sources_static_cpu} - PROPERTIES INCLUDE_DIRECTORIES - "${fbgemm_sources_include_directories}") - -if(NOT FBGEMM_CPU_ONLY) - set(fbgemm_gpu_sources_static - ${fbgemm_gpu_sources_static_gpu} - ${fbgemm_gpu_sources_static_cpu}) -else() - set(fbgemm_gpu_sources_static - ${fbgemm_gpu_sources_static_cpu}) endif() @@ -617,115 +555,47 @@ endif() # FBGEMM_GPU HIP Code Generation ################################################################################ -if(USE_ROCM) - # HIPify CUDA code - set(header_include_dir - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${CMAKE_CURRENT_SOURCE_DIR}) - - hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR} - HEADER_INCLUDE_DIR ${header_include_dir}) - - # Get the absolute paths of all generated sources - set(fbgemm_gpu_sources_gen_abs) - foreach(source_gen_filename ${fbgemm_gpu_sources_gen}) - list(APPEND fbgemm_gpu_sources_gen_abs - "${CMAKE_BINARY_DIR}/${source_gen_filename}") - endforeach() +set(fbgemm_gpu_sources_cpu_gen + ${gen_cpu_source_files}) - # HIPify FBGEMM, FBGEMM_GPU static, and FBGEMM_GPU generated sources - get_hipified_list("${fbgemm_gpu_sources_static}" fbgemm_gpu_sources_static) - get_hipified_list("${fbgemm_gpu_sources_gen_abs}" fbgemm_gpu_sources_gen_abs) - get_hipified_list("${fbgemm_sources}" fbgemm_sources) - - # Combine all HIPified sources - set(fbgemm_gpu_sources_hip - ${fbgemm_sources} - ${fbgemm_gpu_sources_static} - ${fbgemm_gpu_sources_gen_abs}) +set(fbgemm_gpu_sources_gpu_gen + ${gen_gpu_kernel_source_files} + ${gen_gpu_host_source_files} + ${gen_defused_optim_source_files}) - set_source_files_properties(${fbgemm_gpu_sources_hip} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - # Add FBGEMM include/ - hip_include_directories("${fbgemm_sources_include_directories}") +if(USE_ROCM) + prepend_filepaths( + PREFIX ${CMAKE_BINARY_DIR} + INPUT ${fbgemm_gpu_sources_cpu_gen} + OUTPUT fbgemm_gpu_sources_cpu_gen) + + prepend_filepaths( + PREFIX ${CMAKE_BINARY_DIR} + INPUT ${fbgemm_gpu_sources_gpu_gen} + OUTPUT fbgemm_gpu_sources_gpu_gen) endif() ################################################################################ -# FBGEMM_GPU Full Python Module +# FBGEMM_GPU C++ Modules ################################################################################ -if(USE_ROCM) - # Create a HIP library if using ROCm - hip_add_library(fbgemm_gpu_py SHARED - ${asmjit_sources} - ${fbgemm_gpu_sources_hip} - ${FBGEMM_HIP_HCC_LIBRARIES} - HIPCC_OPTIONS - ${HIP_HCC_FLAGS}) - - target_include_directories(fbgemm_gpu_py PUBLIC - ${FBGEMM_HIP_INCLUDE} - ${ROCRAND_INCLUDE} - ${ROCM_SMI_INCLUDE}) - - list(GET TORCH_INCLUDE_DIRS 0 TORCH_PATH) - -else() - # Else create a CUDA library - add_library(fbgemm_gpu_py MODULE +gpu_cpp_library( + PREFIX + fbgemm_gpu + INCLUDE_DIRS + ${fbgemm_sources_include_directories} + CPU_SRCS + ${fbgemm_gpu_sources_cpu_static} + ${fbgemm_gpu_sources_cpu_gen} + GPU_SRCS + ${fbgemm_gpu_sources_gpu_static} + ${fbgemm_gpu_sources_gpu_gen} + OTHER_SRCS ${asmjit_sources} ${fbgemm_sources} - ${fbgemm_gpu_sources_static} - ${fbgemm_gpu_sources_gen}) -endif() - -# Add PyTorch include/ -target_include_directories(fbgemm_gpu_py PRIVATE - ${TORCH_INCLUDE_DIRS} - ${NCCL_INCLUDE_DIRS}) - -# Remove `lib` from the output artifact name `libfbgemm_gpu_py.so` -set_target_properties(fbgemm_gpu_py PROPERTIES PREFIX "") - -# Link to PyTorch -target_link_libraries(fbgemm_gpu_py - ${TORCH_LIBRARIES} - ${NCCL_LIBRARIES} - ${CUDA_DRIVER_LIBRARIES}) - -# Link to NVML -if(NVML_LIB_PATH) - target_link_libraries(fbgemm_gpu_py ${NVML_LIB_PATH}) -endif() - -# Silence warnings in asmjit -target_compile_options(fbgemm_gpu_py PRIVATE - -Wno-deprecated-anon-enum-enum-conversion) -target_compile_options(fbgemm_gpu_py PRIVATE - -Wno-deprecated-declarations) - - -################################################################################ -# FBGEMM_GPU Install -################################################################################ - -install(TARGETS fbgemm_gpu_py - DESTINATION fbgemm_gpu) - -install(FILES ${gen_python_source_files} - DESTINATION fbgemm_gpu/split_embedding_codegen_lookup_invokers) - -install(FILES ${gen_defused_optim_py_files} - DESTINATION fbgemm_gpu/split_embedding_optimizer_codegen) - -add_custom_target(fbgemm_gpu_py_clean_rpath ALL - WORKING_DIRECTORY ${OUTPUT_DIR} - COMMAND bash ${FBGEMM}/.github/scripts/fbgemm_gpu_postbuild.bash) - -add_dependencies(fbgemm_gpu_py_clean_rpath fbgemm_gpu_py) + GPU_FLAGS + ${TORCH_CUDA_OPTIONS}) # TODO: Test target, need to properly integrate into FBGEMM_GPU main build gpu_cpp_library( @@ -740,3 +610,17 @@ gpu_cpp_library( src/embedding_inplace_ops/embedding_inplace_update.cu GPU_FLAGS ${TORCH_CUDA_OPTIONS}) + + +################################################################################ +# FBGEMM_GPU Package +################################################################################ + +install(TARGETS fbgemm_gpu_py + DESTINATION fbgemm_gpu) + +install(FILES ${gen_python_source_files} + DESTINATION fbgemm_gpu/split_embedding_codegen_lookup_invokers) + +install(FILES ${gen_defused_optim_py_files} + DESTINATION fbgemm_gpu/split_embedding_optimizer_codegen) diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt index 2b8ac59dca..3e402ae2e3 100644 --- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt +++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt @@ -46,23 +46,6 @@ file(GLOB_RECURSE experimental_gen_ai_python_source_files *.py) -################################################################################ -# FBGEMM_GPU HIP Code Generation -################################################################################ - -if(USE_ROCM) - # HIPify CUDA code - set(header_include_dir - ${CMAKE_CURRENT_SOURCE_DIR}/../../include - ${CMAKE_CURRENT_SOURCE_DIR}/../../src - ${CMAKE_CURRENT_SOURCE_DIR}/../.. - ${CMAKE_CURRENT_SOURCE_DIR}) - - hipify(CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR} - HEADER_INCLUDE_DIR ${header_include_dir}) -endif() - - ################################################################################ # Build Shared Library ################################################################################ diff --git a/fbgemm_gpu/test/tbe/common.py b/fbgemm_gpu/test/tbe/common.py index 40f1b49e7e..f929596b75 100644 --- a/fbgemm_gpu/test/tbe/common.py +++ b/fbgemm_gpu/test/tbe/common.py @@ -19,12 +19,13 @@ if open_source: # pyre-ignore[21] - from test_utils import gpu_unavailable, running_on_github + from test_utils import gpu_unavailable, running_on_github, TEST_WITH_ROCM else: torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:cumem_utils") from fbgemm_gpu.test.test_utils import ( # noqa F401 gpu_unavailable, running_on_github, + TEST_WITH_ROCM, ) diff --git a/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py b/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py index c615fa23ea..11776e7ca1 100644 --- a/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py +++ b/fbgemm_gpu/test/tbe/inference/nbit_forward_autovec_test.py @@ -22,13 +22,17 @@ ) from hypothesis import given, settings, Verbosity -from ..common import MAX_EXAMPLES +from ..common import MAX_EXAMPLES, TEST_WITH_ROCM from .common import get_nbit_weights_ty, NBitFowardTestCommon VERBOSITY: Verbosity = Verbosity.verbose class NBitFowardAutovecTest(NBitFowardTestCommon): + @unittest.skipIf( + TEST_WITH_ROCM, + "Test appears to be unreliable on ROCm", + ) @given( nbit_weights_ty=get_nbit_weights_ty(), pooling_mode=st.sampled_from( diff --git a/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py b/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py index 90db49c93a..47ac7e429a 100644 --- a/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py +++ b/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py @@ -686,6 +686,10 @@ def test_nbit_forward_cpu_seq_int8( equal_nan=False, ) + @unittest.skipIf( + TEST_WITH_ROCM, + "Test appears to be unreliable on ROCm", + ) @given( D=st.sampled_from([32, 256, 384, 512, 1024]), B=st.integers(min_value=8, max_value=32),