{ai}[foss/2023b] PyTorch v2.2.1 #22361

Open · wants to merge 3 commits into base: develop
152 changes: 152 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.2.1-foss-2023b.eb
@@ -0,0 +1,152 @@
name = 'PyTorch'
version = '2.2.1'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2023b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
'PyTorch-1.13.1_fix-protobuf-dependency.patch',
'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
'PyTorch-2.0.1_fix-skip-decorators.patch',
'PyTorch-2.0.1_fix-vsx-loadu.patch',
'PyTorch-2.0.1_skip-failing-gradtest.patch',
'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
'PyTorch-2.1.0_disable-gcc12-warning.patch',
'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
'PyTorch-2.1.2_fix-vsx-vector-div.patch',
'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
'PyTorch-2.2.1_no-cuda-stubs-rpath.patch',
'PyTorch-2.2.1_fix-test_extension_backend-without-vectorization.patch',
]
checksums = [
{'pytorch-v2.2.1.tar.gz': '8069467387b8ab7a7279671b9144d80a5c5342b4fa022eb3c1db629a6fd806c9'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
'5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
{'PyTorch-1.13.1_fix-protobuf-dependency.patch':
'8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
{'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
{'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
{'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
'02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
{'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
{'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
{'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
{'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
'7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
{'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
'166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
{'PyTorch-2.1.0_disable-gcc12-warning.patch': 'c858b8db0010f41005dc06f9a50768d0d3dc2d2d499ccbdd5faf8a518869a421'},
{'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
'3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
{'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
{'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
'35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
{'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
{'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
'5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
{'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
{'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
{'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
'7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
{'PyTorch-2.2.1_no-cuda-stubs-rpath.patch': '713f98b45f33be955ff581fc14d16cd843d8b48190d3fdffa02afcdfd3583100'},
{'PyTorch-2.2.1_fix-test_extension_backend-without-vectorization.patch':
'8d8c72d68c8391ddec5133fcabbb8653fef890acb9eece8ff1ddc43f128f2450'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.27.6'),
('hypothesis', '6.90.0'),
# For tests
('pytest-flakefinder', '1.1.0'),
('pytest-rerunfailures', '14.0'),
('pytest-shard', '0.1.2'),
('unittest-xml-reporting', '3.1.0'),
]

dependencies = [
('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions
('Python', '3.11.5'),
('Python-bundle-PyPI', '2023.10'),
('protobuf', '25.3'),
('protobuf-python', '4.25.3'),
('pybind11', '2.11.1'),
('SciPy-bundle', '2023.11'),
('PyYAML', '6.0.1'),
('MPFR', '4.2.1'),
('GMP', '6.3.0'),
('numactl', '2.0.16'),
('FFmpeg', '6.0'),
('Pillow', '10.2.0'),
('expecttest', '0.2.1'),
('networkx', '3.2.1'),
('sympy', '1.12'),
    ('Z3', '4.13.0'),
]

use_pip = True
buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step

excluded_tests = {
'': [
        # This test seems to take too long, at least on NVIDIA Ampere.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# no xdoctest
'doctests',
        # failing on Broadwell
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'test_native_mha',
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# test_quantization in particular has a few corner cases that are triggered by the random input values;
# those cannot easily be avoided, see https://github.com/pytorch/pytorch/issues/107030
# So allow a low number of tests to fail, as the tests "usually" succeed
max_failed_tests = 2

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
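For context on the `tests` entry above: PyTorch-check-cpp-extension.py is not part of this diff, but a sanity check of this kind typically JIT-compiles a trivial inline C++ extension, which is also why Ninja is listed as a dependency. A minimal sketch of such a check (hypothetical names, not the contents of the bundled script):

import torch
from torch.utils.cpp_extension import load_inline

# JIT-compile a trivial inline extension; load_inline adds the torch/extension.h
# include and the pybind11 bindings for the listed functions automatically.
cpp_source = """
torch::Tensor add_one(torch::Tensor x) {
    return x + 1;
}
"""

ext = load_inline(name="eb_check_ext", cpp_sources=cpp_source, functions=["add_one"])
assert torch.equal(ext.add_one(torch.zeros(3)), torch.ones(3))
print("C++ extension JIT compilation works")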
39 changes: 39 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.2.1_fix-test_extension_backend-without-vectorization.patch
@@ -0,0 +1,39 @@
The test checks for the substring "loadu" in the generated code.
On AVX systems that line is:
> auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(i0))
however on non-AVX systems it is
> auto tmp0 = in_ptr0[static_cast<long>(i0)];

The difference depends on `codecache.valid_vec_isa_list()` being non-empty.
See torch/_inductor/codegen/cpp.py:2639

Modify the test to account for that.

Author: Alexander Grund (TU Dresden)

diff --git a/test/inductor/test_extension_backend.py b/test/inductor/test_extension_backend.py
index 7d6f35d7b74..decc61d62d7 100644
--- a/test/inductor/test_extension_backend.py
+++ b/test/inductor/test_extension_backend.py
@@ -20,7 +20,7 @@ except ImportError:
)

from torch._C import FileCheck
-from torch._inductor import metrics
+from torch._inductor import codecache, metrics
from torch._inductor.codegen.common import (
get_scheduling_for_device,
get_wrapper_codegen_for_device,
@@ -130,7 +130,11 @@ class ExtensionBackendTests(TestCase):
metrics.reset()
opt_fn = torch.compile()(fn)
_, code = run_and_get_cpp_code(opt_fn, x, y, z)
- FileCheck().check("void kernel").check("loadu").check("extension_device").run(
+ if codecache.valid_vec_isa_list():
+ load_expr = 'loadu'
+ else:
+ load_expr = ' = in_ptr0[static_cast<long>(i0)];'
+ FileCheck().check("void kernel").check(load_expr).check("extension_device").run(
code
)
opt_fn(x, y, z)
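For reference, the ISA detection the patched test keys on can be queried directly. A minimal sketch, assuming PyTorch 2.2.x, where the helper lives in torch._inductor.codecache as imported in the diff above:

from torch._inductor import codecache

# The same check the patched test uses to choose between the vectorized
# "loadu" pattern and the scalar in_ptr0[...] load.
isas = codecache.valid_vec_isa_list()
if isas:
    print("Vectorized codegen available:", [str(isa) for isa in isas])
else:
    print("No supported vector ISA detected; inductor emits scalar loads")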
145 changes: 145 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.2.1_no-cuda-stubs-rpath.patch
@@ -0,0 +1,145 @@
# PyTorch's CMake configuration by default sets RUNPATH on libraries if they link other libraries
# that are outside the build tree, which is done because of the CMake config at
# https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L10.
# This causes problems, since the CUDA stubs library path then also gets added to the RUNPATH.
# As a result, at runtime, the stub version of e.g. libcuda.so.1 gets picked up instead of the real driver library.
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/14359
# The line at https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L16
# makes sure that any linked path is also added to the RUNPATH.
# This has been reported upstream in https://github.com/pytorch/pytorch/issues/35418
# and a fix was attempted in https://github.com/pytorch/pytorch/pull/37737, but it was reverted.
#
# This EasyBuild patch changes the behavior for the libraries that were failing, i.e. the ones in this list:
# https://github.com/easybuilders/easybuild-easyconfigs/issues/14359#issuecomment-970479904
# This is done by setting INSTALL_RPATH_USE_LINK_PATH to false and instead specifying the RPATH
# explicitly by defining INSTALL_RPATH, only adding directories that do not match the "stubs" regex.
#
# Original patch: Caspar van Leeuwen
# Updated: Alexander Grund (TU Dresden)
#
# See https://github.com/pytorch/pytorch/pull/87593
diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt
index 15f47bf52ae..edf1ab26149 100644
--- a/binaries/CMakeLists.txt
+++ b/binaries/CMakeLists.txt
@@ -56,7 +56,8 @@ endif()

if(USE_CUDA)
caffe2_binary_target("inspect_gpu.cc")
- target_link_libraries(inspect_gpu ${CUDA_LIBRARIES})
+ include(${Torch_SOURCE_DIR}/cmake/LinkCudaLibraries.cmake)
+ link_cuda_libraries(inspect_gpu ${CUDA_LIBRARIES})
caffe2_binary_target("print_core_object_sizes_gpu.cc")

if(BUILD_TEST)
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 748363725bc..a0b75597b34 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -624,14 +624,13 @@ endif()
if(USE_CUDA)
list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
+ include(${Torch_SOURCE_DIR}/cmake/LinkCudaLibraries.cmake)
+ link_cuda_libraries(caffe2_nvrtc ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB})
if(MSVC)
# Delay load nvcuda.dll so we can import torch compiled with cuda on a CPU-only machine
- set(DELAY_LOAD_FLAGS "-DELAYLOAD:nvcuda.dll;delayimp.lib")
- else()
- set(DELAY_LOAD_FLAGS "")
+ target_link_libraries(caffe2_nvrtc "-DELAYLOAD:nvcuda.dll;delayimp.lib")
endif()

- target_link_libraries(caffe2_nvrtc ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS})
install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}")
if(USE_NCCL)
list(APPEND Caffe2_GPU_SRCS
@@ -1541,6 +1540,7 @@ endif()

# ---[ CUDA library.
if(USE_CUDA)
+ include(${Torch_SOURCE_DIR}/cmake/LinkCudaLibraries.cmake)
# FIXME: If kineto is linked with CUPTI it pollutes torch_cpu with CUDA dependencies
# Even worse, it never declares that it depends on cudart, but calls the API, see
# https://github.com/pytorch/kineto/blob/aef2f5c0f15e3be52406ac0b885e8689de6bc9f6/libkineto/src/CudaDeviceProperties.cpp#L24
@@ -1554,13 +1554,13 @@ if(USE_CUDA)
torch_cuda INTERFACE $<INSTALL_INTERFACE:include>)
target_include_directories(
torch_cuda PRIVATE ${Caffe2_GPU_INCLUDE})
- target_link_libraries(
+ link_cuda_libraries(
torch_cuda PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})

# These public dependencies must go after the previous dependencies, as the
# order of the libraries in the linker call matters here when statically
# linking; libculibos and cublas must be last.
- target_link_libraries(torch_cuda PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
+ link_cuda_libraries(torch_cuda PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
endif()

# ---[ Metal(OSX) modification
diff --git a/cmake/LinkCudaLibraries.cmake b/cmake/LinkCudaLibraries.cmake
new file mode 100644
index 00000000000..e09d1186f6d
--- /dev/null
+++ b/cmake/LinkCudaLibraries.cmake
@@ -0,0 +1,33 @@
+# Link CUDA libraries to the given target, i.e.: `target_link_libraries(target <args>)`
+#
+# Additionally makes sure CUDA stub libs don't end up being in RPath
+#
+# Example: link_cuda_libraries(mytarget PRIVATE ${CUDA_LIBRARIES})
+function(link_cuda_libraries target)
+ set(libs ${ARGN})
+ set(install_rpath "$ORIGIN")
+ set(filtered FALSE)
+ foreach(lib IN LISTS libs)
+ # CUDA stub libs are in form /prefix/lib/stubs/libcuda.so
+ # So extract the name of the parent folder, to check against "stubs"
+ # And the parent path which we need to add to the INSTALL_RPATH for non-stubs
+ get_filename_component(parent_path "${lib}" DIRECTORY)
+ get_filename_component(parent_name "${parent_path}" NAME)
+ if(parent_name STREQUAL "stubs")
+ message(STATUS "Filtering ${lib} from being set in ${target}'s RPATH, "
+ "because it appears to point to the CUDA stubs directory.")
+ set(filtered TRUE)
+ elseif(parent_path)
+ list(APPEND install_rpath ${parent_path})
+ endif()
+ endforeach()
+
+ # Regular link command
+ target_link_libraries(${target} ${libs})
+ # Manually set INSTALL_RPATH when there were any stub libs
+ if(filtered)
+ list(REMOVE_DUPLICATES install_rpath)
+ set_target_properties(${target} PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(${target} PROPERTIES INSTALL_RPATH "${install_rpath}")
+ endif()
+endfunction()
diff --git a/test/test_torch.py b/test/test_torch.py
index efc3a1edba5..865416a817e 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -9767,6 +9767,21 @@ def add_neg_dim_tests():
assert not hasattr(TestTorch, test_name), "Duplicated test name: " + test_name
setattr(TestTorch, test_name, make_neg_dim_test(name, tensor_arg, arg_constr, types, extra_dim))

+class TestRPATH(TestCase):
+ @unittest.skipIf(not sys.platform.startswith('linux'), "linux-only test")
+ def test_rpath(self):
+ """
+ Make sure RPATH (or RUNPATH) in nvrtc does not contain a cuda stubs directory
+ issue gh-35418
+ """
+ libdir = os.path.join(os.path.dirname(torch._C.__file__), 'lib')
+ caffe2_nvrtc = os.path.join(libdir, 'libcaffe2_nvrtc.so')
+ if os.path.exists(caffe2_nvrtc):
+ output = subprocess.check_output(['objdump', '-x', caffe2_nvrtc])
+ for line in output.split(b'\n'):
+ if b'RPATH' in line or b'RUNPATH' in line:
+ self.assertFalse(b'stubs' in line)
+
# TODO: these empy classes are temporarily instantiated for XLA compatibility
# once XLA updates their test suite it should be removed
class TestViewOps(TestCase):
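The TestRPATH test added above inspects the ELF headers of libcaffe2_nvrtc.so with objdump. A complementary runtime check, sketched below for CUDA-enabled builds on Linux (this foss/2023b easyconfig is CPU-only, but the patch is shared with the CUDA variants), is to look at which libcuda the process actually mapped:

import torch

# Trigger loading of the CUDA driver library, if any is found.
torch.cuda.is_available()

# Any mapped libcuda path containing "stubs" would indicate the broken
# RPATH case this patch is meant to prevent.
with open("/proc/self/maps") as maps:
    cuda_libs = {line.rsplit(maxsplit=1)[-1] for line in maps if "libcuda.so" in line}

print("Mapped CUDA driver libraries:", cuda_libs or "none")
assert not any("stubs" in path for path in cuda_libs), "stub libcuda picked up via RPATH"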