{ai}[foss/2023b] PyTorch v2.2.1 #22361

Open · wants to merge 3 commits into base: develop
152 changes: 152 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.2.1-foss-2023b.eb
@@ -0,0 +1,152 @@
name = 'PyTorch'
version = '2.2.1'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2023b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
'PyTorch-1.13.1_fix-protobuf-dependency.patch',
'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
'PyTorch-2.0.1_fix-skip-decorators.patch',
'PyTorch-2.0.1_fix-vsx-loadu.patch',
'PyTorch-2.0.1_skip-failing-gradtest.patch',
'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
'PyTorch-2.1.0_disable-gcc12-warning.patch',
'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
'PyTorch-2.1.2_fix-vsx-vector-div.patch',
'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
'PyTorch-2.2.1_no-cuda-stubs-rpath.patch',
'PyTorch-2.2.1_fix-test_extension_backend-without-vectorization.patch',
]
checksums = [
{'pytorch-v2.2.1.tar.gz': '8069467387b8ab7a7279671b9144d80a5c5342b4fa022eb3c1db629a6fd806c9'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
'5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
{'PyTorch-1.13.1_fix-protobuf-dependency.patch':
'8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
{'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
{'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
{'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
'02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
{'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
{'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
{'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
{'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
'7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
{'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
'166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
{'PyTorch-2.1.0_disable-gcc12-warning.patch': 'c858b8db0010f41005dc06f9a50768d0d3dc2d2d499ccbdd5faf8a518869a421'},
{'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
'3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
{'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
{'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
'35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
{'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
{'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
'5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
{'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
{'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
{'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
'7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
{'PyTorch-2.2.1_no-cuda-stubs-rpath.patch': '713f98b45f33be955ff581fc14d16cd843d8b48190d3fdffa02afcdfd3583100'},
{'PyTorch-2.2.1_fix-test_extension_backend-without-vectorization.patch':
'8d8c72d68c8391ddec5133fcabbb8653fef890acb9eece8ff1ddc43f128f2450'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.27.6'),
('hypothesis', '6.90.0'),
# For tests
('pytest-flakefinder', '1.1.0'),
('pytest-rerunfailures', '14.0'),
('pytest-shard', '0.1.2'),
('unittest-xml-reporting', '3.1.0'),
]

dependencies = [
('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions
('Python', '3.11.5'),
('Python-bundle-PyPI', '2023.10'),
('protobuf', '25.3'),
('protobuf-python', '4.25.3'),
('pybind11', '2.11.1'),
('SciPy-bundle', '2023.11'),
('PyYAML', '6.0.1'),
('MPFR', '4.2.1'),
('GMP', '6.3.0'),
('numactl', '2.0.16'),
('FFmpeg', '6.0'),
('Pillow', '10.2.0'),
('expecttest', '0.2.1'),
('networkx', '3.2.1'),
('sympy', '1.12'),
    ('Z3', '4.13.0'),
]

use_pip = True
buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step

excluded_tests = {
'': [
        # This test seems to take too long, at least on NVIDIA Ampere.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# no xdoctest
'doctests',
        # failing on Broadwell
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'test_native_mha',
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# test_quantization in particular has a few corner cases that are triggered by the random input values;
# those cannot easily be avoided, see https://github.com/pytorch/pytorch/issues/107030
# So allow a low number of tests to fail, as the tests "usually" succeed
max_failed_tests = 2

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
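For context on the `tests` entry above: PyTorch-check-cpp-extension.py is not part of this diff, but a sanity check of this kind typically JIT-compiles a trivial inline C++ extension, which is also why Ninja is listed as a dependency. A minimal sketch of such a check (hypothetical names, not the contents of the bundled script):

import torch
from torch.utils.cpp_extension import load_inline

# JIT-compile a trivial inline extension; load_inline adds the torch/extension.h
# include and the pybind11 bindings for the listed functions automatically.
cpp_source = """
torch::Tensor add_one(torch::Tensor x) {
    return x + 1;
}
"""

ext = load_inline(name="eb_check_ext", cpp_sources=cpp_source, functions=["add_one"])
assert torch.equal(ext.add_one(torch.zeros(3)), torch.ones(3))
print("C++ extension JIT compilation works")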
39 changes: 39 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.2.1_fix-test_extension_backend-without-vectorization.patch
@@ -0,0 +1,39 @@
The test checks for the substring "loadu" in the generated code.
On AVX systems that line is:
> auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(i0))
however on non-AVX systems it is
> auto tmp0 = in_ptr0[static_cast<long>(i0)];

The difference depends on `codecache.valid_vec_isa_list()` being non-empty.
See torch/_inductor/codegen/cpp.py:2639

Modify the test to account for that.

Author: Alexander Grund (TU Dresden)

diff --git a/test/inductor/test_extension_backend.py b/test/inductor/test_extension_backend.py
index 7d6f35d7b74..decc61d62d7 100644
--- a/test/inductor/test_extension_backend.py
+++ b/test/inductor/test_extension_backend.py
@@ -20,7 +20,7 @@ except ImportError:
)

from torch._C import FileCheck
-from torch._inductor import metrics
+from torch._inductor import codecache, metrics
from torch._inductor.codegen.common import (
get_scheduling_for_device,
get_wrapper_codegen_for_device,
@@ -130,7 +130,11 @@ class ExtensionBackendTests(TestCase):
metrics.reset()
opt_fn = torch.compile()(fn)
_, code = run_and_get_cpp_code(opt_fn, x, y, z)
- FileCheck().check("void kernel").check("loadu").check("extension_device").run(
+ if codecache.valid_vec_isa_list():
+ load_expr = 'loadu'
+ else:
+ load_expr = ' = in_ptr0[static_cast<long>(i0)];'
+ FileCheck().check("void kernel").check(load_expr).check("extension_device").run(
code
)
opt_fn(x, y, z)
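For reference, the ISA detection the patched test keys on can be queried directly. A minimal sketch, assuming PyTorch 2.2.x, where the helper lives in torch._inductor.codecache as imported in the diff above:

from torch._inductor import codecache

# The same check the patched test uses to choose between the vectorized
# "loadu" pattern and the scalar in_ptr0[...] load.
isas = codecache.valid_vec_isa_list()
if isas:
    print("Vectorized codegen available:", [str(isa) for isa in isas])
else:
    print("No supported vector ISA detected; inductor emits scalar loads")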
145 changes: 145 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.2.1_no-cuda-stubs-rpath.patch
@@ -0,0 +1,145 @@
# PyTorch's CMake configuration by default sets RUNPATH on libraries if they link other libraries
# that are outside the build tree, which is done because of the CMake config at
# https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L10.
# This causes problems, since the CUDA stubs library path then also gets added to the RUNPATH.
# As a result, at runtime, the stub version of e.g. libcuda.so.1 gets picked up instead of the real driver library.
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/14359
# The line at https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L16
# makes sure that any linked path is also added to the RUNPATH.
# This has been reported upstream in https://github.com/pytorch/pytorch/issues/35418
# and a fix was attempted in https://github.com/pytorch/pytorch/pull/37737, but it was reverted.
#
# This EasyBuild patch changes the behavior for the libraries that were failing, i.e. the ones in this list:
# https://github.com/easybuilders/easybuild-easyconfigs/issues/14359#issuecomment-970479904
# This is done by setting INSTALL_RPATH_USE_LINK_PATH to false and instead specifying the RPATH
# explicitly by defining INSTALL_RPATH, only adding directories that do not match the "stubs" regex.
#
# Original patch: Caspar van Leeuwen
# Updated: Alexander Grund (TU Dresden)
#
# See https://github.com/pytorch/pytorch/pull/87593
diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt
index 15f47bf52ae..edf1ab26149 100644
--- a/binaries/CMakeLists.txt
+++ b/binaries/CMakeLists.txt
@@ -56,7 +56,8 @@ endif()

if(USE_CUDA)
caffe2_binary_target("inspect_gpu.cc")
- target_link_libraries(inspect_gpu ${CUDA_LIBRARIES})
+ include(${Torch_SOURCE_DIR}/cmake/LinkCudaLibraries.cmake)
+ link_cuda_libraries(inspect_gpu ${CUDA_LIBRARIES})
caffe2_binary_target("print_core_object_sizes_gpu.cc")

if(BUILD_TEST)
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 748363725bc..a0b75597b34 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -624,14 +624,13 @@ endif()
if(USE_CUDA)
list(APPEND Caffe2_GPU_CU_SRCS ${Caffe2_GPU_HIP_JIT_FUSERS_SRCS})
add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
+ include(${Torch_SOURCE_DIR}/cmake/LinkCudaLibraries.cmake)
+ link_cuda_libraries(caffe2_nvrtc ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB})
if(MSVC)
# Delay load nvcuda.dll so we can import torch compiled with cuda on a CPU-only machine
- set(DELAY_LOAD_FLAGS "-DELAYLOAD:nvcuda.dll;delayimp.lib")
- else()
- set(DELAY_LOAD_FLAGS "")
+ target_link_libraries(caffe2_nvrtc "-DELAYLOAD:nvcuda.dll;delayimp.lib")
endif()

- target_link_libraries(caffe2_nvrtc ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS})
install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}")
if(USE_NCCL)
list(APPEND Caffe2_GPU_SRCS
@@ -1541,6 +1540,7 @@ endif()

# ---[ CUDA library.
if(USE_CUDA)
+ include(${Torch_SOURCE_DIR}/cmake/LinkCudaLibraries.cmake)
# FIXME: If kineto is linked with CUPTI it pollutes torch_cpu with CUDA dependencies
# Even worse, it never declares that it depends on cudart, but calls the API, see
# https://github.com/pytorch/kineto/blob/aef2f5c0f15e3be52406ac0b885e8689de6bc9f6/libkineto/src/CudaDeviceProperties.cpp#L24
@@ -1554,13 +1554,13 @@ if(USE_CUDA)
torch_cuda INTERFACE $<INSTALL_INTERFACE:include>)
target_include_directories(
torch_cuda PRIVATE ${Caffe2_GPU_INCLUDE})
- target_link_libraries(
+ link_cuda_libraries(
torch_cuda PRIVATE ${Caffe2_CUDA_DEPENDENCY_LIBS})

# These public dependencies must go after the previous dependencies, as the
# order of the libraries in the linker call matters here when statically
# linking; libculibos and cublas must be last.
- target_link_libraries(torch_cuda PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
+ link_cuda_libraries(torch_cuda PUBLIC torch_cpu_library ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
endif()

# ---[ Metal(OSX) modification
diff --git a/cmake/LinkCudaLibraries.cmake b/cmake/LinkCudaLibraries.cmake
new file mode 100644
index 00000000000..e09d1186f6d
--- /dev/null
+++ b/cmake/LinkCudaLibraries.cmake
@@ -0,0 +1,33 @@
+# Link CUDA libraries to the given target, i.e.: `target_link_libraries(target <args>)`
+#
+# Additionally makes sure CUDA stub libs don't end up being in RPath
+#
+# Example: link_cuda_libraries(mytarget PRIVATE ${CUDA_LIBRARIES})
+function(link_cuda_libraries target)
+ set(libs ${ARGN})
+ set(install_rpath "$ORIGIN")
+ set(filtered FALSE)
+ foreach(lib IN LISTS libs)
+ # CUDA stub libs are in form /prefix/lib/stubs/libcuda.so
+ # So extract the name of the parent folder, to check against "stubs"
+ # And the parent path which we need to add to the INSTALL_RPATH for non-stubs
+ get_filename_component(parent_path "${lib}" DIRECTORY)
+ get_filename_component(parent_name "${parent_path}" NAME)
+ if(parent_name STREQUAL "stubs")
+ message(STATUS "Filtering ${lib} from being set in ${target}'s RPATH, "
+ "because it appears to point to the CUDA stubs directory.")
+ set(filtered TRUE)
+ elseif(parent_path)
+ list(APPEND install_rpath ${parent_path})
+ endif()
+ endforeach()
+
+ # Regular link command
+ target_link_libraries(${target} ${libs})
+ # Manually set INSTALL_RPATH when there were any stub libs
+ if(filtered)
+ list(REMOVE_DUPLICATES install_rpath)
+ set_target_properties(${target} PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(${target} PROPERTIES INSTALL_RPATH "${install_rpath}")
+ endif()
+endfunction()
diff --git a/test/test_torch.py b/test/test_torch.py
index efc3a1edba5..865416a817e 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -9767,6 +9767,21 @@ def add_neg_dim_tests():
assert not hasattr(TestTorch, test_name), "Duplicated test name: " + test_name
setattr(TestTorch, test_name, make_neg_dim_test(name, tensor_arg, arg_constr, types, extra_dim))

+class TestRPATH(TestCase):
+ @unittest.skipIf(not sys.platform.startswith('linux'), "linux-only test")
+ def test_rpath(self):
+ """
+ Make sure RPATH (or RUNPATH) in nvrtc does not contain a cuda stubs directory
+ issue gh-35418
+ """
+ libdir = os.path.join(os.path.dirname(torch._C.__file__), 'lib')
+ caffe2_nvrtc = os.path.join(libdir, 'libcaffe2_nvrtc.so')
+ if os.path.exists(caffe2_nvrtc):
+ output = subprocess.check_output(['objdump', '-x', caffe2_nvrtc])
+ for line in output.split(b'\n'):
+ if b'RPATH' in line or b'RUNPATH' in line:
+ self.assertFalse(b'stubs' in line)
+
# TODO: these empy classes are temporarily instantiated for XLA compatibility
# once XLA updates their test suite it should be removed
class TestViewOps(TestCase):
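The TestRPATH test added above inspects the ELF headers of libcaffe2_nvrtc.so with objdump. A complementary runtime check, sketched below for CUDA-enabled builds on Linux (this foss/2023b easyconfig is CPU-only, but the patch is shared with the CUDA variants), is to look at which libcuda the process actually mapped:

import torch

# Trigger loading of the CUDA driver library, if any is found.
torch.cuda.is_available()

# Any mapped libcuda path containing "stubs" would indicate the broken
# RPATH case this patch is meant to prevent.
with open("/proc/self/maps") as maps:
    cuda_libs = {line.rsplit(maxsplit=1)[-1] for line in maps if "libcuda.so" in line}

print("Mapped CUDA driver libraries:", cuda_libs or "none")
assert not any("stubs" in path for path in cuda_libs), "stub libcuda picked up via RPATH"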