diff --git a/.gitlab/build-and-test-lassen.yml b/.gitlab/build-and-test-lassen.yml index 2421c07d03..a14b11a0a7 100644 --- a/.gitlab/build-and-test-lassen.yml +++ b/.gitlab/build-and-test-lassen.yml @@ -38,13 +38,13 @@ include: clang-16-0-6-gcc-11-2-1-cuda-12-2-2-lassen: variables: COMPILER_FAMILY: clang - MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5 fftw/3.3.10-gcc-11.2.1" extends: .build-and-test-on-lassen clang-16-0-6-gcc-11-2-1-cuda-12-2-2-distconv-lassen: variables: COMPILER_FAMILY: clang - MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5" + MODULES: "clang/16.0.6-gcc-11.2.1 spectrum-mpi/rolling-release cuda/12.2.2 cmake/3.29.2 python/3.11.5 fftw/3.3.10-gcc-11.2.1" WITH_DISTCONV: "1" extends: .build-and-test-on-lassen diff --git a/.gitlab/build-and-test.sh b/.gitlab/build-and-test.sh index 99275c6c81..ead88c1055 100755 --- a/.gitlab/build-and-test.sh +++ b/.gitlab/build-and-test.sh @@ -95,20 +95,25 @@ echo "~~~~~ Project dir: ${project_dir}" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" prefix="${project_dir}/install-deps-${CI_JOB_NAME_SLUG:-${job_unique_id}}" +#dha_prefix=${INSTALL_EXTERNALS_ROOT}/rocm-5.7.1/amd/cray-mpich-8.1.29/dha_with_distconv dha_prefix=${prefix} # Just for good measure... -export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} +export CMAKE_PREFIX_PATH=${dha_prefix}/aluminum:${dha_prefix}/hydrogen:${dha_prefix}/dihydrogen:${CMAKE_PREFIX_PATH} +#export CMAKE_PREFIX_PATH=${prefix}/aluminum:${prefix}/hydrogen:${prefix}/dihydrogen:${CMAKE_PREFIX_PATH} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} # Allow a user to force this rebuild_deps=${REBUILD_DEPS:-""} +#rebuild_deps=0 # Rebuild if the prefix doesn't exist. -if [[ ! -d "${prefix}" ]] +#if [[ ! -d "${prefix}" ]] +if [[ ! -d "${dha_prefix}" ]] then rebuild_deps=1 fi +#rebuild_deps=0 # Rebuild if latest hashes don't match if [[ -z "${rebuild_deps}" ]] @@ -220,10 +225,21 @@ echo "~~~~~ Installing Python Packages with PIP" echo "~~~~~ $(date)" echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" -CMD="python3 -m pip install -i https://pypi.org/simple --prefix ${prefix}/lbann protobuf tqdm numpy scipy" +CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann protobuf tqdm numpy scipy" echo ${CMD} ${CMD} +case "${cluster}" in + pascal) + CMD="python3 -m pip install -i https://pypi.org/simple -U --force-reinstall --prefix ${prefix}/lbann torch" + echo ${CMD} + ${CMD} + ;; + *) + echo "Unable to install torch via pip on ${cluster}" + ;; +esac + LBANN_MODFILES_DIR=${build_dir}/install/lbann/etc/modulefiles #echo "I think that the module is in ${LBANN_MODFILES_DIR}" ml use ${LBANN_MODFILES_DIR} diff --git a/.gitlab/configure_lbann.sh b/.gitlab/configure_lbann.sh index 4081f04d8a..acdd428395 100644 --- a/.gitlab/configure_lbann.sh +++ b/.gitlab/configure_lbann.sh @@ -1,22 +1,23 @@ if [[ "$cluster" == "lassen" ]] then - # lbann_lapack_opt="-D LBANN_BLA_VENDOR=IBMESSL" lbann_lapack_opt="-D BLA_VENDOR=Generic" + build_fft=ON else lbann_lapack_opt="" fi -# Just for good measure... +if [[ "$cluster" == "tioga" ]] +then + build_fft=OFF +fi + +# Default RPATH rules will not include in-source libraries from the prefix path... add them here. if [ -z "${extra_rpaths}" ]; then extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64 else extra_rpaths=${dha_prefix}/aluminum/lib64:${dha_prefix}/hydrogen/lib:${dha_prefix}/dihydrogen/lib64:${extra_rpaths:-""} fi -echo "I have modified the extra rpaths to be ${extra_rpaths}" - # -D CMAKE_BUILD_RPATH="${extra_rpaths//:/\;}" \ - # -D CMAKE_INSTALL_RPATH="${extra_rpaths//:/\;}" \ - cmake -G Ninja \ -S ${project_dir} \ -B ${build_dir}/build-lbann \ @@ -57,9 +58,3 @@ cmake -G Ninja \ -D LBANN_WITH_EMBEDDED_PYTHON=ON \ -D LBANN_WITH_PYTHON_FRONTEND=ON \ -D LBANN_WITH_VISION=ON - - # -D CMAKE_BUILD_RPATH_USE_ORIGIN=OFF \ - # -D CMAKE_BUILD_WITH_INSTALL_RPATH=OFF \ - # -D CMAKE_SKIP_BUILD_RPATH=OFF \ - # -D CMAKE_SKIP_INSTALL_RPATH=OFF \ - # -D CMAKE_SKIP_RPATH=OFF \ diff --git a/.gitlab/setup_env.sh b/.gitlab/setup_env.sh index 4e8393a106..6d724ffb94 100644 --- a/.gitlab/setup_env.sh +++ b/.gitlab/setup_env.sh @@ -78,6 +78,7 @@ case "${cluster}" in else extra_rpaths="${ROCM_PATH}/lib:${ROCM_PATH}/llvm/lib:${extra_rpaths}" fi + extra_rpaths="/usr/workspace/lbann/ci_stable_dependencies/tioga/rocm-5.7.1/cray/cray-mpich-8.1.30/aws_ofi_rccl/lib:${extra_rpaths}" rocm_platform=ON gpu_arch=gfx90a,gfx942 launcher=flux @@ -103,6 +104,7 @@ source ${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS}/logs/lbann_s export CMAKE_PREFIX_PATH=${CI_STABLE_DEPENDENCIES_ROOT}/half-2.1.0:${CMAKE_PREFIX_PATH} #CMAKE_PREFIX_PATH=${INSTALL_EXTERNALS_ROOT}/${SYSTEM_INSTALL_PREFIX_EXTERNALS} CMAKE_CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH//:/;} +echo "BVE HERE I think that AWS_OFI_RCCL_LIBRARY=${AWS_OFI_RCCL_LIBRARY}" CFLAGS=${CFLAGS:-""} CXXFLAGS=${CXXFLAGS:-""} diff --git a/scripts/superbuild/ci/ci_tioga_env.sh b/scripts/superbuild/ci/ci_tioga_env.sh index 9657389ae3..679a35b9a9 100644 --- a/scripts/superbuild/ci/ci_tioga_env.sh +++ b/scripts/superbuild/ci/ci_tioga_env.sh @@ -44,9 +44,10 @@ PE_ENV_lc=$(echo "${PE_ENV}" | tr '[:upper:]' '[:lower:]') INSTALL_ROOT=/usr/workspace/lbann/ci_stable_dependencies/tioga/${ROCM_VER}/${PE_ENV_lc} INSTALL_PREFIX_EXTERNALS=${INSTALL_ROOT}/cray-mpich-${CRAY_MPICH_VERSION} -if [[ "${PE_ENV_lc}" == "cray" ]]; then +if [[ "${PE_ENV_lc}" = "cray" ]]; then # If using PrgEnv-cray add ${CRAYLIBS_X86_64} EXTRA_RPATHS="${CRAYLIBS_X86_64}|${EXTRA_RPATHS}" + export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH} fi # Use an accessible build directory so that the source files are preserved for debuggin