Merge branch 'main' into farhadr/ft_refactor

NVIDIA · Jan 13, 2025 · bcfdfb4 · bcfdfb4
2 parents 5d50cfc + db237fc
commit bcfdfb4
Show file tree

Hide file tree

Showing 14 changed files with 298 additions and 96 deletions.
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -0,0 +1,125 @@
+name: "[Optional] BioNemo Image Build and Unit Tests"
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+  merge_group:
+    types: [checks_requested]
+
+defaults:
+  run:
+    shell: bash -x -e -u -o pipefail {0}
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: "recursive"
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: "pip"
+      - run: pip install -r requirements-dev.txt
+      - run: ./ci/scripts/static_checks.sh
+      - uses: trufflesecurity/trufflehog@main
+        with:
+          extra_args: --only-verified
+
+  build-bionemo-image:
+    needs: pre-commit
+    runs-on: self-hosted-azure-cpu
+    if: ${{ !contains(github.event.pull_request.labels.*.name, 'SKIP_CI') }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+          submodules: "recursive"
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Docker Metadata
+        id: metadata
+        uses: docker/metadata-action@v5
+        with:
+          images: nemoci.azurecr.io/bionemo
+          labels: nemo.library=bionemo
+          tags: |
+            type=schedule
+            type=ref,event=branch
+            type=ref,event=tag
+            type=ref,event=pr
+            type=raw,value=${{ github.run_id }}
+
+      - uses: int128/docker-build-cache-config-action@v1
+        id: cache
+        with:
+          image: nemoci.azurecr.io/bionemo/build-cache
+          pull-request-cache: true
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          file: ${{ github.run_id }}/Dockerfile
+          context: ${{ github.run_id }}/
+          push: true
+          tags: ${{ steps.metadata.outputs.tags }}
+          labels: ${{ steps.metadata.outputs.labels }}
+          cache-from: ${{ steps.cache.outputs.cache-from }}
+          cache-to: ${{ steps.cache.outputs.cache-to }}
+
+  run-tests:
+    needs: build-bionemo-image
+    runs-on: self-hosted-nemo-gpus-1
+    defaults:
+      run:
+        working-directory: ./${{ github.run_id }}
+    container:
+      image: nemoci.azurecr.io/bionemo:${{ github.run_id }}
+      options: --gpus all
+      volumes:
+        - /home/azureuser/actions-runner-bionemo/cache:/github/home/.cache
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          path: ${{ github.run_id }}
+
+      - name: Run tests
+        env:
+          BIONEMO_DATA_SOURCE: ngc
+        run: ./ci/scripts/run_pytest.sh --no-nbval --skip-slow
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          working-directory: ${{ github.run_id }}
+
+      - name: Upload test results to Codecov
+        if: ${{ !cancelled() }}
+        uses: codecov/test-results-action@v1
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          working-directory: ${{ github.run_id }}
+
+  clean-up:
+    needs: run-tests
+    runs-on: self-hosted-nemo-gpus-1
+    if: ${{ always() }}
+    steps:
+      - name: clean up image
+        run: docker rmi nemoci.azurecr.io/bionemo:${{ github.run_id }}
+
+# TODO: exclude tests from base image; run tests from github workspace mounted in the image.
+# TODO: figure out a way of cleaning up the working directory (requires sudo or for us to fix file ownership from the release container)
diff --git a/3rdparty/NeMo b/3rdparty/NeMo
diff --git a/Dockerfile.arm b/Dockerfile.arm
@@ -1,5 +1,5 @@
 # Base image with apex and transformer engine, but without NeMo or Megatron-LM.
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.02-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3
 
 FROM rust:1.82.0 as rust-env
 
@@ -55,24 +55,27 @@ RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-di
   git+https://github.com/Dao-AILab/[email protected]
 
 # Build LLVM and triton
+# It's important to select a specific version of LLVM as per triton's README instructions, and
+# also important to constrain the build targets to the systems we care about or else there will
+# be many strange unlinked symbol issues. Here we assume this dockerfile is built on an aarch64
+# target (host), and build for NVIDIA GPUs (NVPTX). Unclear why, but we also need to build for
+# AMDGPUs to get triton to properly build or else there are linker issues.
 RUN git clone https://github.com/llvm/llvm-project.git && \
     pip install ninja && \
     cd llvm-project && \
-    git fetch origin 5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 && \
-    git checkout 5e5a22caf88ac1ccfa8dc5720295fdeba0ad9372 && \
+    git fetch origin 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \
+    git checkout 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \
     mkdir build && cd build && \
-    cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON  ../llvm -DLLVM_ENABLE_PROJECTS="mlir;llvm" && \
+    cmake -G Ninja  -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_PROJECTS="mlir;llvm" -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" ../llvm && \
     ninja && \
     export LLVM_BUILD_DIR=${WORKDIR}/llvm-project/build && \
-
     cd ${WORKDIR} && \
     git clone https://github.com/triton-lang/triton.git && \
     pip install cmake wheel pybind11 && \
     cd triton && \
-    git fetch origin 79c6c9b209a5692b9a895398f4f3a033f8f80415 && \
-    git checkout 79c6c9b209a5692b9a895398f4f3a033f8f80415 && \
-    LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install python/ && \
-
+    git fetch origin release/3.1.x && \
+    git checkout release/3.1.x && \
+    LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install --verbose python/ && \
     cd ${WORKDIR} && \
     rm -rf llvm-project && \
     rm -rf triton
@@ -93,25 +96,20 @@ RUN rm -rf /build
 
 # Addressing Security Scan Vulnerabilities
 RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
-RUN apt-get update  && \
-  apt-get install -y openssh-client=1:8.9p1-3ubuntu0.10 && \
-  rm -rf /var/lib/apt/lists/*
-RUN apt purge -y libslurm37 libpmi2-0 && \
+RUN apt purge -y libpmi2-0 && \
   apt autoremove -y
-RUN source /usr/local/nvm/nvm.sh && \
-  NODE_VER=$(nvm current) && \
-  nvm deactivate && \
-  nvm uninstall $NODE_VER && \
-  sed -i "/NVM/d" /root/.bashrc && \
-  sed -i "/nvm.sh/d" /etc/bash.bashrc
 
 # Use UV to install python packages from the workspace. This just installs packages into the system's python
-# environment, and does not use the current uv.lock file.
+# environment, and does not use the current uv.lock file. Note that with python 3.12, we now need to set
+# UV_BREAK_SYSTEM_PACKAGES, since the pytorch base image has made the decision not to use a virtual environment and UV
+# does not respect the PIP_BREAK_SYSTEM_PACKAGES environment variable set in the base dockerfile.
 COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
 ENV UV_LINK_MODE=copy \
   UV_COMPILE_BYTECODE=1 \
   UV_PYTHON_DOWNLOADS=never \
-  UV_SYSTEM_PYTHON=true
+  UV_SYSTEM_PYTHON=true \
+  UV_NO_CACHE=1 \
+  UV_BREAK_SYSTEM_PACKAGES=1
 
 # Install the bionemo-geometric requirements ahead of copying over the rest of the repo, so that we can cache their
 # installation. These involve building some torch extensions, so they can take a while to install.
@@ -133,12 +131,35 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup
 ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
 ENV RUSTUP_HOME="/usr/local/rustup"
 
-# Build decord
+# # Build decord
+# This needs a specific version of ffmpeg:
+# root@e1fc53d00844:/workspace/bionemo2# ffmpeg -version
+# ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
+# built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
+# configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/aarch64-linux-gnu --incdir=/usr/include/aarch64-linux-gnu --arch=arm64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared
+# libavutil      56. 70.100 / 56. 70.100
+# libavcodec     58.134.100 / 58.134.100
+# libavformat    58. 76.100 / 58. 76.100
+# libavdevice    58. 13.100 / 58. 13.100
+# libavfilter     7.110.100 /  7.110.100
+# libswscale      5.  9.100 /  5.  9.100
+# libswresample   3.  9.100 /  3.  9.100
+# libpostproc    55.  9.100 / 55.  9.100
+#
+# Issue link: https://github.com/dmlc/decord/issues/257
+# Diff to make it all work https://github.com/dmlc/decord/issues/186#issuecomment-1171882325
+
+# Consider this:
+# sudo apt install libnvidia-decode-550
+# cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/
+# cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release
+
 RUN apt-get update && \
     apt-get install -y build-essential python3-dev python3-setuptools make cmake && \
-    apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev && \
+    apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
+RUN --mount=type=bind,source=./arm_build/decord_ffmpeg6_fix.patch,target=/decord_ffmpeg6_fix.patch \
     git clone --recursive https://github.com/dmlc/decord && \
-    cd decord && \
+    cd decord && git apply /decord_ffmpeg6_fix.patch && \
     mkdir build && cd build && \
     cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \
     make && \
@@ -173,20 +194,28 @@ RUN git clone --single-branch --branch 1.15.0rc4 https://github.com/single-cell-
 
 WORKDIR /workspace/bionemo2
 # Note, we need to mount the .git folder here so that setuptools-scm is able to fetch git tag for version.
+# For some reason, we do not need to do the tensorstore version package hack on arm64, while we do need this for the x86 build.
 RUN --mount=type=bind,source=./.git,target=./.git \
   --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
   --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
-  <<EOF
-set -eo pipefail
-uv pip install maturin --no-build-isolation && uv pip install --no-build-isolation \
+  uv pip install maturin --no-build-isolation --break-system-packages
+RUN --mount=type=bind,source=./.git,target=./.git \
+  --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
+  --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
+  pip install --use-deprecated=legacy-resolver  --no-build-isolation \
+  tensorstore==0.1.45
+
+RUN --mount=type=bind,source=./.git,target=./.git \
+  --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
+  --mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
+# Delete the mamba-ssm requirement line from NeMo's NLP requirements, as installing it causes issues.
+  sed -i "/mamba-ssm/d" ./3rdparty/NeMo/requirements/requirements_nlp.txt && \
+  uv pip install --no-build-isolation \
   ./3rdparty/* \
   ./sub-packages/bionemo-* \
   -r /requirements-cve.txt \
-  -r /requirements-test.txt
-rm -rf ./3rdparty
-rm -rf /tmp/*
-rm -rf ./sub-packages/bionemo-noodles/target
-EOF
+  -r /requirements-test.txt && rm -rf ./3rdparty && rm -rf /tmp/* && rm -rf ./sub-packages/bionemo-noodles/target \
+  && rm -rf /root/.cache/*
 
 # In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the
 # base pytorch container. We can then set up a non-root user and uninstall the bionemo and 3rd-party packages, so that
@@ -286,6 +315,12 @@ for sub in ./3rdparty/* ./sub-packages/bionemo-*; do
     uv pip install --no-deps --no-build-isolation --editable $sub
 done
 EOF
+# This is needed because faiss is not compatible with ARM at all.
+# Bionemo doesn't use faiss, but megatron core does.
+# We do not use this codepath at all, therefore we just make is_sve_supported return False
+# to circumvent python import issues
+RUN sed -i '42i\        # Bionemo hack to fix ARM issues with faiss\n        return False' /usr/local/lib/python3.12/dist-packages/faiss/loader.py
+
 # Since the entire repo is owned by root, switching username for development breaks things.
 ARG USERNAME=bionemo
 RUN chown $USERNAME:$USERNAME -R /workspace/bionemo2/
@@ -312,7 +347,6 @@ COPY --from=rust-env /usr/local/rustup /usr/local/rustup
 
 
 # RUN rm -rf /usr/local/cargo /usr/local/rustup
-RUN rm -rf /root/.cache/bazel
 RUN chmod 777 -R /workspace/bionemo2/
 
 # Transformer engine attention defaults

diff --git a/README.md b/README.md
@@ -2,7 +2,8 @@
 
 [![Click here to deploy.](https://uohmivykqgnnbiouffke.supabase.co/storage/v1/object/public/landingpage/brevdeploynavy.svg)](https://console.brev.dev/launchable/deploy/now?launchableID=env-2pPDA4sJyTuFf3KsCv5KWRbuVlU)
 [![Docs Build](https://img.shields.io/github/actions/workflow/status/NVIDIA/bionemo-framework/pages/pages-build-deployment?label=docs-build)](https://nvidia.github.io/bionemo-framework)
-![Latest Tag](https://img.shields.io/github/v/tag/NVIDIA/bionemo-framework?label=latest-version)
+[![Latest Tag](https://img.shields.io/github/v/tag/NVIDIA/bionemo-framework?label=latest-version)](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/clara/containers/bionemo-framework/tags)
+[![codecov](https://codecov.io/gh/NVIDIA/bionemo-framework/branch/main/graph/badge.svg?token=XqhegdZRqB)](https://codecov.io/gh/NVIDIA/bionemo-framework)
 
 NVIDIA BioNeMo Framework is a collection of programming tools, libraries, and models for computational drug discovery.
 It accelerates the most time-consuming and costly stages of building and adapting biomolecular AI models by providing
+3 −2		.github/workflows/cicd-main.yml
+1 −1		Dockerfile.ci
+1 −2		docs/source/nlp/information_retrieval.rst
+0 −1		nemo/collections/diffusion/scripts/train.sh
+3 −0		nemo/collections/llm/gpt/model/gemma.py
+0 −2		nemo/collections/llm/recipes/gemma_2b.py
+0 −4		nemo/collections/llm/recipes/gemma_7b.py
+5 −0		nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+3 −0		nemo/collections/nlp/models/language_modeling/megatron_retro_model.py
+2 −2		nemo/collections/vlm/mllama/model/language.py
+1 −1		nemo/lightning/pytorch/callbacks/peft.py
+1 −1		requirements/requirements_multimodal.txt
+2 −1		scripts/checkpoint_converters/convert_bert_hf_to_nemo.py
+2 −0		tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py
+2 −2		tests/collections/llm/bitexact/mixtral/run.sh
+5 −0		tests/collections/llm/gpt/model/test_model_import.py
+0 −1		tests/collections/llm/hf/peft_nemorun.py
+0 −1		tests/collections/llm/hf/sft_nemorun.py
+2 −0		tests/collections/llm/megatron_mixtral_pretraining.py
+14 −0		tests/conftest.py
+2 −2		tests/core/test_exp_manager.py
+5 −5		tests/lightning/test_nemo_resume_from_ckpt.py
+0 −2		tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb
+0 −2		tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb
+2 −7		tutorials/llm/mamba/mamba.rst