-
Notifications
You must be signed in to change notification settings - Fork 33
/
Copy pathDockerfile.arm
356 lines (298 loc) · 15.7 KB
/
Dockerfile.arm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
# Base image with apex and transformer engine, but without NeMo or Megatron-LM.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.12-py3
FROM rust:1.82.0 as rust-env
RUN rustup set profile minimal && \
rustup install 1.82.0 && \
rustup target add aarch64-unknown-linux-gnu && \
rustup default 1.82.0
FROM ${BASE_IMAGE} AS bionemo2-base
# Install NeMo dependencies.
ENV WORKDIR=/build
WORKDIR ${WORKDIR}
ARG MAX_JOBS=4
ENV MAX_JOBS=${MAX_JOBS}
# See NeMo readme for the latest tested versions of these libraries
ENV TORCH_CUDA_ARCH_LIST=9.0
ARG APEX_COMMIT=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
git checkout ${APEX_COMMIT} && \
pip install . -v --no-build-isolation --disable-pip-version-check --no-cache-dir \
--config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam --group_norm"
# Transformer Engine pre-1.7.0. 1.7 standardizes the meaning of bits in the attention mask to match
ARG TE_COMMIT=c27ee60ec746210bcea4ec33958dbbff06706506
RUN git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
git fetch origin ${TE_COMMIT} && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
# Install core apt packages.
RUN apt-get update \
&& apt-get install -y \
libsndfile1 \
ffmpeg \
git \
curl \
cmake \
pre-commit \
sudo \
&& rm -rf /var/lib/apt/lists/*
RUN apt-get install -y gnupg
# Check the nemo dependency for causal conv1d and make sure this checkout
# tag matches. If not, update the tag in the following line.
RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/Dao-AILab/[email protected]
# Build LLVM and triton
# It's important to select a specific version of LLVM as per triton's README instructions, and
# also important to constrain the build targets to the systems we care about or else there will
# be many strange unlinked symbol issues. Here we assume this dockerfile is build on an aarch64
# target (host), and build for NVIDIA GPUS (NVPTX). Unclear why, but we also need to build for
# AMDGPUs to get triton to properly build or else there are linker issues.
RUN git clone https://github.com/llvm/llvm-project.git && \
pip install ninja && \
cd llvm-project && \
git fetch origin 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \
git checkout 10dc3a8e916d73291269e5e2b82dd22681489aa1 && \
mkdir build && cd build && \
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_PROJECTS="mlir;llvm" -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" ../llvm && \
ninja && \
export LLVM_BUILD_DIR=${WORKDIR}/llvm-project/build && \
cd ${WORKDIR} && \
git clone https://github.com/triton-lang/triton.git && \
pip install cmake wheel pybind11 && \
cd triton && \
git fetch origin release/3.1.x && \
git checkout release/3.1.x && \
LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib LLVM_SYSPATH=$LLVM_BUILD_DIR pip install --verbose python/ && \
cd ${WORKDIR} && \
rm -rf llvm-project && \
rm -rf triton
# Mamba dependancy installation
RUN pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/state-spaces/[email protected]
RUN pip install hatchling # needed to install nemo-run
ARG NEMO_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2
RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_TAG}
RUN mkdir -p /workspace/bionemo2/
# Delete the temporary /build directory.
WORKDIR /workspace
RUN rm -rf /build
# Addressing Security Scan Vulnerabilities
RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
RUN apt purge -y libpmi2-0 && \
apt autoremove -y
# Use UV to install python packages from the workspace. This just installs packages into the system's python
# environment, and does not use the current uv.lock file. Note that with python 3.12, we now need to set
# UV_BREAK_SYSTEM_PACKAGES, since the pytorch base image has made the decision not to use a virtual environment and UV
# does not respect the PIP_BREAK_SYSTEM_PACKAGES environment variable set in the base dockerfile.
COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
ENV UV_LINK_MODE=copy \
UV_COMPILE_BYTECODE=1 \
UV_PYTHON_DOWNLOADS=never \
UV_SYSTEM_PYTHON=true \
UV_NO_CACHE=1 \
UV_BREAK_SYSTEM_PACKAGES=1
# Install the bionemo-geomtric requirements ahead of copying over the rest of the repo, so that we can cache their
# installation. These involve building some torch extensions, so they can take a while to install.
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \
--mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked \
uv pip install --no-build-isolation -r /requirements-pyg.txt
ENV WORKDIR=/workspace/bionemo2
WORKDIR ${WORKDIR}
# Install 3rd-party deps and bionemo submodules.
COPY ./LICENSE /workspace/bionemo2/LICENSE
COPY ./3rdparty /workspace/bionemo2/3rdparty
COPY ./sub-packages /workspace/bionemo2/sub-packages
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"
# # Build decord
# This needs a specific version of ffmpeg:
# root@e1fc53d00844:/workspace/bionemo2# ffmpeg -version
# ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
# built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
# configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/aarch64-linux-gnu --incdir=/usr/include/aarch64-linux-gnu --arch=arm64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-pocketsphinx --enable-librsvg --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared
# libavutil 56. 70.100 / 56. 70.100
# libavcodec 58.134.100 / 58.134.100
# libavformat 58. 76.100 / 58. 76.100
# libavdevice 58. 13.100 / 58. 13.100
# libavfilter 7.110.100 / 7.110.100
# libswscale 5. 9.100 / 5. 9.100
# libswresample 3. 9.100 / 3. 9.100
# libpostproc 55. 9.100 / 55. 9.100
#
# Issue link: https://github.com/dmlc/decord/issues/257
# Diff to make it all work https://github.com/dmlc/decord/issues/186#issuecomment-1171882325
# Consider this:
# sudo apt install libnvidia-decode-550
# cp /usr/lib/aarch64-linux-gnu/libnvcuvid* /usr/local/cuda/
# cmake .. -DUSE_CUDA=ON -DCMAKE_BUILD_TYPE=Release
RUN apt-get update && \
apt-get install -y build-essential python3-dev python3-setuptools make cmake && \
apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
RUN --mount=type=bind,source=./arm_build/decord_ffmpeg6_fix.patch,target=/decord_ffmpeg6_fix.patch \
git clone --recursive https://github.com/dmlc/decord && \
cd decord && git apply /decord_ffmpeg6_fix.patch && \
mkdir build && cd build && \
cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \
make && \
cd ../python && \
pip install . && \
cd ${WORKDIR} && \
rm -rf decord
RUN pip install --upgrade pip setuptools
RUN pip install setuptools_scm py-cpuinfo
# Install C++20 toolchain
RUN apt-get update && \
apt-get install -y software-properties-common && \
add-apt-repository ppa:ubuntu-toolchain-r/test && \
apt-get update && \
apt-get install -y gcc-13 g++-13 make pkg-config opencc
WORKDIR /usr/lib/tiledb
RUN wget https://github.com/TileDB-Inc/TileDB/releases/download/2.27.0-rc3/tiledb-linux-arm64-2.27.0-rc3-8d581f2.tar.gz -O tiledb.tar.gz && \
tar -xvzf tiledb.tar.gz
ENV TILEDB_PATH=/usr/lib/tiledb
WORKDIR /dependencies
ENV CC=gcc-13 CXX=g++-13
RUN dpkg -l | awk '/libfmt/ {print $2}' | xargs apt-get remove -y
RUN dpkg -l | awk '/spdlog/ {print $2}' | xargs apt-get remove -y
RUN rm -f /usr/lib/*/cmake/spdlog/spdlogConfig.cmake
RUN rm -f /usr/lib/cmake/spdlog/spdlogConfig.cmake
RUN git clone --single-branch --branch 1.15.0rc4 https://github.com/single-cell-data/TileDB-SOMA.git && \
cd TileDB-SOMA/apis/python && \
pip install -v .
WORKDIR /workspace/bionemo2
# Note, we need to mount the .git folder here so that setuptools-scm is able to fetch git tag for version.
# For some reason, we do not need to do the tensorstore verson package hack on arm64, while we do need this for x86 build.
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
uv pip install maturin --no-build-isolation --break-system-packages
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
pip install --use-deprecated=legacy-resolver --no-build-isolation \
tensorstore==0.1.45
RUN --mount=type=bind,source=./.git,target=./.git \
--mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
# Comment out mamba install in NeMo as this causes issues.
sed -i "/mamba-ssm/d" ./3rdparty/NeMo/requirements/requirements_nlp.txt && \
uv pip install --no-build-isolation \
./3rdparty/* \
./sub-packages/bionemo-* \
-r /requirements-cve.txt \
-r /requirements-test.txt && rm -rf ./3rdparty && rm -rf /tmp/* && rm -rf ./sub-packages/bionemo-noodles/target \
&& rm -rf /root/.cache/*
# In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the
# base pytorch container. We can then set up a non-root user and uninstall the bionemo and 3rd-party packages, so that
# they can be installed in an editable fashion from the workspace directory. This lets us install all the package
# dependencies in a cached fashion, so they don't have to be built from scratch every time the devcontainer is rebuilt.
FROM ${BASE_IMAGE} AS dev
RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,id=apt-lib,target=/var/lib/apt,sharing=locked \
<<EOF
set -eo pipefail
apt-get update -qy
apt-get install -qyy \
sudo
rm -rf /tmp/* /var/tmp/*
EOF
# Create a non-root user to use inside a devcontainer.
ARG USERNAME=bionemo
ARG USER_UID=1000
ARG USER_GID=$USER_UID
RUN groupadd --gid $USER_GID $USERNAME \
&& useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \
&& echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
&& chmod 0440 /etc/sudoers.d/$USERNAME
# Here we delete the dist-packages directory from the pytorch base image, and copy over the dist-packages directory from
# the build image. This ensures we have all the necessary dependencies installed (megatron, nemo, etc.).
RUN <<EOF
set -eo pipefail
rm -rf /usr/local/lib/python3.10/dist-packages
mkdir -p /usr/local/lib/python3.10/dist-packages
chmod 777 /usr/local/lib/python3.10/dist-packages
chmod 777 /usr/local/bin
EOF
USER $USERNAME
COPY --from=bionemo2-base --chown=$USERNAME:$USERNAME --chmod=777 \
/usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=ghcr.io/astral-sh/uv:0.4.25 /uv /usr/local/bin/uv
ENV UV_LINK_MODE=copy \
UV_COMPILE_BYTECODE=0 \
UV_PYTHON_DOWNLOADS=never \
UV_SYSTEM_PYTHON=true
# Bring in the rust toolchain, as maturin is a dependency listed in requirements-dev
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"
RUN --mount=type=bind,source=./requirements-dev.txt,target=/workspace/bionemo2/requirements-dev.txt \
--mount=type=cache,id=uv-cache,target=/root/.cache,sharing=locked <<EOF
set -eo pipefail
uv pip install -r /workspace/bionemo2/requirements-dev.txt
rm -rf /tmp/*
EOF
RUN <<EOF
set -eo pipefail
rm -rf /usr/local/lib/python3.10/dist-packages/bionemo*
pip uninstall -y nemo_toolkit megatron_core
EOF
# Transformer engine attention defaults
# FIXME the following result in unstable training curves even if they are faster
# see https://github.com/NVIDIA/bionemo-framework/pull/421
#ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0
FROM dev AS development
WORKDIR /workspace/bionemo2
COPY --from=bionemo2-base /workspace/bionemo2/ .
COPY ./internal ./internal
# because of the `rm -rf ./3rdparty` in bionemo2-base
COPY ./3rdparty ./3rdparty
USER root
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"
RUN uv pip uninstall maturin
RUN uv pip install maturin --no-build-isolation
RUN <<EOF
set -eo pipefail
find . -name __pycache__ -type d -print | xargs rm -rf
uv pip install --no-build-isolation --editable ./internal/infra-bionemo
for sub in ./3rdparty/* ./sub-packages/bionemo-*; do
uv pip install --no-deps --no-build-isolation --editable $sub
done
EOF
# This is needed because faiss is not compatible with ARM at all.
# Bionemo doesn't use faiss, but megatron core does.
# We do not use this codepath at all, therefore we just make is_sve_supported return False
# to circumvent python import issues
RUN sed -i '42i\ # Bionemo hack to fix ARM issues with faiss\n return False' /usr/local/lib/python3.12/dist-packages/faiss/loader.py
# Since the entire repo is owned by root, swithcing username for development breaks things.
ARG USERNAME=bionemo
RUN chown $USERNAME:$USERNAME -R /workspace/bionemo2/
USER $USERNAME
# The 'release' target needs to be last so that it's the default build target. In the future, we could consider a setup
# similar to the devcontainer above, where we copy the dist-packages folder from the build image into the release image.
# This would reduce the overall image size by reducing the number of intermediate layers. In the meantime, we match the
# existing release image build by copying over remaining files from the repo into the container.
FROM bionemo2-base AS release
RUN mkdir -p /workspace/bionemo2/.cache/
COPY VERSION .
COPY ./scripts ./scripts
COPY ./README.md ./
# Copy over folders so that the image can run tests in a self-contained fashion.
COPY ./ci/scripts ./ci/scripts
COPY ./docs ./docs
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
COPY --from=rust-env /usr/local/rustup /usr/local/rustup
# RUN rm -rf /usr/local/cargo /usr/local/rustup
RUN chmod 777 -R /workspace/bionemo2/
# Transformer engine attention defaults
# We have to declare this again because the devcontainer splits from the release image's base.
# FIXME the following results in unstable training curves even if faster.
# See https://github.com/NVIDIA/bionemo-framework/pull/421
# ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0