Skip to content

Commit

Permalink
Dockerfile
Browse files Browse the repository at this point in the history
  • Loading branch information
dirkgr committed Nov 12, 2024
1 parent f18381e commit 3a2d62e
Showing 1 changed file with 79 additions and 0 deletions.
79 changes: 79 additions & 0 deletions scripts/augusta/beaker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
FROM --platform=linux/amd64 nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
sudo \
unzip \
vim \
wget \
parallel \
iputils-ping \
tmux

ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute

# Install conda. We give anyone in the users group the ability to run
# conda commands and install packages in the base (default) environment.
# Things installed into the default environment won't persist, but we prefer
# convenience in this case and try to make sure the user is aware of this
# with a message that's printed when the session starts.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh \
&& echo "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh" \
| sha256sum --check \
&& bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -b -p /opt/miniconda3 \
&& rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

RUN conda install -y pytorch::pytorch==2.5.1 packaging "numpy<2"

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 5.8-1.1.2.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

RUN apt-get install ninja-build -y

ENV HF_HUB_ENABLE_HF_TRANSFER=1
RUN pip install --no-cache-dir --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
RUN pip install --no-cache-dir flash-attn==2.6.3 --no-build-isolation
RUN python -c "import torch; print(torch.__version__)"

RUN pip install --no-cache-dir ai2-olmo-core==0.1.0 omegaconf rich boto3 google-cloud-storage tokenizers "cached_path>=1.6.2" transformers importlib_resources py-spy wandb beaker-gantry click torchmetrics safetensors datasets scikit-learn "msgspec>=0.14.0" "smashed[remote]>=0.21.1"

RUN apt-get clean

0 comments on commit 3a2d62e

Please sign in to comment.