# Requires the latest version of Docker Compose.
# Docker Compose V2 is recommended.
# `docker-compose.yaml` files cannot use shell outputs as their inputs.
# See https://docs.docker.com/compose/compose-file/compose-file-v3
# for a guide on how to interpret the `docker-compose.yaml` file.
# Variables are in ${VARIABLE:-DEFAULT_VALUE} format
# to ensure that default values are given to the Dockerfile.
# Users are recommended to use an `.env` file to set variables.
# Run `make env` to create a basic `.env` file with the UID and GID variables.
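# As an illustration only (values are examples, not requirements), a minimal
# `.env` file for this compose file might contain:
#   UID=1000
#   GID=1000
#   TZ=Asia/Seoul
#   TRAIN_NAME=train
# `make env` fills in the UID and GID automatically; anything left unset falls
# back to the defaults declared below.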
services:
  train: # Service name. Change the name as necessary for each project.
    env_file: # Explicitly specifying the `.env` file causes an error if it does not exist.
      - ${ENV_FILE:-.env} # Forcing users to create a `.env` file before using compose, thus preventing errors.
    # Remember to use different image names for different users and projects.
    # Otherwise, images will be repeatedly removed and recreated.
    # The removed images will remain cached, however.
    # If no image with the specified image name exists,
    # a new image with that name will be created.
    image: pytorch_source:${TRAIN_NAME:-train}
    # `ipc: host` is a known security vulnerability, but it removes the shared memory cap.
    ipc: host # Equivalent to `--ipc=host` in `docker run`. Disable this for WSL.
    tty: true # Equivalent to `-t` flag in `docker run`.
    init: true # Equivalent to `--init` flag in `docker run`.
    stdin_open: true # Equivalent to `-i` flag in `docker run`.
    # Docker volumes are the preferred method for connecting to the host filesystem.
    # Setting `HOST_PATH:CONTAINER_PATH` allows the container to access `HOST_PATH` as `CONTAINER_PATH`.
    # See https://docs.docker.com/storage/volumes for details.
    volumes: # Add volumes as necessary. Equivalent to `-v` flag in `docker run`.
      - $PWD:${PROJECT_ROOT:-/opt/project}
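      # Additional host directories can be mounted the same way; for example, a
      # hypothetical data directory (path shown purely as an illustration):
      #   - /path/to/data:/mnt/data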
    build: # Options for building. Used when `--build` is called in `docker compose`.
      target: train # Specify build target.
      context: . # `.dockerignore` should remove all context, making this equivalent to the `Makefile` results.
      dockerfile: Dockerfile
      cache_from: # Useful if cache images have been created with the Makefile commands beforehand.
        - pytorch_source:${INSTALL_NAME:-build_install}
        - pytorch_source:${TORCH_NAME:-'build_torch-v1.10.0'}
      # All arguments given during the build must be re-specified
      # in `args` to prevent a cache miss from occurring.
      # Default values of the `Dockerfile` (but not the `Makefile`) may be omitted.
      args: # Equivalent to `--build-arg`.
        TORCH_CUDA_ARCH_LIST: ${CC:-'5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX'}
        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-'v1.10.0'}
        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-'v0.11.1'}
        TORCHTEXT_VERSION_TAG: ${TORCHTEXT_VERSION_TAG:-'v0.11.0-rc3'}
        TORCHAUDIO_VERSION_TAG: ${TORCHAUDIO_VERSION_TAG:-'v0.10.0'}
        PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
        GID: ${GID:-1000} # Run `id -g` on the terminal to find your GID. Check that it is set properly in `.env`.
        UID: ${UID:-1000} # Run `id -u` on the terminal to find your UID. Check that it is set properly in `.env`.
        TZ: ${TZ:-Asia/Seoul} # Used during the build.
    working_dir: ${PROJECT_ROOT:-/opt/project}
    # ports: # Only necessary for Tensorboard, Jupyter, etc. Also not necessary if the VSCode terminal is being used.
    #   - ${PORT:-8080}:22
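    # If ports are exposed, an illustrative (not prescriptive) mapping for
    # TensorBoard and Jupyter would look like the following:
    #   ports:
    #     - '6006:6006' # TensorBoard's default port.
    #     - '8888:8888' # Jupyter's default port.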
    user: ${UID:-1000}:${GID:-1000}
    environment: # Environment variables for the container, not the build. Equivalent to `--env`.
      TZ: ${TZ:-Asia/Seoul} # Used during runtime.
      CUDA_DEVICE_ORDER: PCI_BUS_ID
    deploy: # API dependent on compose version.
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [ gpu ]
              # device_ids: [ '0' ] # Use only GPU 0.
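    # A usage sketch, assuming Docker Compose V2 (the shell available inside the
    # container depends on how the Dockerfile configures the image):
    #   docker compose up -d --build train  # Build if necessary and start the service.
    #   docker compose exec train bash      # Open an interactive shell in the container.
    #   docker compose down                 # Stop and remove the container when finished.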
  # Perhaps placing unrelated services in the same file is not best practice,
  # but I did not wish to create another file. Users are generally expected
  # to use either the `train` or `full` services but not both simultaneously.
  # Perhaps using different variable names for the default and `full` builds
  # would have been better, but I wished to match the `Makefile` exactly.
  # Use a separate `full.env` file for different configurations if necessary.
  full: # Service for `*-full` installs.
    env_file:
      - ${ENV_FILE:-.env}
    image: pytorch_source:${TRAIN_NAME:-train}
    ipc: host
    tty: true
    init: true
    stdin_open: true
    volumes:
      - $PWD:${PROJECT_ROOT:-/opt/project}
    build:
      target: train
      context: .
      dockerfile: Dockerfile
      cache_from:
        # Note that `INSTALL_NAME_FULL` and `TORCH_NAME_FULL` are different
        # variables from the ones used by the `train` service.
        # Setting them incorrectly may cause a cache miss.
        - pytorch_source:${INSTALL_NAME_FULL:-'build_install-ubuntu18.04-cuda10.2-cudnn8-py3.9'}
        - pytorch_source:${TORCH_NAME_FULL:-'build_torch-v.1.9.1-ubuntu18.04-cuda10.2-cudnn8-py3.9'}
      args: # Equivalent to `--build-arg`. Set to default values for `*-full`.
        LINUX_DISTRO: ${LINUX_DISTRO:-ubuntu}
        DISTRO_VERSION: ${DISTRO_VERSION:-18.04}
        CUDA_VERSION: ${CUDA_VERSION:-10.2}
        CUDNN_VERSION: ${CUDNN_VERSION:-8}
        PYTHON_VERSION: ${PYTHON_VERSION:-3.9}
        MAGMA_VERSION: ${MAGMA_VERSION:-102}
        TORCH_CUDA_ARCH_LIST: ${CC:-'5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX'}
        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-'v1.10.0'}
        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-'v0.11.1'}
        TORCHTEXT_VERSION_TAG: ${TORCHTEXT_VERSION_TAG:-'v0.11.0-rc3'}
        TORCHAUDIO_VERSION_TAG: ${TORCHAUDIO_VERSION_TAG:-'v0.10.0'}
        PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
        GID: ${GID:-1000}
        UID: ${UID:-1000}
        TZ: ${TZ:-Asia/Seoul}
    working_dir: ${PROJECT_ROOT:-/opt/project}
    user: ${UID:-1000}:${GID:-1000}
    environment:
      TZ: ${TZ:-Asia/Seoul}
      CUDA_DEVICE_ORDER: PCI_BUS_ID
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [ gpu ]
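    # The defaults above can be overridden from `.env`, or from a separate
    # `full.env` passed in through `ENV_FILE`. Purely as an illustration (not a
    # tested configuration; check that a matching CUDA base image and MAGMA
    # package exist before building):
    #   CUDA_VERSION=11.3
    #   MAGMA_VERSION=113
    #   PYTHON_VERSION=3.8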
  ngc: # NGC image service.
    env_file:
      - ${ENV_FILE:-.env}
    image: pytorch_source:ngc-${YEAR:-21}.${MONTH:-10}
    ipc: host
    tty: true
    init: true
    stdin_open: true
    volumes:
      - $PWD:${PROJECT_ROOT:-/opt/project}
    build:
      target: ngc
      context: .
      dockerfile: ngc.Dockerfile
      cache_from:
        - nvcr.io/nvidia/pytorch:${YEAR:-21}.${MONTH:-10}-py3
      args:
        PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
        YEAR: ${YEAR:-21}
        MONTH: ${MONTH:-10}
        GID: ${GID:-1000}
        UID: ${UID:-1000}
        TZ: ${TZ:-Asia/Seoul}
    working_dir: ${PROJECT_ROOT:-/opt/project}
    user: ${UID:-1000}:${GID:-1000}
    environment:
      TZ: ${TZ:-Asia/Seoul}
      CUDA_DEVICE_ORDER: PCI_BUS_ID
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [ gpu ]
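    # The image tags follow NGC's `YY.MM` release scheme. To target a different
    # monthly release, set YEAR and MONTH in `.env` (e.g., YEAR=21 and MONTH=12,
    # shown only as an illustration) and check that the corresponding
    # `nvcr.io/nvidia/pytorch:YY.MM-py3` tag exists on NGC.
    # A usage sketch, assuming Docker Compose V2:
    #   docker compose up -d --build ngc
    #   docker compose exec ngc bash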