# Requires the latest version of Docker Compose.
# Docker Compose V2 is recommended.
# `docker-compose.yaml` files cannot use shell outputs as their inputs.
# See https://docs.docker.com/compose/compose-file/compose-file-v3
# for a guide on how to interpret the `docker-compose.yaml` file.
# Variables are in ${VARIABLE:-DEFAULT_VALUE} format
# to ensure that default values are given to the Dockerfile.
# Users are recommended to use an `.env` file to set variables.
# Run `make env` to create a basic `.env` file with the UID and GID variables.
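# As an illustration only (values are examples, not requirements), a minimal
# `.env` file for this compose file might contain:
#   UID=1000
#   GID=1000
#   TZ=Asia/Seoul
#   TRAIN_NAME=train
# `make env` fills in the UID and GID automatically; anything left unset falls
# back to the defaults declared below.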
services:
  train: # Service name. Change the name as necessary for each project.
    env_file: # Explicitly specifying the `.env` file causes an error if it does not exist.
      - ${ENV_FILE:-.env} # Forcing users to create a `.env` file before using compose, thus preventing errors.
    # Remember to use different image names for different users and projects.
    # Otherwise, images will be repeatedly removed and recreated.
    # The removed images will remain cached, however.
    # If no image with the specified image name exists,
    # a new image with that name will be created.
    image: pytorch_source:${TRAIN_NAME:-train}
    # `ipc: host` is a known security vulnerability, but it removes the shared memory cap.
    ipc: host # Equivalent to `--ipc=host` in `docker run`. Disable this for WSL.
    tty: true # Equivalent to `-t` flag in `docker run`.
    init: true # Equivalent to `--init` flag in `docker run`.
    stdin_open: true # Equivalent to `-i` flag in `docker run`.
    # Docker volumes are the preferred method for connecting to the host filesystem.
    # Setting `HOST_PATH:CONTAINER_PATH` allows the container to access `HOST_PATH` as `CONTAINER_PATH`.
    # See https://docs.docker.com/storage/volumes for details.
    volumes: # Add volumes as necessary. Equivalent to `-v` flag in `docker run`.
      - $PWD:${PROJECT_ROOT:-/opt/project}
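      # Additional host directories can be mounted the same way; for example, a
      # hypothetical data directory (path shown purely as an illustration):
      #   - /path/to/data:/mnt/data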
    build: # Options for building. Used when `--build` is called in `docker compose`.
      target: train # Specify build target.
      context: . # `.dockerignore` should remove all context, making this equivalent to the `Makefile` results.
      dockerfile: Dockerfile
      cache_from: # Useful if cache images have been created with the Makefile commands beforehand.
        - pytorch_source:${INSTALL_NAME:-build_install}
        - pytorch_source:${TORCH_NAME:-'build_torch-v1.10.0'}
      # All arguments given during the build must be re-specified
      # in `args` to prevent a cache miss from occurring.
      # Default values of the `Dockerfile` (but not the `Makefile`) may be omitted.
      args: # Equivalent to `--build-arg`.
        TORCH_CUDA_ARCH_LIST: ${CC:-'5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX'}
        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-'v1.10.0'}
        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-'v0.11.1'}
        TORCHTEXT_VERSION_TAG: ${TORCHTEXT_VERSION_TAG:-'v0.11.0-rc3'}
        TORCHAUDIO_VERSION_TAG: ${TORCHAUDIO_VERSION_TAG:-'v0.10.0'}
        PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
        GID: ${GID:-1000} # Run `id -g` on the terminal to find your GID. Check that it is set properly in `.env`.
        UID: ${UID:-1000} # Run `id -u` on the terminal to find your UID. Check that it is set properly in `.env`.
        TZ: ${TZ:-Asia/Seoul} # Used during the build.
    working_dir: ${PROJECT_ROOT:-/opt/project}
    # ports: # Only necessary for Tensorboard, Jupyter, etc. Also not necessary if the VSCode terminal is being used.
    #   - ${PORT:-8080}:22
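    # If ports are exposed, an illustrative (not prescriptive) mapping for
    # TensorBoard and Jupyter would look like the following:
    #   ports:
    #     - '6006:6006' # TensorBoard's default port.
    #     - '8888:8888' # Jupyter's default port.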
    user: ${UID:-1000}:${GID:-1000}
    environment: # Environment variables for the container, not the build. Equivalent to `--env`.
      TZ: ${TZ:-Asia/Seoul} # Used during runtime.
      CUDA_DEVICE_ORDER: PCI_BUS_ID
    deploy: # API dependent on compose version.
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [ gpu ]
              # device_ids: [ '0' ] # Use only GPU 0.
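    # A usage sketch, assuming Docker Compose V2 (the shell available inside the
    # container depends on how the Dockerfile configures the image):
    #   docker compose up -d --build train  # Build if necessary and start the service.
    #   docker compose exec train bash      # Open an interactive shell in the container.
    #   docker compose down                 # Stop and remove the container when finished.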
  # Perhaps placing unrelated services in the same file is not best practice,
  # but I did not wish to create another file. Users are generally expected
  # to use either the `train` or `full` services but not both simultaneously.
  # Perhaps using different variable names for the default and `full` builds
  # would have been better, but I wished to match the `Makefile` exactly.
  # Use a separate `full.env` file for different configurations if necessary.
  full: # Service for `*-full` installs.
    env_file:
      - ${ENV_FILE:-.env}
    image: pytorch_source:${TRAIN_NAME:-train}
    ipc: host
    tty: true
    init: true
    stdin_open: true
    volumes:
      - $PWD:${PROJECT_ROOT:-/opt/project}
    build:
      target: train
      context: .
      dockerfile: Dockerfile
      cache_from:
        # Note that `INSTALL_NAME_FULL` and `TORCH_NAME_FULL` are different
        # variables from the ones used by the `train` service.
        # Setting them incorrectly may cause a cache miss.
        - pytorch_source:${INSTALL_NAME_FULL:-'build_install-ubuntu18.04-cuda10.2-cudnn8-py3.9'}
        - pytorch_source:${TORCH_NAME_FULL:-'build_torch-v.1.9.1-ubuntu18.04-cuda10.2-cudnn8-py3.9'}
      args: # Equivalent to `--build-arg`. Set to default values for `*-full`.
        LINUX_DISTRO: ${LINUX_DISTRO:-ubuntu}
        DISTRO_VERSION: ${DISTRO_VERSION:-18.04}
        CUDA_VERSION: ${CUDA_VERSION:-10.2}
        CUDNN_VERSION: ${CUDNN_VERSION:-8}
        PYTHON_VERSION: ${PYTHON_VERSION:-3.9}
        MAGMA_VERSION: ${MAGMA_VERSION:-102}
        TORCH_CUDA_ARCH_LIST: ${CC:-'5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX'}
        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-'v1.10.0'}
        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-'v0.11.1'}
        TORCHTEXT_VERSION_TAG: ${TORCHTEXT_VERSION_TAG:-'v0.11.0-rc3'}
        TORCHAUDIO_VERSION_TAG: ${TORCHAUDIO_VERSION_TAG:-'v0.10.0'}
        PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
        GID: ${GID:-1000}
        UID: ${UID:-1000}
        TZ: ${TZ:-Asia/Seoul}
    working_dir: ${PROJECT_ROOT:-/opt/project}
    user: ${UID:-1000}:${GID:-1000}
    environment:
      TZ: ${TZ:-Asia/Seoul}
      CUDA_DEVICE_ORDER: PCI_BUS_ID
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [ gpu ]
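    # The defaults above can be overridden from `.env`, or from a separate
    # `full.env` passed in through `ENV_FILE`. Purely as an illustration (not a
    # tested configuration; check that a matching CUDA base image and MAGMA
    # package exist before building):
    #   CUDA_VERSION=11.3
    #   MAGMA_VERSION=113
    #   PYTHON_VERSION=3.8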
  ngc: # NGC image service.
    env_file:
      - ${ENV_FILE:-.env}
    image: pytorch_source:ngc-${YEAR:-21}.${MONTH:-10}
    ipc: host
    tty: true
    init: true
    stdin_open: true
    volumes:
      - $PWD:${PROJECT_ROOT:-/opt/project}
    build:
      target: ngc
      context: .
      dockerfile: ngc.Dockerfile
      cache_from:
        - nvcr.io/nvidia/pytorch:${YEAR:-21}.${MONTH:-10}-py3
      args:
        PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
        YEAR: ${YEAR:-21}
        MONTH: ${MONTH:-10}
        GID: ${GID:-1000}
        UID: ${UID:-1000}
        TZ: ${TZ:-Asia/Seoul}
    working_dir: ${PROJECT_ROOT:-/opt/project}
    user: ${UID:-1000}:${GID:-1000}
    environment:
      TZ: ${TZ:-Asia/Seoul}
      CUDA_DEVICE_ORDER: PCI_BUS_ID
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [ gpu ]
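    # The image tags follow NGC's `YY.MM` release scheme. To target a different
    # monthly release, set YEAR and MONTH in `.env` (e.g., YEAR=21 and MONTH=12,
    # shown only as an illustration) and check that the corresponding
    # `nvcr.io/nvidia/pytorch:YY.MM-py3` tag exists on NGC.
    # A usage sketch, assuming Docker Compose V2:
    #   docker compose up -d --build ngc
    #   docker compose exec ngc bash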