# docker-compose-vllm.yml
---
# Compose stack with two services on a shared `h2ogpt` network:
#   * `vllm`   - serves vllm.entrypoints.openai.api_server on port 5000.
#   * `h2ogpt` - runs /workspace/generate.py and uses `vllm:5000` as its
#                inference server; started only once `vllm` is healthy.
#
# Variables interpolated by Compose (from the shell environment or a .env file):
#   H2OGPT_PORT        host port published for container port 7860
#   H2OGPT_BASE_MODEL  value passed to generate.py as --base_model
#   H2OGPT_VLLM_ARGS   extra arguments appended to the vLLM server command

# NOTE(review): recent Docker Compose releases reportedly ignore the top-level
# `version` key; kept for older tooling - confirm before removing.
version: '3'

services:
  h2ogpt:
    # Image is built from the Dockerfile in this directory.
    build:
      context: .
      dockerfile: Dockerfile
    restart: always
    shm_size: '2gb'
    # Wait for the vllm healthcheck to pass before starting.
    depends_on:
      vllm:
        condition: service_healthy
    ports:
      - '${H2OGPT_PORT}:7860'
    volumes:
      - cache:/workspace/.cache
      - save:/workspace/save
    networks:
      - h2ogpt
    # List form: each item is passed as one argument, with no shell in between.
    # NOTE(review): presumably the image's entrypoint is a Python interpreter
    # that receives generate.py as its script - verify against the Dockerfile.
    command:
      - /workspace/generate.py
      # The inner double quotes are part of the argument value itself.
      # NOTE(review): presumably generate.py's CLI parser accepts them - verify.
      - '--inference_server="vllm:vllm:5000"'
      - '--base_model=${H2OGPT_BASE_MODEL}'
      - --langchain_mode=UserData
    # Reserve host GPUs 2 and 3 for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['2', '3']
              capabilities: [gpu]

  vllm:
    image: vllm/vllm-openai:latest
    restart: always
    shm_size: '64gb'
    # Port 5000 is exposed to other services only; it is not published on the host.
    expose:
      - '5000'
    volumes:
      - cache:/workspace/.cache
    networks:
      - h2ogpt
    entrypoint: python3
    # Folded scalar: the lines below are joined with single spaces into one
    # command string.
    command: >-
      -m vllm.entrypoints.openai.api_server
      --port=5000 --host=0.0.0.0
      ${H2OGPT_VLLM_ARGS}
    environment:
      - NCCL_IGNORE_DISABLED_P2P=1
    # Healthy once GET /v1/models succeeds; checked every 30s with a 5s
    # timeout, and marked unhealthy after 20 consecutive failures.
    healthcheck:
      test: ["CMD", "curl", "-f", "http://0.0.0.0:5000/v1/models"]
      interval: 30s
      timeout: 5s
      retries: 20
    # Reserve host GPUs 0 and 1 for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0', '1']
              capabilities: [gpu]

# Named volumes with default settings; `cache` is mounted by both services.
volumes:
  cache: {}
  save: {}

# Network joined by both services; `h2ogpt` reaches the server as `vllm`.
networks:
  h2ogpt: {}