# See usages of this compose file and its commands in the README.md file
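# e.g. a typical local invocation (one of several workflows -- see README.md for the full list):
#   docker-compose --profile usaspending up
# brings up the API, its Postgres DB, the bulk-download worker, Elasticsearch, and Kibana defined below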
version: '3.7'
volumes:
local_pg_data:
driver: local
local_es_data:
driver: local
services:
usaspending-db:
command: postgres -c 'max_connections=500'
profiles:
- usaspending # must pass --profile usaspending to docker-compose for this to come up, or run a service with one of these other profiles
- manage
- test
- ci
image: postgres:13.8-alpine
container_name: usaspending-db
volumes:
- type: volume
source: local_pg_data
target: /var/lib/postgresql/data
ports:
- ${USASPENDING_DB_PORT:-5432}:5432
environment:
POSTGRES_USER: ${USASPENDING_DB_USER:-usaspending}
POSTGRES_PASSWORD: ${USASPENDING_DB_PASSWORD:-usaspender}
POSTGRES_DB: ${USASPENDING_DB_NAME:-data_store_api}
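  # To bring up and connect to just the database (a minimal sketch, using the default credentials/port above):
  #   docker-compose --profile usaspending up usaspending-db
  #   psql postgres://usaspending:usaspender@localhost:5432/data_store_api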
usaspending-manage:
profiles:
- manage # must pass --profile manage to docker-compose for this to come up, or use docker-compose run
    image: usaspending-backend # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
build: .
container_name: usaspending-manage
volumes:
- .:/dockermount
depends_on:
- usaspending-db
# For an interactive shell, override this command with: docker-compose run --rm usaspending-manage python3 -u manage.py shell
command: python3 -u manage.py help
environment:
DJANGO_DEBUG: ${DJANGO_DEBUG}
DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
ES_HOSTNAME: ${ES_HOSTNAME}
DATA_BROKER_DATABASE_URL: postgresql://${BROKER_DB_USER}:${BROKER_DB_PASSWORD}@${BROKER_DB_HOST}:${BROKER_DB_PORT}/data_broker
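  # e.g. to apply Django migrations (assumes the standard manage.py migrate command; see README.md for the full setup flow):
  #   docker-compose --profile manage run --rm usaspending-manage python3 -u manage.py migrate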
usaspending-test:
profiles:
- test # must pass --profile test to docker-compose for this to come up, or use docker-compose run
    image: usaspending-testing # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
build:
context: .
dockerfile: Dockerfile.testing
container_name: usaspending-test
volumes:
- .:/dockermount
# Required to interact with host's docker daemon from within this running container,
# to spin up the data-act-broker-init-test-db container used for broker integration tests (see: conftest.broker_db_setup)
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- usaspending-db
- usaspending-es
command: make tests
environment:
DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
ES_HOST: ${ES_HOST}
ES_HOSTNAME: ${ES_HOSTNAME}
DATA_BROKER_DATABASE_URL: postgresql://${BROKER_DB_USER}:${BROKER_DB_PASSWORD}@${BROKER_DB_HOST}:${BROKER_DB_PORT}/data_broker
# Location in host machine where broker src code root can be found
DATA_BROKER_SRC_PATH: "$PWD/../data-act-broker-backend"
MINIO_HOST: ${MINIO_HOST}
DOWNLOAD_DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
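  # To run a subset of tests instead of the full `make tests` target (a sketch; the path is illustrative, and any
  # required MinIO/broker setup is covered in the README and Makefile):
  #   docker-compose --profile test run --rm usaspending-test pytest usaspending_api -rsx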
usaspending-ci:
profiles:
- ci # must pass --profile ci to docker-compose for this to come up, or use docker-compose run
    image: usaspending-backend # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
build: .
container_name: usaspending-ci
volumes:
- .:/dockermount
# Required to interact with host's docker daemon from within this running container,
# to spin up the data-act-broker-init-test-db container used for broker integration tests (see: conftest.broker_db_setup)
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- usaspending-db
- usaspending-es
command:
- sh
- -c
- |
printf "==============\nChecking code format:\n"
black --check --diff .
printf -- "-------\nChecking code syntax:\n"
flake8 && echo "Successfully passed"
printf -- "-------\nChecking API documentation files:\n"
python3 manage.py check_for_endpoint_documentation
printf -- "-------\nRunning unit tests:\n"
pytest --durations 50 --ignore-glob='**/tests/integration/*' --cov=usaspending_api --cov-report= --reuse-db -rsx
printf -- "-------\nRunning integration tests:\n"
pytest --durations 50 --override-ini=python_files='**/tests/integration/*' --cov=usaspending_api --cov-append --cov-report term --cov-report xml:coverage.xml --reuse-db -rsx
environment:
DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
ES_HOSTNAME: ${ES_HOSTNAME}
DATA_BROKER_DATABASE_URL: postgresql://${BROKER_DB_USER}:${BROKER_DB_PASSWORD}@${BROKER_DB_HOST}:${BROKER_DB_PORT}/data_broker
# Location in host machine where broker src code root can be found
DATA_BROKER_SRC_PATH: "$PWD/../data-act-broker-backend"
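  # To run the full set of CI checks locally (format, lint, endpoint docs, unit + integration tests, per the command above):
  #   docker-compose --profile ci up usaspending-ci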
usaspending-api:
profiles:
- usaspending # must pass --profile usaspending to docker-compose for this to come up
    image: usaspending-backend # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
build: .
container_name: usaspending-api
volumes:
- .:/dockermount
ports:
- 8000:8000
depends_on:
- usaspending-db
- usaspending-es
    restart: on-failure:3 # 3 attempts max, then it stops restarting
# Must wait on postgres db to be up (~9s)
command: /bin/sh -c "sleep 9s; python3 -u manage.py runserver --verbosity 2 0.0.0.0:8000"
environment:
DJANGO_DEBUG: ${DJANGO_DEBUG}
RUN_LOCAL_DOWNLOAD_IN_PROCESS: "False"
DB_SOURCE: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
DB_R1: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
DOWNLOAD_DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
ES_HOSTNAME: ${ES_HOSTNAME}
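  # Once the container reports the dev server is running, a quick smoke test from the host (port mapped above):
  #   curl -I http://localhost:8000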
usaspending-bulk-download:
profiles:
- usaspending # must pass --profile usaspending to docker-compose for this to come up
    image: usaspending-backend # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
build: .
container_name: usaspending-bulk-download
    restart: on-failure:5 # 5 attempts max, then it stops restarting. NOTE: each bulk download error causes one failure+restart iteration
volumes:
- .:/dockermount
command: python3 manage.py download_sqs_worker
environment:
DJANGO_DEBUG: ${DJANGO_DEBUG}
DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
DOWNLOAD_DATABASE_URL: postgres://${USASPENDING_DB_USER}:${USASPENDING_DB_PASSWORD}@${USASPENDING_DB_HOST}:${USASPENDING_DB_PORT}/data_store_api
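  # To watch the download worker while it processes queue messages (standard docker-compose log tailing):
  #   docker-compose --profile usaspending logs -f usaspending-bulk-download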
usaspending-es:
profiles:
- usaspending # must pass --profile usaspending to docker-compose for this to come up
- test
- ci
image: docker.elastic.co/elasticsearch/elasticsearch:7.1.1
container_name: usaspending-es
environment:
- node.name=usaspending-es
- discovery.seed_hosts=usaspending-es
- cluster.initial_master_nodes=usaspending-es
- cluster.name=usaspending
- network.host=0.0.0.0
- bootstrap.memory_lock=true
- "ES_JAVA_OPTS=-Xms1536m -Xmx1536m" # Ensure Docker is allocated plenty of memory, otherwise this will fail
    # Inject the plugin install, then resume with the original entrypoint command
command: >
/bin/sh -c "
if [ ! -d /usr/share/elasticsearch/plugins/mapper-murmur3 ]; then
# Certificate problem workaround when on VPN - wget without checking cert, then install from local filesystem
wget --no-check-certificate https://artifacts.elastic.co/downloads/elasticsearch-plugins/mapper-murmur3/mapper-murmur3-7.1.1.zip
./bin/elasticsearch-plugin install file:///usr/share/elasticsearch/mapper-murmur3-7.1.1.zip
fi
/usr/local/bin/docker-entrypoint.sh"
ulimits:
memlock:
soft: -1
hard: -1
volumes:
- type: volume
source: local_es_data
target: /usr/share/elasticsearch/data
ports:
- 9200:9200
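  # Quick checks from the host once the container is up (standard Elasticsearch APIs; port mapped above):
  #   curl "http://localhost:9200/_cluster/health?pretty"
  #   curl "http://localhost:9200/_cat/plugins"   # should include mapper-murmur3 after the install above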
usaspending-kibana-es:
profiles:
- usaspending # must pass --profile usaspending to docker-compose for this to come up
image: docker.elastic.co/kibana/kibana-oss:7.1.1
container_name: usaspending-kibana-es
# ELASTICSEARCH_HOSTS should match the port for "usaspending-es"; value will need to be updated if using Windows
environment:
- ELASTICSEARCH_HOSTS="http://docker.for.mac.localhost:9200"
ports:
- 5601:5601
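  # Kibana should then be reachable from the host at http://localhost:5601 (e.g. curl http://localhost:5601/api/status)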
minio:
profiles: # must pass one of these with --profile to docker-compose
- s3
- spark
- test
image: minio/minio:RELEASE.2022-04-12T06-55-35Z
container_name: minio
volumes:
- .:/dockermount
- type: bind
source: ${MINIO_DATA_DIR:-../data/s3}
target: /data
ports:
- ${MINIO_PORT:-10001}:10001
- ${MINIO_CONSOLE_PORT:-10002}:10002
environment:
MINIO_ACCESS_KEY: ${MINIO_ACCESS_KEY:-usaspending}
MINIO_SECRET_KEY: ${MINIO_SECRET_KEY:-usaspender}
entrypoint: >
/bin/sh -c "
      # Create the bucket within MinIO and the file path for the data dictionary
mkdir -p data/dti-da-public-files-nonprod/user_reference_docs
# Create the bucket within MinIO used for endpoints that list generated downloads
mkdir -p data/bulk_download
cp dockermount/usaspending_api/data/Data_Dictionary_Crosswalk.xlsx data/dti-da-public-files-nonprod/user_reference_docs/Data_Dictionary_Crosswalk.xlsx
minio server --address ":10001" --console-address ":10002" /data
"
healthcheck:
test: ["CMD", "curl", "-f", "http://${MINIO_HOST:-localhost}:${MINIO_PORT:-10001}/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
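  # Example interaction with the local MinIO (a sketch; assumes a recent MinIO client `mc`, and the default
  # credentials/ports above):
  #   mc alias set local http://localhost:10001 usaspending usaspender
  #   mc ls local/
  # or use the web console mapped at http://localhost:10002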
spark-master:
profiles:
- spark # must pass --profile spark to docker-compose for this to come up
- test
    image: spark-base # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
# build context path needs to be relative to project root, from where docker-compose will be run
build:
context: .
dockerfile: Dockerfile.spark
args:
PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
container_name: spark-master
environment:
SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
SPARK_MASTER_WEBUI_PORT: ${SPARK_MASTER_WEBUI_PORT:-4040}
command: >
/bin/sh -c "
$${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.master.Master \
--port $${SPARK_MASTER_PORT} \
--webui-port $${SPARK_MASTER_WEBUI_PORT}"
ports:
- ${SPARK_MASTER_PORT:-7077}:7077
- ${SPARK_MASTER_WEBUI_PORT:-4040}:4040
volumes:
- type: bind
source: .
target: /project
read_only: false
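  # After `docker-compose --profile spark up`, the master web UI (port mapped above, default 4040) can be used to
  # confirm that workers have registered: http://localhost:4040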
spark-worker:
profiles:
- spark # must pass --profile spark to docker-compose for this to come up
- test
    image: spark-base # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
# build context path needs to be relative to project root, from where docker-compose will be run
build:
context: .
dockerfile: Dockerfile.spark
args:
PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
container_name: spark-worker
depends_on:
- spark-master
environment:
SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
SPARK_WORKER_WEBUI_PORT: ${SPARK_WORKER_WEBUI_PORT:-4041}
command: /bin/sh -c "$${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $${SPARK_WORKER_WEBUI_PORT} spark://$${SPARK_MASTER_HOST}:$${SPARK_MASTER_PORT}"
ports:
- ${SPARK_WORKER_WEBUI_PORT:-4041}:4041
volumes:
- type: bind
source: .
target: /project
read_only: false
spark-history-server:
profiles:
- spark # must pass --profile spark to docker-compose for this to come up
- test
    image: spark-base # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
# build context path needs to be relative to project root, from where docker-compose will be run
build:
context: .
dockerfile: Dockerfile.spark
args:
PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
container_name: spark-history-server
environment:
SPARK_HISTORY_SERVER_PORT: ${SPARK_HISTORY_SERVER_PORT:-18080}
command: /bin/sh -c "$${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.history.HistoryServer"
ports:
- ${SPARK_HISTORY_SERVER_PORT:-18080}:18080
volumes:
- type: bind
source: .
target: /project
read_only: false
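  # The History Server UI is reachable from the host at http://localhost:18080 (port mapped above); it only lists
  # applications whose Spark event logs were written somewhere the server can read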
  # Example of running the spark-submit container:
# NOTE: double check package dependency versions here with those used in unit tests (conftest_spark.py), as these docs could have gotten stale
# (1) Review config values in usaspending_api/config/envs/local.py and override any as needed in a .env file or -e environment variable
# (2) Deploy minio in docker container (see README.md)
# (3) Deploy the postgres DB docker container, if your script connects to a DB.
# - If so, also export a JDBC_URL environment variable to pass in to the spark-submit container so it can find its connection
# (4) If reading or writing to S3, make sure the bucket given by the value of config setting CONFIG.AWS_S3_BUCKET exists
# - e.g. create via UI at http://localhost:10001
# - or use MinIO client CLI: mc mb local/data
  # (5) Run the spark-submit container, citing the dependent packages:
# (NOTEs:
# - postgresql is needed as a JDBC driver, if connecting to a Postgres DB
# - delta-core is needed to read/write in Delta Lake format
  # - hadoop-aws is needed for S3AFileSystem, which is used to write data to S3,
  #   and should match the hadoop version in your local setup
  # - NOTE that specifying hadoop-aws should pull in the required version of the aws-java-sdk on its own
# - spark-hive is needed to use a hive metastore_db of schemas and tables
# - the Docker image at this time only installs spark and hadoop standalone, which does not seem to include all the needed Hive jars
#
# make docker-compose-run profiles="--profile spark" args="--rm -e MINIO_HOST=minio -e JDBC_URL -e COMPONENT_NAME='My Spark Prototype Script' spark-submit \
# --packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
# /project/usaspending_api/etl/tests/path_to_your_spark_prototype_script.py"
spark-submit:
profiles:
- spark # must pass --profile spark to docker-compose for this to come up
- test
    image: spark-base # if no image with this name exists locally and a build is forced, the built image will be tagged with this name
# build context path needs to be relative to project root, from where docker-compose will be run
build:
context: .
dockerfile: Dockerfile.spark
args:
PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
container_name: spark-submit
depends_on:
- spark-master
- spark-worker
- spark-history-server
- minio
environment:
SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
      # i.e. the target where the host warehouse dir is bound in the volume config below.
      # This env var needs to be picked up as the value of the spark.sql.warehouse.dir spark conf setting when SparkSessions are created inside of a spark-submitted job
SPARK_SQL_WAREHOUSE_DIR: /spark-warehouse
      # i.e. a metastore_db subdirectory of the target where the host warehouse dir is bound in the volume config below.
      # This env var needs to be picked up as the path part of the spark.hadoop.javax.jdo.option.ConnectionURL spark conf setting when SparkSessions are created inside of a spark-submitted job
HIVE_METASTORE_DERBY_DB_DIR: /spark-warehouse/metastore_db
PYTHONPATH: "/project"
    # NOTE: env var references in the entrypoint are NOT expanded inside the container at runtime; anything left
    # un-interpolated is passed through literally. So by using a single $ rather than $$, these vars are interpolated
    # by docker-compose from the current SHELL ENV (or .env) when it runs, before the entrypoint is used.
    # The values for these vars under this service's environment: element are NOT used here; they are merely passed
    # into the container. KEEP the two references to these vars and their defaults consistent!
    # To see what the resolved value will be, run docker-compose config (i.e. make docker-compose-config in this project's Makefile)
entrypoint: ./bin/spark-submit --master spark://${SPARK_MASTER_HOST:-spark-master}:${SPARK_MASTER_PORT:-7077}
command: --help
volumes:
- type: bind
source: .
target: /project
read_only: false
# NOTE: The hive metastore_db Derby database folder is expected to be configured to show up as a subfolder of the spark-warehouse dir
- type: bind
source: ${SPARK_SQL_WAREHOUSE_DIR:-./spark-warehouse}
target: /spark-warehouse
# Mount the JAR dependencies local repo on host into container to take advantage of caching/reuse
# i.e., to download the dependencies only once and reuse on subsequent docker-compose run calls
- type: bind
source: ${HOME}/.ivy2
target: /root/.ivy2
read_only: false
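  # A simple sanity check of this service (a sketch; arguments given to `run` replace the default --help command
  # above and are appended to the spark-submit entrypoint):
  #   docker-compose --profile spark run --rm spark-submit --version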