diff --git a/Dockerfile b/Dockerfile
index fb46a635..19f88e28 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,7 +13,7 @@ ENV TERM linux
 # Work around initramfs-tools running on kernel 'upgrade':
 ENV INITRD No
 
-ENV AIRFLOW_VERSION 1.5.1
+ENV AIRFLOW_VERSION 1.5.2
 ENV AIRFLOW_HOME /usr/local/airflow
 ENV PYTHONLIBPATH /usr/lib/python2.7/dist-packages
 
@@ -34,6 +34,7 @@ RUN apt-get update -yqq \
     build-essential \
     && pip install --install-option="--install-purelib=$PYTHONLIBPATH" cryptography \
     && pip install --install-option="--install-purelib=$PYTHONLIBPATH" airflow==${AIRFLOW_VERSION} \
+    && pip install --install-option="--install-purelib=$PYTHONLIBPATH" airflow[celery]==${AIRFLOW_VERSION} \
     && pip install --install-option="--install-purelib=$PYTHONLIBPATH" airflow[mysql]==${AIRFLOW_VERSION} \
     && apt-get clean \
     && rm -rf \
diff --git a/circle.yml b/circle.yml
index a83efdb3..4d9eb3a4 100644
--- a/circle.yml
+++ b/circle.yml
@@ -10,4 +10,4 @@ test:
   pre:
     - sleep 5
   override:
-    - docker run puckel/docker-airflow version
+    - docker run puckel/docker-airflow version |grep '1.5.2'
diff --git a/config/airflow.cfg b/config/airflow.cfg
index 53a959ab..3092e9bc 100644
--- a/config/airflow.cfg
+++ b/config/airflow.cfg
@@ -1,46 +1,66 @@
 [core]
 # The home folder for airflow, default is ~/airflow
-airflow_home = /usr/local/airflow
+airflow_home = /usr/local/airflow
+
 # The folder where your airflow pipelines live, most likely a
 # subfolder in a code repository
 dags_folder = /usr/local/airflow/dags
+
 # The folder where airflow should store its log files
 base_log_folder = /usr/local/airflow/logs
+
 # The executor class that airflow should use. Choices include
 # SequentialExecutor, LocalExecutor, CeleryExecutor
-executor = CeleryExecutor
+executor = SequentialExecutor
+
 # The SqlAlchemy connection string to the metadata database.
 # SqlAlchemy supports many different database engine, more information
 # their website
-sql_alchemy_conn = mysql://airflow:airflow@mysqldb/airflow
+sql_alchemy_conn = mysql://airflow:airflow@mysql/airflow
+
 # The amount of parallelism as a setting to the executor. This defines
 # the max number of task instances that should run simultaneously
 # on this airflow installation
 parallelism = 32
+
 # Whether to load the examples that ship with Airflow. It's good to
 # get started, but you probably want to set this to False in a production
 # environment
 load_examples = True
+
 # Where your Airflow plugins are stored
 plugins_folder = /usr/local/airflow/plugins
+
 # Secret key to save connection passwords in the db
-fernet_key = {FERNET_KEY}
+fernet_key = $FERNET_KEY
+
+# Whether to disable pickling dags
+donot_pickle = False
 
 [webserver]
 # The base url of your website as airflow cannot guess what domain or
 # cname you are using. This is use in automated emails that
 # airflow sends to point links to the right web server
 base_url = http://localhost:8080
+
 # The ip specified when starting the web server
 web_server_host = 0.0.0.0
+
 # The port on which to run the web server
 web_server_port = 8080
+
 # Secret key used to run your flask app
 secret_key = temporary_key
+
+# number of threads to run the Gunicorn web server
+thread = 4
+
 # Expose the configuration file in the web server
 expose_config = true
+
 # Set to true to turn on authentication : http://pythonhosted.org/airflow/installation.html#web-authentication
 authenticate = False
+
 # Filter the list of dags by owner name (requires authentication to be enabled)
 filter_by_owner = False
 
@@ -58,28 +78,35 @@ smtp_mail_from = airflow@airflow.com
 
 [celery]
 # This section only applies if you are using the CeleryExecutor in
 # [core] section above
+
 # The app name that will be used by celery
 celery_app_name = airflow.executors.celery_executor
+
 # The concurrency that will be used when starting workers with the
 # "airflow worker" command. This defines the number of task instances that
 # a worker will take, so size up your workers based on the resources on
 # your worker box and the nature of your tasks
 celeryd_concurrency = 16
+
 # When you start an airflow worker, airflow starts a tiny web server
 # subprocess to serve the workers local log files to the airflow main
 # web server, who then builds pages and sends them to users. This defines
 # the port on which the logs are served. It needs to be unused, and open
 # visible from the main web server to connect into the workers.
 worker_log_server_port = 8793
+
 # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally
 # a sqlalchemy database. Refer to the Celery documentation for more
 # information.
 broker_url = amqp://airflow:airflow@rabbitmq:5672/airflow
+
 # Another key Celery setting
 celery_result_backend = amqp://airflow:airflow@rabbitmq:5672/airflow
+
 # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start
 # it `airflow flower`. This defines the port that Celery Flower runs on
 flower_port = 5555
+
 # Default queue that tasks get assigned to and that worker listen on.
 default_queue = default
@@ -88,12 +115,50 @@ default_queue = default
 # from the CLI or the UI), this defines the frequency at which they should
 # listen (in seconds).
 job_heartbeat_sec = 5
+
 # The scheduler constantly tries to trigger new tasks (look at the
 # scheduler section in the docs for more information). This defines
 # how often the scheduler should run (in seconds).
 scheduler_heartbeat_sec = 5
+
 # Statsd (https://github.com/etsy/statsd) integration settings
 # statsd_on = False
 # statsd_host = localhost
 # statsd_port = 8125
 # statsd_prefix = airflow
+
+[mesos]
+# Mesos master address which MesosExecutor will connect to.
+master = localhost:5050
+
+# The framework name which Airflow scheduler will register itself as on mesos
+framework_name = Airflow
+
+# Number of cpu cores required for running one task instance using
+# 'airflow run --local -p '
+# command on a mesos slave
+task_cpu = 1
+
+# Memory in MB required for running one task instance using
+# 'airflow run --local -p '
+# command on a mesos slave
+task_memory = 256
+
+# Enable framework checkpointing for mesos
+# See http://mesos.apache.org/documentation/latest/slave-recovery/
+checkpoint = False
+
+# Failover timeout in milliseconds.
+# When checkpointing is enabled and this option is set, Mesos waits until the configured timeout for
+# the MesosExecutor framework to re-register after a failover. Mesos shuts down running tasks if the
+# MesosExecutor framework fails to re-register within this timeframe.
+# failover_timeout = 604800
+
+# Enable framework authentication for mesos
+# See http://mesos.apache.org/documentation/latest/configuration/
+authenticate = False
+
+# Mesos credentials, if authentication is enabled
+# default_principal = admin
+# default_secret = admin
+
diff --git a/docker-compose.yml b/docker-compose.yml
index 4e83da7d..f1225e4a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,5 @@
-mysqldb:
-    container_name: mysqldb
+mysql:
+    container_name: mysql
     image: tutum/mysql
     restart: always
     ports:
@@ -11,15 +11,15 @@ mysqldb:
 
 rabbitmq:
     container_name: rabbitmq
-    image: puckel/docker-rabbitmq
+    image: rabbitmq:3-management
     restart: always
     ports:
         - "15672:15672"
         - "5672:5672"
     environment:
-        - RABBITMQ_USER=airflow
-        - RABBITMQ_PASSWORD=airflow
-        - RABBITMQ_VHOST=airflow
+        - RABBITMQ_DEFAULT_USER=airflow
+        - RABBITMQ_DEFAULT_PASS=airflow
+        - RABBITMQ_DEFAULT_VHOST=airflow
 
 webserver:
     container_name: webserver
@@ -32,9 +32,10 @@ webserver:
     ports:
         - "8080:8080"
     links:
-        - mysqldb:mysqldb
+        - mysql:mysql
         - rabbitmq:rabbitmq
         - worker:worker
+        - scheduler:scheduler
     command: webserver
 
 flower:
@@ -58,7 +59,6 @@ worker:
     ports:
         - "8793:8793"
     links:
-        - mysqldb:mysqldb
         - rabbitmq:rabbitmq
     command: worker
 
@@ -68,7 +68,4 @@ scheduler:
     restart: always
     environment:
         - AIRFLOW_HOME=/usr/local/airflow
-    links:
-        - mysqldb:mysqldb
-        - rabbitmq:rabbitmq
     command: scheduler
diff --git a/script/entrypoint.sh b/script/entrypoint.sh
index 559e3e33..815dabdf 100644
--- a/script/entrypoint.sh
+++ b/script/entrypoint.sh
@@ -2,7 +2,7 @@
 
 CMD="airflow"
 DB_LOOPS="10"
-MYSQL_HOST="mysqldb"
+MYSQL_HOST="mysql"
 MYSQL_PORT="3306"
 RABBITMQ_HOST="rabbitmq"
 RABBITMQ_CREDS="airflow:airflow"
@@ -20,10 +20,11 @@ if [ "$@" = "webserver" ] || [ "$@" = "worker" ] || [ "$@" = "scheduler" ] || [
       echo "$(date) - $RABBITMQ_HOST still not reachable, giving up"
       exit 1
     fi
-    echo "$(date) - waiting for RabbitMQ..."
+    echo "$(date) - waiting for RabbitMQ... $j/$DB_LOOPS"
     sleep 2
   done
 fi
+
 if [ "$@" = "flower" ]; then
   sleep 10
 fi
@@ -37,11 +38,14 @@ if [ "$@" = "webserver" ] || [ "$@" = "worker" ] || [ "$@" = "scheduler" ] ; the
       echo "$(date) - ${MYSQL_HOST}:${MYSQL_PORT} still not reachable, giving up"
       exit 1
     fi
-    echo "$(date) - waiting for ${MYSQL_HOST}:${MYSQL_PORT}..."
+    echo "$(date) - waiting for ${MYSQL_HOST}:${MYSQL_PORT}... $i/$DB_LOOPS"
     sleep 1
   done
-  sleep 2
-  $CMD initdb
+  if [ "$@" = "webserver" ]; then
+    echo "Initialize database..."
+    $CMD initdb
+  fi
+  sleep 5
 fi
 
 exec $CMD "$@"
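Note on the fernet_key = $FERNET_KEY change in config/airflow.cfg above: the key value is expected to come from outside the image, and this diff does not show how it is injected, so the substitution mechanism is an assumption. A minimal sketch of generating a suitable key with the cryptography package that the Dockerfile installs, to be exported as FERNET_KEY before the config is rendered:

  # generate a Fernet key with the cryptography package (installed in the Dockerfile)
  python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key())"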