From 755efded49b91ef81faa8f7ff0660b1f35b6ba9d Mon Sep 17 00:00:00 2001 From: Ivan Chernetsky Date: Thu, 25 Jul 2019 12:18:06 -0700 Subject: [PATCH] Make the list of excluded states complete (#645) JIRA issue: DCOS_OSS-5370 --- ci/ci-integration.sh | 16 ++++++------- marathon_lb.py | 54 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 58 insertions(+), 12 deletions(-) mode change 100644 => 100755 ci/ci-integration.sh diff --git a/ci/ci-integration.sh b/ci/ci-integration.sh old mode 100644 new mode 100755 index 3acc7889..fc59f74b --- a/ci/ci-integration.sh +++ b/ci/ci-integration.sh @@ -41,7 +41,7 @@ random_string() { cluster_ids() { # dcos cluster command was added into the CLI starting in DC/OS 1.10 - if [ "${DCOS_VERSION}" != '1.9' ]; then + if [ "${DCOS_VERSION}" != '1.9' ]; then if ! dcos cluster list >/dev/null 2>&1; then return fi @@ -133,7 +133,7 @@ template_parameters: PublicSlaveInstanceCount: 1 SlaveInstanceCount: 1 EOF - + # DefaultInstanceType parameter has not been backported into 1.9 CF templates. # The templates actually hardcode in this parameter instead. if [ "${DCOS_VERSION}" != '1.9' ]; then @@ -149,7 +149,7 @@ EOF time wrapped_dcos_launch create time wrapped_dcos_launch wait wrapped_dcos_launch describe - + if [ "${SECURITY_MODE}" == 'disabled' ] || [ "${VARIANT}" == 'open' ]; then CLUSTER_URL=http://$(wrapped_dcos_launch describe | jq -r .masters[0].public_ip) else @@ -269,7 +269,7 @@ launch_marathonlb() { MLB_OPTIONS_JSON="mlb-options.json" RENDER_RESPONSE_JSON="response.json" - MLB_JSON="mlb.json" + MLB_JSON="mlb.json" # Strict mode requires having a service account. if [ "${SECURITY_MODE}" == 'strict' ]; then @@ -314,11 +314,11 @@ EOF else dcos package describe marathon-lb --app --render > $RENDER_RESPONSE_JSON fi - + cat $RENDER_RESPONSE_JSON | jq --arg DOCKER_IMAGE "$DOCKER_IMAGE" '.container.docker.image=$DOCKER_IMAGE' | jq --arg MLB_VERSION "$MLB_VERSION" '.labels.DCOS_PACKAGE_VERSION=$MLB_VERSION' > $MLB_JSON cat $MLB_JSON - echo "Launching Marathon-lb." + echo "Launching Marathon-lb." dcos marathon app add $MLB_JSON # Sleeping to wait for MLB to deploy. @@ -348,7 +348,7 @@ check_marathonlb_health() { status_line "Marathon-lb stdout:" dcos task log marathon-lb stdout --lines=50 - status_line "Marathon-lb stderr:" + status_line "Marathon-lb stderr:" dcos task log marathon-lb stderr --lines=50 delete_cluster @@ -388,4 +388,4 @@ launch_marathonlb run_integration_tests # Deleting cluster through dcos-launch. -delete_cluster \ No newline at end of file +delete_cluster diff --git a/marathon_lb.py b/marathon_lb.py index 7a4c4836..833e9752 100755 --- a/marathon_lb.py +++ b/marathon_lb.py @@ -1565,15 +1565,61 @@ def get_health_check(app, portIndex): healthCheckResultCache = LRUCache() +# Last time this variable was adjusted, the task states were extracted from +# https://github.com/apache/mesos/blob/e58f4b97b5d13ccc18ad9b1632d7e6409bdd0c55/include/mesos/mesos.proto +# +# Essentially, only there states, namely, TASK_RUNNING, TASK_LOST, and +# TASK_UNREACHABLE, are used in HAProxy config generation. And here is why: +# +# - TASK_RUNNING - No comments. +# - TASK_LOST - A task is lost as seen by Mesos due to, for example, +# a network partition between a Mesos master and the Mesos agent that is +# on the same node the task is running on. But the task itself may or +# may not be running. In fact, it might transition to TASK_RUNNING state +# when the Mesos agent reregisters with the Mesos master. Hence let's +# be optimistic here and route trafic to tasks in this state. +# - TASK_UNREACHABLE - The same line of reasoning as the one used for +# TASK_LOST state, is valid here too. +EXCLUDED_TASK_STATES = { + # A task in this state transitions into another state quickly, + # unless there is a bug in Mesos which causes it hang in this state + # for quite some time or even forever. If the latter is not the case, + # it moves either to a terminal state (for instance, TASK_ERROR), + # or TASK_STARTING. In any case, there is no need to pay attention + # to it yet. + 'TASK_STAGING', + # A similar reasoning applies here too. A task state becomes either + # terminal or `TASK_RUNNING` quickly. + 'TASK_STARTING', + # A task is being killed. No need to route traffic to it. + 'TASK_KILLING', + # A terminal state. + 'TASK_FINISHED', + # A terminal state. + 'TASK_FAILED', + # A terminal state. + 'TASK_KILLED', + # A terminal state. + 'TASK_ERROR', + # A terminal state. + 'TASK_DROPPED', + # A terminal state. + 'TASK_GONE', + # Though it is not a terminal state, let's trust the operator, and + # assume that the task is truly gone while it is in this state. + 'TASK_GONE_BY_OPERATOR', + # A task is in some funny state according to Mesos. Let's not route + # traffic to it. + 'TASK_UNKNOWN', +} + + def get_apps(marathon, apps=[]): if len(apps) == 0: apps = marathon.list() logger.debug("got apps %s", [app["id"] for app in apps]) - excluded_states = {'TASK_KILLING', 'TASK_KILLED', - 'TASK_FINISHED', 'TASK_ERROR'} - marathon_apps = [] # This process requires 2 passes: the first is to gather apps belonging # to a deployment group. @@ -1749,7 +1795,7 @@ def get_apps(marathon, apps=[]): # 'state' will not be present in test cases. # Should always be present in an actual cluster - if 'state' in task and task['state'] in excluded_states: + if 'state' in task and task['state'] in EXCLUDED_TASK_STATES: logger.warning("Ignoring non-running task " + task['id'] + " with state " + task['state']) continue