Skip to content

Commit

Permalink
Make the list of excluded states complete (#645)
Browse files Browse the repository at this point in the history
JIRA issue: DCOS_OSS-5370
  • Loading branch information
Ivan Chernetsky committed Jul 25, 2019
1 parent 87c281b commit 755efde
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 12 deletions.
16 changes: 8 additions & 8 deletions ci/ci-integration.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ random_string() {

cluster_ids() {
# dcos cluster command was added into the CLI starting in DC/OS 1.10
if [ "${DCOS_VERSION}" != '1.9' ]; then
if [ "${DCOS_VERSION}" != '1.9' ]; then
if ! dcos cluster list >/dev/null 2>&1; then
return
fi
Expand Down Expand Up @@ -133,7 +133,7 @@ template_parameters:
PublicSlaveInstanceCount: 1
SlaveInstanceCount: 1
EOF

# DefaultInstanceType parameter has not been backported into 1.9 CF templates.
# The templates actually hardcode in this parameter instead.
if [ "${DCOS_VERSION}" != '1.9' ]; then
Expand All @@ -149,7 +149,7 @@ EOF
time wrapped_dcos_launch create
time wrapped_dcos_launch wait
wrapped_dcos_launch describe

if [ "${SECURITY_MODE}" == 'disabled' ] || [ "${VARIANT}" == 'open' ]; then
CLUSTER_URL=http://$(wrapped_dcos_launch describe | jq -r .masters[0].public_ip)
else
Expand Down Expand Up @@ -269,7 +269,7 @@ launch_marathonlb() {

MLB_OPTIONS_JSON="mlb-options.json"
RENDER_RESPONSE_JSON="response.json"
MLB_JSON="mlb.json"
MLB_JSON="mlb.json"

# Strict mode requires having a service account.
if [ "${SECURITY_MODE}" == 'strict' ]; then
Expand Down Expand Up @@ -314,11 +314,11 @@ EOF
else
dcos package describe marathon-lb --app --render > $RENDER_RESPONSE_JSON
fi

cat $RENDER_RESPONSE_JSON | jq --arg DOCKER_IMAGE "$DOCKER_IMAGE" '.container.docker.image=$DOCKER_IMAGE' | jq --arg MLB_VERSION "$MLB_VERSION" '.labels.DCOS_PACKAGE_VERSION=$MLB_VERSION' > $MLB_JSON
cat $MLB_JSON

echo "Launching Marathon-lb."
echo "Launching Marathon-lb."
dcos marathon app add $MLB_JSON

# Sleeping to wait for MLB to deploy.
Expand Down Expand Up @@ -348,7 +348,7 @@ check_marathonlb_health() {
status_line "Marathon-lb stdout:"
dcos task log marathon-lb stdout --lines=50

status_line "Marathon-lb stderr:"
status_line "Marathon-lb stderr:"
dcos task log marathon-lb stderr --lines=50

delete_cluster
Expand Down Expand Up @@ -388,4 +388,4 @@ launch_marathonlb
run_integration_tests

# Deleting cluster through dcos-launch.
delete_cluster
delete_cluster
54 changes: 50 additions & 4 deletions marathon_lb.py
Original file line number Diff line number Diff line change
Expand Up @@ -1565,15 +1565,61 @@ def get_health_check(app, portIndex):
healthCheckResultCache = LRUCache()


# Last time this variable was adjusted, the task states were extracted from
# https://github.com/apache/mesos/blob/e58f4b97b5d13ccc18ad9b1632d7e6409bdd0c55/include/mesos/mesos.proto
#
# Essentially, only there states, namely, TASK_RUNNING, TASK_LOST, and
# TASK_UNREACHABLE, are used in HAProxy config generation. And here is why:
#
# - TASK_RUNNING - No comments.
# - TASK_LOST - A task is lost as seen by Mesos due to, for example,
# a network partition between a Mesos master and the Mesos agent that is
# on the same node the task is running on. But the task itself may or
# may not be running. In fact, it might transition to TASK_RUNNING state
# when the Mesos agent reregisters with the Mesos master. Hence let's
# be optimistic here and route trafic to tasks in this state.
# - TASK_UNREACHABLE - The same line of reasoning as the one used for
# TASK_LOST state, is valid here too.
EXCLUDED_TASK_STATES = {
# A task in this state transitions into another state quickly,
# unless there is a bug in Mesos which causes it hang in this state
# for quite some time or even forever. If the latter is not the case,
# it moves either to a terminal state (for instance, TASK_ERROR),
# or TASK_STARTING. In any case, there is no need to pay attention
# to it yet.
'TASK_STAGING',
# A similar reasoning applies here too. A task state becomes either
# terminal or `TASK_RUNNING` quickly.
'TASK_STARTING',
# A task is being killed. No need to route traffic to it.
'TASK_KILLING',
# A terminal state.
'TASK_FINISHED',
# A terminal state.
'TASK_FAILED',
# A terminal state.
'TASK_KILLED',
# A terminal state.
'TASK_ERROR',
# A terminal state.
'TASK_DROPPED',
# A terminal state.
'TASK_GONE',
# Though it is not a terminal state, let's trust the operator, and
# assume that the task is truly gone while it is in this state.
'TASK_GONE_BY_OPERATOR',
# A task is in some funny state according to Mesos. Let's not route
# traffic to it.
'TASK_UNKNOWN',
}


def get_apps(marathon, apps=[]):
if len(apps) == 0:
apps = marathon.list()

logger.debug("got apps %s", [app["id"] for app in apps])

excluded_states = {'TASK_KILLING', 'TASK_KILLED',
'TASK_FINISHED', 'TASK_ERROR'}

marathon_apps = []
# This process requires 2 passes: the first is to gather apps belonging
# to a deployment group.
Expand Down Expand Up @@ -1749,7 +1795,7 @@ def get_apps(marathon, apps=[]):

# 'state' will not be present in test cases.
# Should always be present in an actual cluster
if 'state' in task and task['state'] in excluded_states:
if 'state' in task and task['state'] in EXCLUDED_TASK_STATES:
logger.warning("Ignoring non-running task " + task['id'] +
" with state " + task['state'])
continue
Expand Down

0 comments on commit 755efde

Please sign in to comment.