Skip to content

Commit

Permalink
(Internally) distinguish between timeout termination and Checker Scri…
Browse files Browse the repository at this point in the history
…pt errors

Closes: #80

Co-authored-by: Simon Ruderich <[email protected]>
  • Loading branch information
F30 and rudis committed May 25, 2024
1 parent a8c2695 commit d64864b
Show file tree
Hide file tree
Showing 12 changed files with 77 additions and 28 deletions.
15 changes: 8 additions & 7 deletions src/ctf_gameserver/checker/database.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging

from ctf_gameserver.lib.checkresult import STATUS_TIMEOUT
from ctf_gameserver.lib.database import transaction_cursor
from ctf_gameserver.lib.exceptions import DBDataError

Expand Down Expand Up @@ -181,13 +182,13 @@ def commit_result(db_conn, service_id, team_net_no, tick, result, prohibit_chang
cursor.execute('INSERT INTO scoring_statuscheck'
' (service_id, team_id, tick, status, timestamp)'
' VALUES (%s, %s, %s, %s, NOW())', (service_id, team_id, tick, result))
# (In case of `prohibit_changes`,) PostgreSQL checks the database grants even if nothing is matched
# by `WHERE`
cursor.execute('UPDATE scoring_flag'
' SET placement_end = NOW()'
' WHERE service_id = %s AND protecting_team_id = %s AND tick = %s', (service_id,
team_id,
tick))
if result != STATUS_TIMEOUT:
# (In case of `prohibit_changes`,) PostgreSQL checks the database grants even if nothing is
# matched by `WHERE`
cursor.execute('UPDATE scoring_flag'
' SET placement_end = NOW()'
' WHERE service_id = %s AND protecting_team_id = %s AND tick = %s',
(service_id, team_id, tick))


def set_flagid(db_conn, service_id, team_net_no, tick, flagid, prohibit_changes=False, fake_team_id=None):
Expand Down
16 changes: 13 additions & 3 deletions src/ctf_gameserver/checker/master.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

from ctf_gameserver.lib.args import get_arg_parser_with_db, parse_host_port
from ctf_gameserver.lib import daemon
from ctf_gameserver.lib.checkresult import CheckResult, STATUS_TIMEOUT
from ctf_gameserver.lib.database import transaction_cursor
from ctf_gameserver.lib.checkresult import CheckResult
from ctf_gameserver.lib.exceptions import DBDataError
import ctf_gameserver.lib.flag as flag_lib

Expand Down Expand Up @@ -264,11 +264,13 @@ def step(self):
# We can't signal an error to the Checker Script (which might be waiting for a response),
# so our only option is to kill it
self.supervisor.terminate_runner(req['runner_id'])
metrics.inc(self.metrics_queue, 'killed_tasks')
send_resp = False
except: # noqa, pylint: disable=bare-except
logging.exception('Checker Script communication error for team %d (net number %d) in tick '
'%d:', req['info']['_team_id'], req['info']['team'], req['info']['tick'])
self.supervisor.terminate_runner(req['runner_id'])
metrics.inc(self.metrics_queue, 'killed_tasks')
else:
if send_resp:
req['send'].send(resp)
Expand Down Expand Up @@ -332,8 +334,16 @@ def handle_result_request(self, task_info, param):
result)

def launch_tasks(self):
def timeout_runners():
for task_info in self.supervisor.terminate_runners():
logging.info('Forcefully terminated Checker Script for team %d (net number %d) in tick %d',
task_info['_team_id'], task_info['team'], task_info['tick'])
metrics.inc(self.metrics_queue, 'timeout_tasks')
database.commit_result(self.db_conn, self.service['id'], task_info['team'],
task_info['tick'], STATUS_TIMEOUT)

def change_tick(new_tick):
self.supervisor.terminate_runners()
timeout_runners()
self.update_launch_params(new_tick)
self.known_tick = new_tick

Expand All @@ -345,7 +355,7 @@ def change_tick(new_tick):
change_tick(current_tick)
elif cancel_checks:
# Competition over
self.supervisor.terminate_runners()
timeout_runners()
return

tasks = database.get_new_tasks(self.db_conn, self.service['id'], self.tasks_per_launch)
Expand Down
4 changes: 3 additions & 1 deletion src/ctf_gameserver/checker/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,16 @@ def checker_metrics_factory(registry, service):

counters = [
('started_tasks', 'Number of started Checker Script instances'),
('terminated_tasks', 'Number of Checker Script instances forcibly terminated')
('timeout_tasks', 'Number of Checker Script instances forcibly terminated at end of tick'),
('killed_tasks', 'Number of Checker Script instances forcibly terminated because of misbehavior')
]
for name, doc in counters:
metrics[name] = prometheus_client.Counter(metric_prefix+name, doc, ['service'], registry=registry)
# Pre-declare possible label value
metrics[name].labels(service)

metrics['completed_tasks'] = prometheus_client.Counter(
# Timeouts do not count as successfully completed checks
metric_prefix+'completed_tasks', 'Number of successfully completed checks', ['result', 'service'],
registry=registry
)
Expand Down
8 changes: 6 additions & 2 deletions src/ctf_gameserver/checker/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,21 @@ def terminate_runner(self, runner_id):
logging.info('Terminating Runner process, info: %s', self.processes[runner_id][2])
self.processes[runner_id][0].terminate()
# Afterwards, get_request() will join the child and remove it from `self.processes`
metrics.inc(self.metrics_queue, 'terminated_tasks')

def terminate_runners(self):
terminated_infos = []

if len(self.processes) > 0:
logging.warning('Terminating all %d Runner processes', len(self.processes))
for runner_id in self.processes:
for runner_id, process in self.processes.items():
self.terminate_runner(runner_id)
terminated_infos.append(process[2])

# Prevent memory leaks
self._reset()

return terminated_infos

def get_request(self):
# Use a loop to not leak our implementation detail for ACTION_RUNNER_EXIT: Only return None when the
# queue is really empty (barring non-critical race conditions)
Expand Down
4 changes: 4 additions & 0 deletions src/ctf_gameserver/lib/checkresult.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import enum


STATUS_TIMEOUT = 5


class CheckResult(enum.Enum):
"""
Maps possible Checker results to their integer values.
Expand All @@ -12,6 +15,7 @@ class CheckResult(enum.Enum):
FAULTY = 2 # Service is available, but not behaving as expected
FLAG_NOT_FOUND = 3
RECOVERING = 4
# TIMEOUT (5) is only used internally and not exposed here, especially not to Checker Scripts

def __str__(self):
return self.name
4 changes: 3 additions & 1 deletion src/ctf_gameserver/web/scoring/calculations.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import defaultdict, OrderedDict

from django.core.cache import cache
from django.utils.translation import gettext_lazy as _

from ctf_gameserver.web.registration.models import Team
from . import models
Expand Down Expand Up @@ -89,7 +90,8 @@ def team_statuses(from_tick, to_tick, select_related_team_fields=None, only_team
statuses[team] = defaultdict(lambda: {})
teams[team.pk] = team

status_checks = models.StatusCheck.objects.filter(tick__gte=from_tick, tick__lte=to_tick)
status_checks = models.StatusCheck.objects.filter(tick__gte=from_tick, tick__lte=to_tick) \
.exclude(status=models.StatusCheck.STATUSES[_('timeout')])
for check in status_checks:
statuses[teams[check.team_id]][check.tick][check.service_id] = check.status

Expand Down
9 changes: 6 additions & 3 deletions src/ctf_gameserver/web/scoring/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,18 +78,21 @@ class StatusCheck(models.Model):
# Mapping from human-readable status texts to their integer values in the database
# For a discussion on the plural form of "status", refer to https://english.stackexchange.com/q/877
STATUSES = {
_('up'): 0, # Maps to "OK" from the checkers' perspective
# "up" maps to "OK" from the checkers' perspective
_('up'): 0,
_('down'): 1,
_('faulty'): 2,
_('flag not found'): 3,
_('recovering'): 4
_('recovering'): 4,
# Script has been forcefully terminated at end of tick, publicly displayed as "Not checked"
_('timeout'): 5
}

service = models.ForeignKey(Service, on_delete=models.CASCADE)
team = models.ForeignKey(Team, on_delete=models.CASCADE)
tick = models.PositiveIntegerField(db_index=True)
# REVISIT: Add check constraint for the values as soon as we have Django >= 2.2
status = models.PositiveSmallIntegerField(choices=[(i, t) for t, i in STATUSES.items()])
status = models.SmallIntegerField(choices=[(i, t) for t, i in STATUSES.items()])
timestamp = models.DateTimeField(auto_now_add=True)

class Meta:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
_('down'): 'danger',
_('faulty'): 'danger',
_('flag not found'): 'warning',
_('recovering'): 'info'
_('recovering'): 'info',
_('timeout'): 'active'
}


Expand Down
23 changes: 18 additions & 5 deletions src/ctf_gameserver/web/scoring/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from django.db.models import Max
from django.http import JsonResponse
from django.shortcuts import render
from django.utils.translation import gettext_lazy as _
from django.views.decorators.cache import cache_page

import ctf_gameserver.web.registration.models as registration_models
Expand Down Expand Up @@ -331,8 +332,8 @@ def missing_checks(request):
@staff_member_required
def missing_checks_json(request):
"""
View which returns the teams with status "not checked" per tick for a specific service. The result is in
JSON format as required by the JavaScript code in def missing_checks().
View which returns the teams with status "not checked" or "timeout" per tick for a specific service. The
result is in JSON format as required by the JavaScript code in missing_checks().
This can help to find unhandled exceptions in checker scripts, as "not checked" normally shouldn't occur.
"""

Expand All @@ -352,14 +353,24 @@ def missing_checks_json(request):
except ValueError:
return JsonResponse({'error': 'Ticks must be integers'})

status_timeout = models.StatusCheck.STATUSES[_('timeout')]

all_flags = models.Flag.objects.filter(service=service) \
.filter(tick__gte=from_tick, tick__lt=to_tick) \
.values_list('tick', 'protecting_team')
all_status_checks = models.StatusCheck.objects.filter(service=service) \
.filter(tick__gte=from_tick, tick__lt=to_tick) \
.exclude(status=status_timeout) \
.values_list('tick', 'team')
checks_missing = all_flags.difference(all_status_checks).order_by('-tick', 'protecting_team')

checks_timeout = defaultdict(set)
for tick, team in models.StatusCheck.objects.filter(service=service) \
.filter(tick__gte=from_tick, tick__lt=to_tick) \
.filter(status=status_timeout) \
.values_list('tick', 'team'):
checks_timeout[tick].add(team)

result = []
current_tick = {'tick': -1}

Expand All @@ -371,12 +382,14 @@ def append_result():
result.append(current_tick)

for check in checks_missing:
check_tick, check_team = check

# Status checks are ordered by tick, finalize result for a tick when it changes
if current_tick['tick'] != check[0]:
if current_tick['tick'] != check_tick:
append_result()
current_tick = {'tick': check[0], 'teams': []}
current_tick = {'tick': check_tick, 'teams': []}

current_tick['teams'].append(check[1])
current_tick['teams'].append((check_team, check_team in checks_timeout[check_tick]))

# Add result from last iteration
append_result()
Expand Down
6 changes: 5 additions & 1 deletion src/ctf_gameserver/web/static/missing_checks.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ function buildList(data) {
tickEntry.appendChild(prefix)

for (let i = 0; i < check['teams'].length; i++) {
const teamID = check['teams'][i]
const teamID = check['teams'][i][0]
const isTimeout = check['teams'][i][1]
const teamName = data['all-teams'][teamID]['name']
const teamNetNo = data['all-teams'][teamID]['net-number']

Expand All @@ -45,6 +46,9 @@ function buildList(data) {
'q=service:' + data['service-slug'] + ' AND team:' + teamNetNo +
' AND tick:' + check['tick'])
teamEntry.target = '_blank'
if (isTimeout) {
teamEntry.classList.add('text-muted')
}
}
teamEntry.textContent = teamName + ' (' + teamNetNo + ')'
tickEntry.appendChild(teamEntry)
Expand Down
3 changes: 2 additions & 1 deletion src/ctf_gameserver/web/static/service_util.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ const statusClasses = {
1: 'danger',
2: 'danger',
3: 'warning',
4: 'info'
4: 'info',
5: 'active'
}


Expand Down
10 changes: 7 additions & 3 deletions tests/checker/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,10 @@ def test_unfinished(self, monotonic_mock, warning_mock):
' WHERE placement_start IS NOT NULL AND placement_end IS NULL')
self.assertEqual(cursor.fetchone()[0], 1)
cursor.execute('SELECT COUNT(*) FROM scoring_statuscheck')
self.assertEqual(cursor.fetchone()[0], 0)
self.assertEqual(cursor.fetchone()[0], 1)
cursor.execute('SELECT status FROM scoring_statuscheck'
' WHERE service_id=1 AND team_id=2 AND tick=0')
self.assertEqual(cursor.fetchone()[0], 5)

warning_mock.assert_called_with('Terminating all %d Runner processes', 1)

Expand Down Expand Up @@ -596,8 +599,9 @@ def signal_script():
cursor.execute('SELECT COUNT(*) FROM scoring_flag'
' WHERE placement_start IS NOT NULL AND placement_end IS NULL')
self.assertEqual(cursor.fetchone()[0], 1)
cursor.execute('SELECT COUNT(*) FROM scoring_statuscheck')
self.assertEqual(cursor.fetchone()[0], 0)
cursor.execute('SELECT status FROM scoring_statuscheck'
' WHERE service_id=1 AND team_id=2 AND tick=0')
self.assertEqual(cursor.fetchone()[0], 5)

warning_mock.assert_called_with('Terminating all %d Runner processes', 1)

Expand Down

0 comments on commit d64864b

Please sign in to comment.