Skip to content

Commit

Permalink
Fix hang in local_smoke_test.py
Browse files Browse the repository at this point in the history
When terminate fails because of a bot failure, the call would hang forever. Add a
20-second timeout so the test completes in a reasonable time even in case of
complete failure.

Fix a race condition in the smoke test that would cause it to hang. Always wait
for the bot to be started before calling wipe_cache().

Change 'swarming.py terminate' without --wait to print the task ID.

[email protected]
BUG=

Review-Url: https://codereview.chromium.org/2929353002
  • Loading branch information
maruel authored and Commit Bot committed Jun 10, 2017
1 parent 29dc6f6 commit a7f417c
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 8 deletions.
35 changes: 27 additions & 8 deletions appengine/swarming/local_smoke_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@
SIGNAL_TERM = -1073741510 if sys.platform == 'win32' else -signal.SIGTERM


# Timeout to wait for the operations, so that the smoke test doesn't hang
# indefinitely.
TIMEOUT_SECS = 20


# For the isolated tests that output a file named result.txt containing 'hey'.
ISOLATE_HELLO_WORLD = {
'variables': {
Expand Down Expand Up @@ -178,7 +183,7 @@ def task_collect(self, task_id):
# swarming.py collect will return the exit code of the task.
args = [
'--task-summary-json', tmp, task_id, '--task-output-dir', tmpdir,
'--timeout', '20', '--perf',
'--timeout', str(TIMEOUT_SECS), '--perf',
]
self._run('collect', args)
with open(tmp, 'rb') as f:
Expand All @@ -202,7 +207,11 @@ def task_collect(self, task_id):
os.remove(tmp)

def terminate(self, bot_id):
return self._run('terminate', ['--wait', bot_id])
task_id = self._capture('terminate', [bot_id]).strip()
logging.info('swarming.py terminate returned %r', task_id)
if not task_id:
return 1
return self._run('collect', ['--timeout', str(TIMEOUT_SECS), task_id])

def cleanup(self):
if self._tmpdir:
Expand Down Expand Up @@ -308,15 +317,25 @@ def setUp(self):
#
# TODO(maruel): 'isolated_upload' is not deterministic because the isolate
# server is not cleared.
old = self.client.query_bot()
started_ts = json.loads(old['state'])['started_ts'] if old else None
start = time.time()
while True:
old = self.client.query_bot()
if old:
break
if time.time() - start > TIMEOUT_SECS:
self.fail('Bot took too long to start')

started_ts = json.loads(old['state'])['started_ts']
logging.info('setUp: started_ts was %s', started_ts)
had_cache = any(
u'caches' == i['key'] for i in old['dimensions']) if old else False
had_cache = any(u'caches' == i['key'] for i in old['dimensions'])
self.bot.wipe_cache(had_cache)
# The bot restarts due to wipe_cache() so wait for the bot to come back

# The bot restarts due to wipe_cache(True) so wait for the bot to come back
# online. It may take a few loops.
start = time.time()
while True:
if time.time() - start > TIMEOUT_SECS:
self.fail('Bot took too long to start after wipe_cache()')
state = self.client.query_bot()
if not state:
time.sleep(0.1)
Expand Down Expand Up @@ -910,7 +929,7 @@ def main():
# completed, the bot process should have terminated. Give it a few
# seconds due to delay between sending the event that the process is
# shutting down vs the process is shut down.
if client.terminate(bot.bot_id) is not 0:
if client.terminate(bot.bot_id) != 0:
print >> sys.stderr, 'swarming.py terminate failed'
failed = True
try:
Expand Down
2 changes: 2 additions & 0 deletions client/swarming.py
Original file line number Diff line number Diff line change
Expand Up @@ -1601,6 +1601,8 @@ def CMDterminate(parser, args):
return collect(
options.swarming, [request['task_id']], 0., False, False, None, None,
False)
else:
print request['task_id']
return 0


Expand Down

0 comments on commit a7f417c

Please sign in to comment.