Skip to content

Commit feccb11

Browse files
[Logging] Add structured log context to Triage (#4787)
### Description This adds structured logging to the triage job. To enable this, the triage code was instrumented with the cron log context and with the testcase-based context, which wraps each triage execution for a given testcase. To avoid indenting the whole function, a new private method was created for triaging a single testcase. Note: grouper will be instrumented in a next PR after #4781 is merged. ### Test * Running the cronjob locally with the debugger ([link to logs](https://cloudlogging.app.goo.gl/dh6JbXqhcSDRkWDK6)). ![image](https://github.com/user-attachments/assets/5afada1c-801e-491f-9111-36962c28936c)
1 parent 1845750 commit feccb11

File tree

1 file changed

+130
-126
lines changed

1 file changed

+130
-126
lines changed

src/clusterfuzz/_internal/cron/triage.py

+130-126
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,133 @@ def _emit_untriaged_testcase_age_metric(testcase: data_types.Testcase,
416416
})
417417

418418

419+
def _triage_testcase(testcase, excluded_jobs, all_jobs, throttler):
420+
"""Files a bug for a given testcase."""
421+
testcase_id = testcase.key.id()
422+
critical_tasks_completed = data_handler.critical_tasks_completed(testcase)
423+
424+
# Skip if testcase's job is removed.
425+
if testcase.job_type not in all_jobs:
426+
_set_testcase_stuck_state(testcase, False)
427+
logs.info(f'Skipping testcase {testcase_id}, since its job was removed '
428+
f' ({testcase.job_type})')
429+
return
430+
431+
# Skip if testcase's job is in exclusions list.
432+
if testcase.job_type in excluded_jobs:
433+
_set_testcase_stuck_state(testcase, False)
434+
logs.info(f'Skipping testcase {testcase_id}, since its job is in the'
435+
f' exclusion list ({testcase.job_type})')
436+
return
437+
438+
# Skip if we are running progression task at this time.
439+
if testcase.get_metadata('progression_pending'):
440+
_set_testcase_stuck_state(testcase, True)
441+
logs.info(f'Skipping testcase {testcase_id}, progression pending')
442+
_emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
443+
_increment_untriaged_testcase_count(testcase.job_type, PENDING_PROGRESSION)
444+
return
445+
446+
# If the testcase has a bug filed already, no triage is needed.
447+
if _is_bug_filed(testcase):
448+
_set_testcase_stuck_state(testcase, False)
449+
logs.info(
450+
f'Skipping testcase {testcase_id}, since a bug was already filed.')
451+
return
452+
453+
# Check if the crash is important, i.e. it is either a reproducible crash
454+
# or an unreproducible crash happening frequently.
455+
if not _is_crash_important(testcase):
456+
# Check if the crash is a startup crash, i.e. it is causing the fuzzer
457+
# to crash on startup and not allowing the fuzzer to run longer
458+
if testcase.platform == "android" and is_crash_important_android(testcase):
459+
logs.info(
460+
f'Considering testcase {testcase_id}, since it is a startup crash'
461+
' on android platform.')
462+
else:
463+
_set_testcase_stuck_state(testcase, False)
464+
logs.info(
465+
f'Skipping testcase {testcase_id}, as the crash is not important.')
466+
return
467+
468+
# Require that all tasks like minimizaton, regression testing, etc have
469+
# finished.
470+
if not critical_tasks_completed:
471+
status = PENDING_CRITICAL_TASKS
472+
if testcase.analyze_pending:
473+
status = PENDING_ANALYZE
474+
_emit_untriaged_testcase_age_metric(testcase, status)
475+
_set_testcase_stuck_state(testcase, True)
476+
_increment_untriaged_testcase_count(testcase.job_type, status)
477+
logs.info(f'Skipping testcase {testcase_id}, critical tasks still pending.')
478+
return
479+
480+
# For testcases that are not part of a group, wait an additional time to
481+
# make sure it is grouped.
482+
# The grouper runs prior to this step in the same cron, but there is a
483+
# window of time where new testcases can come in after the grouper starts.
484+
# This delay needs to be longer than the maximum time the grouper can take
485+
# to account for that.
486+
# FIXME: In future, grouping might be dependent on regression range, so we
487+
# would have to add an additional wait time.
488+
# TODO(ochang): Remove this after verifying that the `ran_grouper`
489+
# metadata works well.
490+
if not testcase.group_id and not dates.time_has_expired(
491+
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
492+
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
493+
_set_testcase_stuck_state(testcase, True)
494+
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
495+
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
496+
return
497+
498+
if not testcase.get_metadata('ran_grouper'):
499+
# Testcase should be considered by the grouper first before filing.
500+
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
501+
_set_testcase_stuck_state(testcase, True)
502+
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
503+
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
504+
return
505+
506+
# If this project does not have an associated issue tracker, we cannot
507+
# file this crash anywhere.
508+
try:
509+
issue_tracker = issue_tracker_utils.get_issue_tracker_for_testcase(testcase)
510+
except ValueError:
511+
issue_tracker = None
512+
if not issue_tracker:
513+
logs.info(f'No issue tracker detected for testcase {testcase_id}, '
514+
'publishing message.')
515+
issue_filer.notify_issue_update(testcase, 'new')
516+
return
517+
518+
# If there are similar issues to this test case already filed or recently
519+
# closed, skip filing a duplicate bug.
520+
if _check_and_update_similar_bug(testcase, issue_tracker):
521+
_set_testcase_stuck_state(testcase, False)
522+
logs.info(f'Skipping testcase {testcase_id}, since a similar bug'
523+
' was already filed.')
524+
return
525+
526+
# Clean up old triage messages that would be not applicable now.
527+
testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)
528+
529+
# File the bug first and then create filed bug metadata.
530+
if not _file_issue(testcase, issue_tracker, throttler):
531+
_emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
532+
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
533+
logs.info(f'Issue filing failed for testcase id {testcase_id}')
534+
return
535+
536+
_set_testcase_stuck_state(testcase, False)
537+
538+
_create_filed_bug_metadata(testcase)
539+
issue_filer.notify_issue_update(testcase, 'new')
540+
541+
logs.info('Filed new issue %s for testcase %d.' % (testcase.bug_information,
542+
testcase_id))
543+
544+
545+
@logs.cron_log_context()
419546
def main():
420547
"""Files bugs."""
421548
try:
@@ -439,7 +566,6 @@ def main():
439566
throttler = Throttler()
440567

441568
for testcase_id in data_handler.get_open_testcase_id_iterator():
442-
logs.info(f'Triaging {testcase_id}')
443569
try:
444570
testcase = data_handler.get_testcase_by_id(testcase_id)
445571
except errors.InvalidTestcaseError:
@@ -448,131 +574,9 @@ def main():
448574
f'Skipping testcase {testcase_id}, since it was already deleted.')
449575
continue
450576

451-
critical_tasks_completed = data_handler.critical_tasks_completed(testcase)
452-
453-
# Skip if testcase's job is removed.
454-
if testcase.job_type not in all_jobs:
455-
_set_testcase_stuck_state(testcase, False)
456-
logs.info(f'Skipping testcase {testcase_id}, since its job was removed '
457-
f' ({testcase.job_type})')
458-
continue
459-
460-
# Skip if testcase's job is in exclusions list.
461-
if testcase.job_type in excluded_jobs:
462-
_set_testcase_stuck_state(testcase, False)
463-
logs.info(f'Skipping testcase {testcase_id}, since its job is in the'
464-
f' exclusion list ({testcase.job_type})')
465-
continue
466-
467-
# Skip if we are running progression task at this time.
468-
if testcase.get_metadata('progression_pending'):
469-
_set_testcase_stuck_state(testcase, True)
470-
logs.info(f'Skipping testcase {testcase_id}, progression pending')
471-
_emit_untriaged_testcase_age_metric(testcase, PENDING_PROGRESSION)
472-
_increment_untriaged_testcase_count(testcase.job_type,
473-
PENDING_PROGRESSION)
474-
continue
475-
476-
# If the testcase has a bug filed already, no triage is needed.
477-
if _is_bug_filed(testcase):
478-
_set_testcase_stuck_state(testcase, False)
479-
logs.info(
480-
f'Skipping testcase {testcase_id}, since a bug was already filed.')
481-
continue
482-
483-
# Check if the crash is important, i.e. it is either a reproducible crash
484-
# or an unreproducible crash happening frequently.
485-
if not _is_crash_important(testcase):
486-
# Check if the crash is a startup crash, i.e. it is causing the fuzzer
487-
# to crash on startup and not allowing the fuzzer to run longer
488-
if testcase.platform == "android" and is_crash_important_android(
489-
testcase):
490-
logs.info(
491-
f'Considering testcase {testcase_id}, since it is a startup crash'
492-
' on android platform.')
493-
else:
494-
_set_testcase_stuck_state(testcase, False)
495-
logs.info(
496-
f'Skipping testcase {testcase_id}, as the crash is not important.')
497-
continue
498-
499-
# Require that all tasks like minimizaton, regression testing, etc have
500-
# finished.
501-
if not critical_tasks_completed:
502-
status = PENDING_CRITICAL_TASKS
503-
if testcase.analyze_pending:
504-
status = PENDING_ANALYZE
505-
_emit_untriaged_testcase_age_metric(testcase, status)
506-
_set_testcase_stuck_state(testcase, True)
507-
_increment_untriaged_testcase_count(testcase.job_type, status)
508-
logs.info(
509-
f'Skipping testcase {testcase_id}, critical tasks still pending.')
510-
continue
511-
512-
# For testcases that are not part of a group, wait an additional time to
513-
# make sure it is grouped.
514-
# The grouper runs prior to this step in the same cron, but there is a
515-
# window of time where new testcases can come in after the grouper starts.
516-
# This delay needs to be longer than the maximum time the grouper can take
517-
# to account for that.
518-
# FIXME: In future, grouping might be dependent on regression range, so we
519-
# would have to add an additional wait time.
520-
# TODO(ochang): Remove this after verifying that the `ran_grouper`
521-
# metadata works well.
522-
if not testcase.group_id and not dates.time_has_expired(
523-
testcase.timestamp, hours=data_types.MIN_ELAPSED_TIME_SINCE_REPORT):
524-
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
525-
_set_testcase_stuck_state(testcase, True)
526-
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
527-
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
528-
continue
529-
530-
if not testcase.get_metadata('ran_grouper'):
531-
# Testcase should be considered by the grouper first before filing.
532-
_emit_untriaged_testcase_age_metric(testcase, PENDING_GROUPING)
533-
_set_testcase_stuck_state(testcase, True)
534-
_increment_untriaged_testcase_count(testcase.job_type, PENDING_GROUPING)
535-
logs.info(f'Skipping testcase {testcase_id}, pending grouping.')
536-
continue
537-
538-
# If this project does not have an associated issue tracker, we cannot
539-
# file this crash anywhere.
540-
try:
541-
issue_tracker = issue_tracker_utils.get_issue_tracker_for_testcase(
542-
testcase)
543-
except ValueError:
544-
issue_tracker = None
545-
if not issue_tracker:
546-
logs.info(f'No issue tracker detected for testcase {testcase_id}, '
547-
'publishing message.')
548-
issue_filer.notify_issue_update(testcase, 'new')
549-
continue
550-
551-
# If there are similar issues to this test case already filed or recently
552-
# closed, skip filing a duplicate bug.
553-
if _check_and_update_similar_bug(testcase, issue_tracker):
554-
_set_testcase_stuck_state(testcase, False)
555-
logs.info(f'Skipping testcase {testcase_id}, since a similar bug'
556-
' was already filed.')
557-
continue
558-
559-
# Clean up old triage messages that would be not applicable now.
560-
testcase.delete_metadata(TRIAGE_MESSAGE_KEY, update_testcase=False)
561-
562-
# File the bug first and then create filed bug metadata.
563-
if not _file_issue(testcase, issue_tracker, throttler):
564-
_emit_untriaged_testcase_age_metric(testcase, PENDING_FILING)
565-
_increment_untriaged_testcase_count(testcase.job_type, PENDING_FILING)
566-
logs.info(f'Issue filing failed for testcase id {testcase_id}')
567-
continue
568-
569-
_set_testcase_stuck_state(testcase, False)
570-
571-
_create_filed_bug_metadata(testcase)
572-
issue_filer.notify_issue_update(testcase, 'new')
573-
574-
logs.info('Filed new issue %s for testcase %d.' % (testcase.bug_information,
575-
testcase_id))
577+
with logs.testcase_log_context(testcase, testcase.get_fuzz_target()):
578+
logs.info(f'Triaging {testcase_id}')
579+
_triage_testcase(testcase, excluded_jobs, all_jobs, throttler)
576580

577581
_emit_untriaged_testcase_count_metric()
578582
logs.info('Triage testcases succeeded.')

0 commit comments

Comments
 (0)