Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Give the option to terminate the engine without firing Events.COMPLET… #3309

Merged
merged 20 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions ignite/engine/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import time
import warnings
import weakref
from collections import OrderedDict, defaultdict
from collections import defaultdict, OrderedDict
from collections.abc import Mapping
from typing import Any, Callable, Dict, Generator, Iterable, Iterator, List, Optional, Tuple, Union

Expand Down Expand Up @@ -549,8 +549,8 @@ def terminate(self, skip_completed: bool = False) -> None:
- :attr:`~ignite.engine.events.Events.COMPLETED`

Args:
skip_completed: if True, the event `~ignite.engine.events.Events.COMPLETED` is not fired after
`~ignite.engine.events.Events.TERMINATE`. Default is False.
skip_completed: if True, the event :attr:`~ignite.engine.events.Events.COMPLETED` is not fired after
:attr:`~ignite.engine.events.Events.TERMINATE`. Default is False.

Examples:
.. testcode::
Expand Down Expand Up @@ -905,8 +905,6 @@ def switch_batch(engine):
# If engine was terminated and now is resuming from terminated state
# we need to initialize iter_counter as 0
self._init_iter = 0
# And we reset the skip_completed_after_termination to its default value
self.skip_completed_after_termination = False

if self._dataloader_iter is None:
self.state.dataloader = data
Expand Down Expand Up @@ -1002,17 +1000,19 @@ def _internal_run_as_gen(self) -> Generator[Any, None, State]:
time_taken = time.time() - start_time
# time is available for handlers but must be updated after fire
self.state.times[Events.COMPLETED.name] = time_taken
handlers_start_time = time.time()

# do not fire Events.COMPLETED if we terminated the run with flag `skip_completed_after_termination=True`
if self.should_terminate and not self.skip_completed_after_termination or not self.should_terminate:
if self.should_terminate and self.skip_completed_after_termination:
# do not fire Events.COMPLETED if we terminated the run with flag `skip_completed_after_termination=True`
hours, mins, secs = _to_hours_mins_secs(time_taken)
self.logger.info(f"Engine run terminated. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}")
else:
handlers_start_time = time.time()
self._fire_event(Events.COMPLETED)

time_taken += time.time() - handlers_start_time
# update time wrt handlers
self.state.times[Events.COMPLETED.name] = time_taken
hours, mins, secs = _to_hours_mins_secs(time_taken)
self.logger.info(f"Engine run complete. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}")
time_taken += time.time() - handlers_start_time
# update time wrt handlers
self.state.times[Events.COMPLETED.name] = time_taken
hours, mins, secs = _to_hours_mins_secs(time_taken)
self.logger.info(f"Engine run completed. Time taken: {hours:02d}:{mins:02d}:{secs:06.3f}")

except BaseException as e:
self._dataloader_iter = None
Expand Down Expand Up @@ -1192,7 +1192,7 @@ def _internal_run_legacy(self) -> State:
# do not fire Events.COMPLETED if we terminated the run with flag `skip_completed_after_termination=True`
if self.should_terminate and not self.skip_completed_after_termination or not self.should_terminate:
self._fire_event(Events.COMPLETED)

time_taken += time.time() - handlers_start_time
# update time wrt handlers
self.state.times[Events.COMPLETED.name] = time_taken
Expand Down
2 changes: 1 addition & 1 deletion ignite/engine/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ class Events(EventEnum):
- EPOCH_COMPLETED : triggered when the epoch is ended. Note that this is triggered even
when :meth:`~ignite.engine.engine.Engine.terminate_epoch()` is called.
- COMPLETED : triggered when engine's run is completed or terminated with :meth:`~ignite.engine.engine.Engine.terminate()`,
unless the flag `skip_event_completed` is set to True.
unless the flag `skip_completed` is set to True.

The table below illustrates which events are triggered when various termination methods are called.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bonassifabio actually, I find myself as well this table very misleading and hard to understand.
I think we can improve it in the following way:

  • EVENT_COMPLETED -> EPOCH_COMPLETED
  • Let's add new column on the right after "TERMINATE" and call it "COMPLETED"
  • Check symbol for terminate() line on TERMINATE_SINGLE_EPOCH column is wrong actually and should be replaced with x.

By the way, here is how it is generated now: https://deploy-preview-3309--pytorch-ignite-preview.netlify.app/generated/ignite.engine.events.events#ignite.engine.events.Events

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I was interpreting the first column as Events.COMPLETED, but from your comment it would seem that's not the case.

What if we change the table to something like this?

Method TERMINATE_SINGLE_EPOCH EPOCH_COMPLETED TERMINATE COMPLETED
No termination x x
terminate_epoch() x x x
terminate() x x
terminate(skip_completed=True) x x x

Few comments:

  1. To my understanding (please correct me if I'm wrong) if terminate() is called, the epoch is not necessarily completed. If this statement is true, it would perhaps make more sense to move EPOCH_COMPLETED before TERMINATE in the list of Events, as I did in the table, since EPOCH_COMPLETED would almost never be fired after EPOCH_COMPLETED.
  2. The columns would thus be ordered so that the first two are "epoch wise", while the last two happen at the end of the engine run. Not sure I
  3. I included a new row for terminate(skip_completed=True). Not sure if that's really necessary.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the new table, looks good!
Few corrections however about its content:

The line terminate_epoch(), column "EPOCH_COMPLETED" and "COMPLETED" should be both checked as v.

About your comments, I'm OK with all your suggestions, let's keep this order and the new line.

Checking code:

from ignite.engine import Engine, Events
from ignite.utils import setup_logger, logging


train_data = range(10)
max_epochs = 5


def train_step(engine, batch):
    pass

trainer = Engine(train_step)

# Enable trainer logger for a debug mode
trainer.logger = setup_logger("trainer", level=logging.DEBUG)

@trainer.on(Events.ITERATION_COMPLETED(once=12))
def call_terminate_epoch():
    trainer.terminate_epoch()

trainer.run(train_data, max_epochs=max_epochs)

Output:

...
2024-12-03 09:06:33,056 trainer INFO: Terminate current epoch is signaled. Current epoch iteration will stop after current iteration is finished.
2024-12-03 09:06:33,058 trainer DEBUG: 2 | 12, Firing handlers for event Events.TERMINATE_SINGLE_EPOCH
2024-12-03 09:06:33,060 trainer DEBUG: 2 | 12, Firing handlers for event Events.EPOCH_COMPLETED
2024-12-03 09:06:33,061 trainer INFO: Epoch[2] Complete. Time taken: 00:00:00.023
2024-12-03 09:06:33,064 trainer DEBUG: 3 | 12, Firing handlers for event Events.EPOCH_STARTED
2024-12-03 09:06:33,066 trainer DEBUG: 3 | 12, Firing handlers for event Events.GET_BATCH_STARTED
...

You can run it in https://pytorch-ignite.ai/playground

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for pointing this out!

  1. Do you think we should add an argument skip_completed to terminate_epoch() with the same behavior of suppressing the firing of Events.EPOCH_COMPLETED? That might be useful, for example, to avoid running the evaluator, the checkpointer, and the LR scheduler if an epoch has been terminated.
  2. I agree on checking EPOCH_COMPLETED for terminate_epoch(). However, I think that checking COMPLETED might seem to imply that Events.COMPLETED is also fired by terminate_epoch() right after Events.EPOCH_COMPLETED. What if we leave that cell empty?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think we should add an argument skip_completed to terminate_epoch() with the same behavior of suppressing the firing of Events.EPOCH_COMPLETED?

Good idea! Let's make this in a separate PR such that this one does not become too large

I agree on checking EPOCH_COMPLETED for terminate_epoch(). However, I think that checking COMPLETED might seem to imply that Events.COMPLETED is also fired by terminate_epoch() right after Events.EPOCH_COMPLETED. What if we leave that cell empty?

well, I was seeing this table as showing which events are triggered when we call terminate*() functions. I did not understand v mark on COMPLETED for terminate_epoch as the sequence of triggered events, but just whether an event is triggered or not. To avoid misunderstanding we can add a column between EPOCH_COMPLETED and TERMINATE named as for example "Other events" and check it where it is appropriate. What do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea! Let's make this in a separate PR such that this one does not become too large

Cool, I'll create a new PR when we're done with this one

I did not understand v mark on COMPLETED for terminate_epoch as the sequence of triggered events, but just whether an event is triggered or not.

I see the point, but doesn't the firing of TERMINATE and COMPLETED independent of terminate_epoch()?
See for example this code.

from ignite.engine import Engine, Events
from ignite.utils import setup_logger, logging

train_data = range(10)
max_epochs = 5


def train_step(engine, batch):
    pass

trainer = Engine(train_step)

# Enable trainer logger for a debug mode
trainer.logger = setup_logger("trainer", level=logging.DEBUG)

@trainer.on(Events.ITERATION_COMPLETED(once=12))
def call_terminate_epoch():
    trainer.terminate_epoch()

@trainer.on(Events.ITERATION_COMPLETED(once=15))
def call_terminate():
    trainer.terminate(skip_completed=True)

trainer.run(train_data, max_epochs=max_epochs)

This is probably an edge case. As a newbie of ignite, it would honestly make more sense to see no entry for TERMINATE and COMPLETED for terminate_epoch(). However, you are for sure more experienced than me, so we can go ahead with your solution if you prefer 🙂

Solution 1

Method TERMINATE_SINGLE_EPOCH EPOCH_COMPLETED TERMINATE COMPLETED
No termination x x
terminate_epoch() x
terminate() x x
terminate(skip_completed=True) x x x

Solution 2

Method TERMINATE_SINGLE_EPOCH EPOCH_COMPLETED TERMINATE COMPLETED
No termination x x
terminate_epoch() x x
terminate() x x
terminate(skip_completed=True) x x x

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see the point, but doesn't the firing of TERMINATE and COMPLETED independent of terminate_epoch()?

yes, they should be independent, i would say. If this is not the case, it should be a bug...

As a newbie of ignite, it would honestly make more sense to see no entry for TERMINATE and COMPLETED for terminate_epoch(). However, you are for sure more experienced than me, so we can go ahead with your solution if you prefer 🙂

Actually, your feedback is more important as other users may think the same (and those who know how it works do not read the docs :) ) !

Initially, the goal of the table is to provide visual representation of the info on which events are triggered in each case (when call terminate*()) and also compared with regular run.
Now, seeing its content I find this tabular representation more misleading than useful as triggered events depend on where the terminate function was called.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would propose to keep the table (either Table 1 or Table 2), and then to open a new issue to seek some input also from other developers/users and discuss the problem more in detail there.
We could for example think to a flow chart diagram or something like that...

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally I would remove it and maybe added some code output to show exactly how it works...
OK, up to you, either let's revert the changes or keep the most understandable version and update it later.


Expand Down
2 changes: 1 addition & 1 deletion tests/ignite/contrib/engines/test_common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import sys
from unittest.mock import MagicMock, call
from unittest.mock import call, MagicMock

import pytest
import torch
Expand Down
11 changes: 5 additions & 6 deletions tests/ignite/engine/test_engine.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import time
from unittest.mock import MagicMock, Mock, call
from unittest.mock import call, MagicMock, Mock

import numpy as np
import pytest
Expand Down Expand Up @@ -49,7 +49,7 @@ def test_terminate(self):
def test_terminate_and_not_complete(self):
engine = Engine(lambda e, b: 1)
assert not engine.should_terminate and not engine.skip_completed_after_termination
engine.terminate(skip_event_completed=True)
engine.terminate(skip_completed=True)
assert engine.should_terminate and engine.skip_completed_after_termination

def test_invalid_process_raises_with_invalid_signature(self):
Expand Down Expand Up @@ -260,14 +260,14 @@ def check_iter_and_data():
(Events.ITERATION_COMPLETED(once=14), None, 14, False),
],
)
def test_terminate_events_sequence(self, terminate_event, e, i, skip_event_completed):
def test_terminate_events_sequence(self, terminate_event, e, i, skip_completed):
engine = RecordedEngine(MagicMock(return_value=1))
data = range(10)
max_epochs = 5

@engine.on(terminate_event)
def call_terminate():
engine.terminate(skip_event_completed)
engine.terminate(skip_completed)

@engine.on(Events.EXCEPTION_RAISED)
def assert_no_exceptions(ee):
Expand All @@ -284,7 +284,7 @@ def assert_no_exceptions(ee):
if e is None:
e = i // len(data) + 1

if skip_event_completed:
if skip_completed:
assert engine.called_events[-1] == (e, i, Events.TERMINATE)
assert engine.called_events[-2] == (e, i, terminate_event)
else:
Expand Down Expand Up @@ -1425,4 +1425,3 @@ def check_iter_epoch():
state = engine.run(data, max_epochs=max_epochs)
assert state.iteration == max_epochs * len(data) and state.epoch == max_epochs
assert num_calls_check_iter_epoch == 1

Loading