Skip to content

Commit ce54f0d

Browse files
d4l3kfacebook-github-bot
authored andcommitted
Back out "Revert D20449887: [dt][caffe2] enable using smart exceptions in async nets" (pytorch#36172)
Summary: Pull Request resolved: pytorch#36172 Original commit changeset: 3d7801613f86 D20449887 broke some OSS tests as the OSS export sync wasn't working correctly. Test Plan: Manually export latest version to OSS to trigger the tests + test plan in D20449887 verified onnx tests are passing in pytorch#36172 Reviewed By: andrewwdye Differential Revision: D20902279 fbshipit-source-id: bc30fcc9f5cc8076f69a5d92675fd27455948372
1 parent d591a7b commit ce54f0d

File tree

5 files changed

+41
-24
lines changed

5 files changed

+41
-24
lines changed

caffe2/core/event.h

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,8 @@ class CAFFE2_API Event {
107107
event_resetter_[type_](this);
108108
#ifdef CAFFE2_USE_EXCEPTION_PTR
109109
caught_exception_ = nullptr;
110-
exception_timestamp_ = 0;
111110
#endif // CAFFE2_USE_EXCEPTION_PTR
111+
error_timestamp_ = 0;
112112
}
113113

114114
const DeviceOption& GetDeviceOption() const {
@@ -126,6 +126,11 @@ class CAFFE2_API Event {
126126
}
127127

128128
void SetFinished(const char* err_msg = nullptr) {
129+
typedef std::chrono::high_resolution_clock clock;
130+
error_timestamp_ = std::chrono::duration_cast<std::chrono::nanoseconds>(
131+
clock::now().time_since_epoch())
132+
.count();
133+
129134
CAFFE_ENFORCE(event_finished_setter_[type_]);
130135
return event_finished_setter_[type_](this, err_msg);
131136
}
@@ -175,9 +180,6 @@ class CAFFE2_API Event {
175180
#ifdef CAFFE2_USE_EXCEPTION_PTR
176181
if (!caught_exception_) {
177182
caught_exception_ = std::current_exception();
178-
typedef std::chrono::high_resolution_clock clock;
179-
exception_timestamp_ =
180-
clock::now().time_since_epoch() / std::chrono::milliseconds(1);
181183
}
182184
CAFFE_ENFORCE(caught_exception_, "No exception found");
183185
#else
@@ -199,13 +201,8 @@ class CAFFE2_API Event {
199201
#endif // CAFFE2_USE_EXCEPTION_PTR
200202
}
201203

202-
int64_t ExceptionTimestamp() const {
203-
#ifdef CAFFE2_USE_EXCEPTION_PTR
204-
return exception_timestamp_;
205-
#else
206-
VLOG(1) << "No support for exceptions in Event";
207-
return 0;
208-
#endif // CAFFE2_USE_EXCEPTION_PTR
204+
int64_t ErrorTimestamp() const {
205+
return error_timestamp_;
209206
}
210207

211208
void RethrowException() const {
@@ -229,20 +226,17 @@ class CAFFE2_API Event {
229226

230227
#ifdef CAFFE2_USE_EXCEPTION_PTR
231228
std::exception_ptr caught_exception_;
232-
int64_t exception_timestamp_{};
233229
#endif // CAFFE2_USE_EXCEPTION_PTR
230+
int64_t error_timestamp_{};
234231

235232
static EventCreateFunction event_creator_[MaxDeviceTypes];
236233
static EventRecordFunction event_recorder_[MaxDeviceTypes];
237-
static EventWaitFunction event_waiter_[MaxDeviceTypes]
238-
[MaxDeviceTypes];
234+
static EventWaitFunction event_waiter_[MaxDeviceTypes][MaxDeviceTypes];
239235
static EventFinishFunction event_finisher_[MaxDeviceTypes];
240236

241237
static EventQueryFunction event_querier_[MaxDeviceTypes];
242-
static EventErrorMessageFunction
243-
event_err_msg_getter_[MaxDeviceTypes];
244-
static EventSetFinishedFunction
245-
event_finished_setter_[MaxDeviceTypes];
238+
static EventErrorMessageFunction event_err_msg_getter_[MaxDeviceTypes];
239+
static EventSetFinishedFunction event_finished_setter_[MaxDeviceTypes];
246240
static EventResetFunction event_resetter_[MaxDeviceTypes];
247241

248242
static EventSetCallbackFunction event_callback_setter_[MaxDeviceTypes];

caffe2/core/net_async_base.cc

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,14 @@ bool AsyncNetBase::handleRunError() {
119119
for (int task_id = 0; task_id < tasksNum(); ++task_id) {
120120
if (event(task_id).HasException()) {
121121
if (first_exc_task_id >= 0) {
122-
auto exc_ts = event(task_id).ExceptionTimestamp();
122+
auto exc_ts = event(task_id).ErrorTimestamp();
123123
if (exc_ts < first_exc_ts) {
124124
first_exc_task_id = task_id;
125125
first_exc_ts = exc_ts;
126126
}
127127
} else {
128128
first_exc_task_id = task_id;
129-
first_exc_ts = event(task_id).ExceptionTimestamp();
129+
first_exc_ts = event(task_id).ErrorTimestamp();
130130
}
131131
}
132132
}
@@ -497,7 +497,13 @@ void AsyncNetBase::finalizeEvents() {
497497
if (op != pending_op) {
498498
try {
499499
op->CancelAsyncCallback();
500-
op->event().SetFinished("Cancelled");
500+
501+
// throw and catch exception to preserve stack trace
502+
try {
503+
throw AsyncNetCancelled();
504+
} catch (const AsyncNetCancelled& e) {
505+
op->event().SetFinishedWithException(e.what());
506+
}
501507
} catch (const EnforceNotMet&) {
502508
// ignore
503509
}

caffe2/core/net_async_base.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ struct ExecutionOptions {
5757
bool run_root_tasks_inline_ = false;
5858
};
5959

60+
struct CAFFE2_API AsyncNetCancelled : public std::exception {
61+
const char* what() const noexcept override {
62+
return "Cancelled";
63+
}
64+
};
65+
6066
class CAFFE2_API AsyncNetBase : public NetBase {
6167
public:
6268
AsyncNetBase(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);

caffe2/core/net_async_scheduling.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,13 @@ void AsyncSchedulingNet::CancelAndFinishAsyncTasks() {
280280
// event, and in some other cases (CUDA)
281281
try {
282282
lastTaskOp(tid)->CancelAsyncCallback();
283-
event(tid).SetFinished("Cancelled");
283+
284+
// throw and catch exception to preserve stack trace
285+
try {
286+
throw AsyncNetCancelled();
287+
} catch (const AsyncNetCancelled& e) {
288+
event(tid).SetFinishedWithException(e.what());
289+
}
284290
} catch (const EnforceNotMet&) {
285291
// ignore
286292
}

caffe2/core/net_test.cc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -787,8 +787,13 @@ TEST(NetTest, PendingOpsAndNetFailure) {
787787
Workspace ws;
788788
std::unique_ptr<NetBase> net(CreateNet(net_def, &ws));
789789

790-
// net is not stuck and returns false
791-
ASSERT_FALSE(net->Run());
790+
try {
791+
// net is not stuck and returns false
792+
ASSERT_FALSE(net->Run());
793+
} catch (const caffe2::AsyncNetCancelled&) {
794+
// Cancellation exception is fine since if the ops run concurrently the
795+
// NotFinishingOp may be cancelled with an exception.
796+
}
792797
}
793798

794799
class AsyncErrorOp final : public Operator<CPUContext> {

0 commit comments

Comments
 (0)