-
Notifications
You must be signed in to change notification settings - Fork 272
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Retry platform-level errors in the isolated process for .NET isolated #2922
Changes from 10 commits
a335871
f2942b2
f2ed1e9
d6bb8e4
d333530
d1a3bca
72428b9
3aaa919
079f112
065fdd7
090f8e7
a70d014
2ff4b79
68e7503
1cdd4b9
c927268
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,15 +71,15 @@ internal bool TryGetOrchestrationErrorDetails(out Exception? failure) | |
return hasError; | ||
} | ||
|
||
internal void SetResult(IEnumerable<OrchestratorAction> actions, string customStatus) | ||
internal void TrySetResult(IEnumerable<OrchestratorAction> actions, string customStatus) | ||
{ | ||
var result = new OrchestratorExecutionResult | ||
{ | ||
CustomStatus = customStatus, | ||
Actions = actions, | ||
}; | ||
|
||
this.SetResultInternal(result); | ||
this.TrySetResultInternal(result); | ||
} | ||
|
||
// TODO: This method should be considered deprecated because SDKs should no longer be returning results as JSON. | ||
|
@@ -117,10 +117,39 @@ internal void SetResult(string orchestratorResponseJsonText) | |
innerException: jsonReaderException); | ||
} | ||
|
||
this.SetResultInternal(result); | ||
this.TrySetResultInternal(result); | ||
} | ||
|
||
private void SetResultInternal(OrchestratorExecutionResult result) | ||
/// <summary> | ||
/// Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected. | ||
/// </summary> | ||
/// <remarks> | ||
/// Today, this method only checks for <see cref="OutOfMemoryException"/>. In the future, we may want to add more cases. | ||
/// Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield | ||
/// a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be | ||
/// handled in this method. | ||
/// However, our tests reveal that OOMs are, surprisngly, caught and returned as a `OrchestratorExecutionResult` | ||
/// by the isolated process, and thus need special handling. | ||
/// It's unclear if all OOMs are caught by the isolated process (probably not), and also if there are other platform-level | ||
/// errors that are also caught in the isolated process and returned as a `OrchestratorExecutionResult`. Let's add them | ||
/// to this method as we encounter them. | ||
/// </remarks> | ||
/// <param name="failureDetails">The failure details of the orchestrator.</param> | ||
/// <exception cref="OutOfMemoryException">If an OOM error is detected.</exception> | ||
private void ThrowIfPlatformLevelException(FailureDetails failureDetails) | ||
{ | ||
if (failureDetails.InnerFailure?.IsCausedBy<OutOfMemoryException>() ?? false) | ||
{ | ||
throw new OutOfMemoryException(failureDetails.ErrorMessage); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Manually throwing I strongly recommend we throw something else, like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the pointer. I'm now throwing |
||
} | ||
|
||
if (failureDetails.InnerFailure != null) | ||
{ | ||
this.ThrowIfPlatformLevelException(failureDetails.InnerFailure); | ||
} | ||
} | ||
|
||
private void TrySetResultInternal(OrchestratorExecutionResult result) | ||
{ | ||
// Look for an orchestration completion action to see if we need to grab the output. | ||
foreach (OrchestratorAction action in result.Actions) | ||
|
@@ -133,6 +162,14 @@ private void SetResultInternal(OrchestratorExecutionResult result) | |
|
||
if (completeAction.OrchestrationStatus == OrchestrationStatus.Failed) | ||
{ | ||
// If the orchestrator failed due to a platform-level error in the isolated process, | ||
// we should re-throw that exception in the host (this process) invocation pipeline, | ||
// so the invocation can be retried. | ||
if (completeAction.FailureDetails != null) | ||
{ | ||
this.ThrowIfPlatformLevelException(completeAction.FailureDetails); | ||
} | ||
|
||
string message = completeAction switch | ||
{ | ||
{ FailureDetails: { } f } => f.ErrorMessage, | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -138,10 +138,15 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( | |
|
||
byte[] triggerReturnValueBytes = Convert.FromBase64String(triggerReturnValue); | ||
P.OrchestratorResponse response = P.OrchestratorResponse.Parser.ParseFrom(triggerReturnValueBytes); | ||
context.SetResult( | ||
|
||
// TrySetResult may throw if a platform-level error is encountered (like an out of memory exception). | ||
context.TrySetResult( | ||
response.Actions.Select(ProtobufUtils.ToOrchestratorAction), | ||
response.CustomStatus); | ||
|
||
// Here we throw if the orchestrator completed with an application-level error. When we do this, | ||
// the function's result type will be of type `OrchestrationFailureException` which is reserved | ||
// for application-level errors that do not need to be re-tried. | ||
context.ThrowIfFailed(); | ||
}, | ||
#pragma warning restore CS0618 // Type or member is obsolete (not intended for general public use) | ||
|
@@ -159,6 +164,19 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync( | |
// Re-throw so we can abort this invocation. | ||
this.HostLifetimeService.OnStopping.ThrowIfCancellationRequested(); | ||
} | ||
|
||
// we abort the invocation on "platform level errors" such as: | ||
// - a timeout | ||
// - an out of memory exception | ||
// - a worker process exit | ||
if (functionResult.Exception is Host.FunctionTimeoutException | ||
|| functionResult.Exception?.InnerException is OutOfMemoryException // see RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling | ||
|| (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you verify that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that's how we stumbled upon this error type in the first place. We found this error by forcing a process exit ( But yeah I admit that I also don't see the explicit throw here. We can investigate further, or I can show you the runtime behavior through the debugger. |
||
{ | ||
// TODO: the `WorkerProcessExitException` type is not exposed in our dependencies, it's part of WebJobs.Host.Script. | ||
// Should we add that dependency or should it be exposed in WebJobs.Host? | ||
Comment on lines
+174
to
+177
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't love the string-based comparison, but it'll be string-based unless we add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
throw functionResult.Exception; | ||
} | ||
} | ||
catch (Exception hostRuntimeException) | ||
{ | ||
|
@@ -238,8 +256,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( | |
else | ||
{ | ||
// the function failed for some other reason | ||
|
||
string exceptionDetails = functionResult.Exception.ToString(); | ||
string exceptionDetails = functionResult.Exception?.ToString() ?? "Framework-internal message: exception details could not be extracted"; | ||
|
||
this.TraceHelper.FunctionFailed( | ||
this.Options.HubName, | ||
|
@@ -258,7 +275,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync( | |
|
||
orchestratorResult = OrchestratorExecutionResult.ForFailure( | ||
message: $"Function '{functionName}' failed with an unhandled exception.", | ||
functionResult.Exception); | ||
functionResult.Exception ?? new Exception($"Function '{functionName}' failed with an unknown unhandled exception")); | ||
} | ||
|
||
// Send the result of the orchestrator function to the DTFx dispatch pipeline. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
| ||
Microsoft Visual Studio Solution File, Format Version 12.00 | ||
# Visual Studio Version 17 | ||
VisualStudioVersion = 17.5.002.0 | ||
MinimumVisualStudioVersion = 10.0.40219.1 | ||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DotNetIsolated", "DotNetIsolated.csproj", "{B2DBA49D-9D25-46DB-8968-15D5E83B4060}" | ||
EndProject | ||
Global | ||
GlobalSection(SolutionConfigurationPlatforms) = preSolution | ||
Debug|Any CPU = Debug|Any CPU | ||
Release|Any CPU = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(ProjectConfigurationPlatforms) = postSolution | ||
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.Build.0 = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(SolutionProperties) = preSolution | ||
HideSolutionNode = FALSE | ||
EndGlobalSection | ||
GlobalSection(ExtensibilityGlobals) = postSolution | ||
SolutionGuid = {0954D7B4-582F-4F85-AE3E-5D503FB07DB1} | ||
EndGlobalSection | ||
EndGlobal |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Non-blocking nit: Any method named
TryXXX
is usually expected to return abool
value indicating whether it succeeded or failed.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, you're right. II chose
Try
to signal somehow that the method may throw, but I know the convention makes this confusing. I'll revert this change.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed here: 090f8e7