Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry platform-level errors in the isolated process for .NET isolated #2922

Merged
merged 16 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 75 additions & 3 deletions .github/workflows/smoketest-dotnet-isolated-v4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,79 @@ jobs:
steps:
- uses: actions/checkout@v2

# Validation is blocked on https://github.com/Azure/azure-functions-host/issues/7995
- name: Run V4 .NET Isolated Smoke Test
run: test/SmokeTests/e2e-test.ps1 -DockerfilePath test/SmokeTests/OOProcSmokeTests/DotNetIsolated/Dockerfile -HttpStartPath api/StartHelloCitiesTyped -NoValidation
# Install .NET versions
- name: Set up .NET Core 3.1
uses: actions/setup-dotnet@v3
with:
dotnet-version: '3.1.x'

- name: Set up .NET Core 2.1
uses: actions/setup-dotnet@v3
with:
dotnet-version: '2.1.x'

- name: Set up .NET Core 6.x
uses: actions/setup-dotnet@v3
with:
dotnet-version: '6.x'

- name: Set up .NET Core 8.x
uses: actions/setup-dotnet@v3
with:
dotnet-version: '8.x'

# Install Azurite
- name: Set up Node.js (needed for Azurite)
uses: actions/setup-node@v3
with:
node-version: '18.x' # Azurite requires at least Node 18

- name: Install Azurite
run: npm install -g azurite

- name: Restore WebJobs extension
run: dotnet restore $solution

- name: Build and pack WebJobs extension
run: cd ./src/WebJobs.Extensions.DurableTask &&
mkdir ./out &&
dotnet build -c Release WebJobs.Extensions.DurableTask.csproj --output ./out &&
mkdir ~/packages &&
dotnet nuget push ./out/Microsoft.Azure.WebJobs.Extensions.DurableTask.*.nupkg --source ~/packages &&
dotnet nuget add source ~/packages

- name: Build .NET Isolated Smoke Test
run: cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated &&
dotnet restore --verbosity normal &&
dotnet build -c Release

- name: Install core tools
run: npm i -g azure-functions-core-tools@4 --unsafe-perm true

# Run smoke tests
# Unlike other smoke tests, the .NET isolated smoke tests run outside of a docker container, but to race conditions
# when building the smoke test app in docker, causing the build to fail. This is a temporary workaround until the
# root cause is identified and fixed.

- name: Run smoke tests (Hello Cities)
shell: pwsh
run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/StartHelloCitiesTyped

- name: Run smoke tests (Process Exit)
shell: pwsh
run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartProcessExitOrchestrator

- name: Run smoke tests (Timeout)
shell: pwsh
run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartTimeoutOrchestrator

- name: Run smoke tests (OOM)
shell: pwsh
run: azurite --silent --blobPort 10000 --queuePort 10001 --tablePort 10002 &
cd ./test/SmokeTests/OOProcSmokeTests/DotNetIsolated && func host start --port 7071 &
./test/SmokeTests/OOProcSmokeTests/DotNetIsolated/run-smoke-tests.ps1 -HttpStartPath api/durable_HttpStartOOMOrchestrator
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,15 @@ internal bool TryGetOrchestrationErrorDetails(out Exception? failure)
return hasError;
}

internal void SetResult(IEnumerable<OrchestratorAction> actions, string customStatus)
internal void TrySetResult(IEnumerable<OrchestratorAction> actions, string customStatus)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non-blocking nit: Any method named TryXXX is usually expected to return a bool value indicating whether it succeeded or failed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, you're right. II chose Try to signal somehow that the method may throw, but I know the convention makes this confusing. I'll revert this change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed here: 090f8e7

{
var result = new OrchestratorExecutionResult
{
CustomStatus = customStatus,
Actions = actions,
};

this.SetResultInternal(result);
this.TrySetResultInternal(result);
}

// TODO: This method should be considered deprecated because SDKs should no longer be returning results as JSON.
Expand Down Expand Up @@ -117,10 +117,39 @@ internal void SetResult(string orchestratorResponseJsonText)
innerException: jsonReaderException);
}

this.SetResultInternal(result);
this.TrySetResultInternal(result);
}

private void SetResultInternal(OrchestratorExecutionResult result)
/// <summary>
/// Recursively inspect the FailureDetails of the failed orchestrator and throw if a platform-level exception is detected.
/// </summary>
/// <remarks>
/// Today, this method only checks for <see cref="OutOfMemoryException"/>. In the future, we may want to add more cases.
/// Other known platform-level exceptions, like timeouts or process exists due to `Environment.FailFast`, do not yield
/// a `OrchestratorExecutionResult` as the isolated invocation is abruptly terminated. Therefore, they don't need to be
/// handled in this method.
/// However, our tests reveal that OOMs are, surprisngly, caught and returned as a `OrchestratorExecutionResult`
/// by the isolated process, and thus need special handling.
/// It's unclear if all OOMs are caught by the isolated process (probably not), and also if there are other platform-level
/// errors that are also caught in the isolated process and returned as a `OrchestratorExecutionResult`. Let's add them
/// to this method as we encounter them.
/// </remarks>
/// <param name="failureDetails">The failure details of the orchestrator.</param>
/// <exception cref="OutOfMemoryException">If an OOM error is detected.</exception>
private void ThrowIfPlatformLevelException(FailureDetails failureDetails)
{
if (failureDetails.InnerFailure?.IsCausedBy<OutOfMemoryException>() ?? false)
{
throw new OutOfMemoryException(failureDetails.ErrorMessage);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Manually throwing OutOfMemoryException is a bit of a red flag. It's really only supposed to be thrown by the runtime. It's partly problematic because there are various pieces of code in .NET and elsewhere that treat OOM and similar "fatal" exceptions differently. In some cases, I think it's supposed to crash the process.

I strongly recommend we throw something else, like SessionAbortedException, which is designed for this exact scenario.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the pointer. I'm now throwing SessionAbortedException, see: 090f8e7

}

if (failureDetails.InnerFailure != null)
{
this.ThrowIfPlatformLevelException(failureDetails.InnerFailure);
}
}

private void TrySetResultInternal(OrchestratorExecutionResult result)
{
// Look for an orchestration completion action to see if we need to grab the output.
foreach (OrchestratorAction action in result.Actions)
Expand All @@ -133,6 +162,14 @@ private void SetResultInternal(OrchestratorExecutionResult result)

if (completeAction.OrchestrationStatus == OrchestrationStatus.Failed)
{
// If the orchestrator failed due to a platform-level error in the isolated process,
// we should re-throw that exception in the host (this process) invocation pipeline,
// so the invocation can be retried.
if (completeAction.FailureDetails != null)
{
this.ThrowIfPlatformLevelException(completeAction.FailureDetails);
}

string message = completeAction switch
{
{ FailureDetails: { } f } => f.ErrorMessage,
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 21 additions & 4 deletions src/WebJobs.Extensions.DurableTask/OutOfProcMiddleware.cs
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,15 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync(

byte[] triggerReturnValueBytes = Convert.FromBase64String(triggerReturnValue);
P.OrchestratorResponse response = P.OrchestratorResponse.Parser.ParseFrom(triggerReturnValueBytes);
context.SetResult(

// TrySetResult may throw if a platform-level error is encountered (like an out of memory exception).
context.TrySetResult(
response.Actions.Select(ProtobufUtils.ToOrchestratorAction),
response.CustomStatus);

// Here we throw if the orchestrator completed with an application-level error. When we do this,
// the function's result type will be of type `OrchestrationFailureException` which is reserved
// for application-level errors that do not need to be re-tried.
context.ThrowIfFailed();
},
#pragma warning restore CS0618 // Type or member is obsolete (not intended for general public use)
Expand All @@ -159,6 +164,19 @@ await this.LifeCycleNotificationHelper.OrchestratorStartingAsync(
// Re-throw so we can abort this invocation.
this.HostLifetimeService.OnStopping.ThrowIfCancellationRequested();
}

// we abort the invocation on "platform level errors" such as:
// - a timeout
// - an out of memory exception
// - a worker process exit
if (functionResult.Exception is Host.FunctionTimeoutException
|| functionResult.Exception?.InnerException is OutOfMemoryException // see RemoteOrchestrationContext.TrySetResultInternal for details on OOM-handling
|| (functionResult.Exception?.InnerException?.GetType().ToString().Contains("WorkerProcessExitException") ?? false))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you verify that WorkerProcessExitException is actually available here? Looking at the function host code, I don't see the exception actually thrown: https://github.com/Azure/azure-functions-host/blob/1088f24c3ae3a6275f18cbf091fa525c2477be91/src/WebJobs.Script/Workers/ProcessManagement/WorkerProcess.cs#L179-L184

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's how we stumbled upon this error type in the first place. We found this error by forcing a process exit (Environment.FailFast) and placing a breakpoint at this position.

But yeah I admit that I also don't see the explicit throw here. We can investigate further, or I can show you the runtime behavior through the debugger.

{
// TODO: the `WorkerProcessExitException` type is not exposed in our dependencies, it's part of WebJobs.Host.Script.
// Should we add that dependency or should it be exposed in WebJobs.Host?
Comment on lines +174 to +177
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't love the string-based comparison, but it'll be string-based unless we add WebJobs.Host.Script to our dependencies. Open to a discussion here, but please note we have a customer waiting so let's look to be practical. I have no strong feelings, just trying to deploy this fix.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WebJobs.Host.Script is not available for public consumption. We will need to work with the host team / WebJobs SDK to expose this information to extensions.

throw functionResult.Exception;
}
}
catch (Exception hostRuntimeException)
{
Expand Down Expand Up @@ -238,8 +256,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync(
else
{
// the function failed for some other reason

string exceptionDetails = functionResult.Exception.ToString();
string exceptionDetails = functionResult.Exception?.ToString() ?? "Framework-internal message: exception details could not be extracted";

this.TraceHelper.FunctionFailed(
this.Options.HubName,
Expand All @@ -258,7 +275,7 @@ await this.LifeCycleNotificationHelper.OrchestratorFailedAsync(

orchestratorResult = OrchestratorExecutionResult.ForFailure(
message: $"Function '{functionName}' failed with an unhandled exception.",
functionResult.Exception);
functionResult.Exception ?? new Exception($"Function '{functionName}' failed with an unknown unhandled exception"));
}

// Send the result of the orchestrator function to the DTFx dispatch pipeline.
Expand Down
25 changes: 25 additions & 0 deletions test/SmokeTests/OOProcSmokeTests/DotNetIsolated/DotNetIsolated.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.5.002.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DotNetIsolated", "DotNetIsolated.csproj", "{B2DBA49D-9D25-46DB-8968-15D5E83B4060}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B2DBA49D-9D25-46DB-8968-15D5E83B4060}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {0954D7B4-582F-4F85-AE3E-5D503FB07DB1}
EndGlobalSection
EndGlobal
Loading
Loading