Skip to content

Commit

Permalink
Restart cluster when needed
Browse files Browse the repository at this point in the history
  • Loading branch information
BMurri committed Feb 13, 2025
1 parent cf274cb commit cb6f6fd
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/deploy-cromwell-on-azure/Deployer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,7 @@ await Task.WhenAll(

await kubernetesManager.UpgradeValuesYamlAsync(storageAccountData, settings, containersToMount, installedVersion);
kubernetesClient = await PerformHelmDeploymentAsync(aksCluster, manualPrecommands, asyncTask);
await kubernetesManager.ProcessClusterUpdatesAsync(kubernetesClient, installedVersion);
await kubernetesManager.ProcessClusterUpdatesAsync(kubernetesClient, aksCluster, installedVersion, Execute);

await WriteNonPersonalizedFilesToStorageAccountAsync(storageAccountData);
}
Expand Down
19 changes: 18 additions & 1 deletion src/deploy-cromwell-on-azure/KubernetesManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -269,18 +269,35 @@ private static void ProcessHelmValuesUpdates(HelmValues values, Version previous
}
}

public async Task ProcessClusterUpdatesAsync(IKubernetes kubernetes, Version previousVersion)
/// <summary>
/// Applies version-specific cluster clean-up after an upgrade and restarts the AKS cluster when that clean-up requires it.
/// </summary>
/// <remarks>
/// Clean-up only runs when <c>Deployer.IsStorageInPublicCloud</c> is true and <paramref name="previousVersion"/>
/// is strictly between 5.5.0 and 5.5.3. In that case every persistent volume whose storage class is
/// <c>coa-blob-cromwell-executions</c> is deleted, and the cluster is then stopped and started again.
/// </remarks>
/// <param name="kubernetes">Kubernetes client used to list and delete persistent volumes.</param>
/// <param name="aksCluster">AKS cluster resource that is stopped and then started when a restart is needed.</param>
/// <param name="previousVersion">Version that was installed before this upgrade.</param>
/// <param name="execute">
/// Delegate used to run the restart. It is called with a progress message, the asynchronous work to perform,
/// and a boolean the call site labels <c>cancelOnException</c> (exact semantics are defined by the caller).
/// </param>
/// <returns>A task that completes once any required clean-up and restart have finished.</returns>
public async Task ProcessClusterUpdatesAsync(IKubernetes kubernetes, ContainerServiceManagedClusterResource aksCluster, Version previousVersion, Func<string, Func<Task>, bool, Task> execute)
{
    var restartCluster = false;

    if (Deployer.IsStorageInPublicCloud && (previousVersion > new Version(5, 5, 0) && previousVersion < new Version(5, 5, 3)))
    {
        // AKS is supposed to remove all unused storage classes, persistent volumes, and persistent volume claims.
        // For some reason, this persistent volume remains (even though its storage class is removed) and breaks Cromwell (not during startup but during workflow execution).

        foreach (var volume in (await kubernetes.CoreV1.ListPersistentVolumeWithHttpMessagesAsync(cancellationToken: cancellationToken)).Body)
        {
            // Ordinal comparison with the literal on the left: a missing spec or storage class name simply does not match.
            if ("coa-blob-cromwell-executions".Equals(volume.Spec?.StorageClassName, StringComparison.Ordinal))
            {
                _ = await kubernetes.CoreV1.DeletePersistentVolumeAsync(volume.Name(), orphanDependents: true, cancellationToken: cancellationToken);
                restartCluster = true; // Needed because new pod exists but the old volume incorrectly persists. Entire cluster reset is more effective at getting back to a good configuration.
            }
        }
    }

    if (restartCluster)
    {
        await execute("Restarting AKS cluster...", async () =>
            {
                _ = await aksCluster.StopAsync(Azure.WaitUntil.Completed, cancellationToken);

                // Pause between stop and start. The token is passed so a cancellation request is honored
                // immediately, the same as it would be by the Stop/Start calls around it.
                await Task.Delay(TimeSpan.FromSeconds(15), cancellationToken);

                _ = await aksCluster.StartAsync(Azure.WaitUntil.Completed, cancellationToken);
            },
            /* cancelOnException */ true);
    }
}

public async Task<Dictionary<string, string>> GetAKSSettingsAsync(Azure.ResourceManager.Storage.StorageAccountData storageAccount)
Expand Down

0 comments on commit cb6f6fd

Please sign in to comment.