Skip to content

Commit 8b23150

Browse files
authored
RSDK-9588 - Take reconfigurationLock at a higher level (viamrobotics#4645)
1 parent 4cda320 commit 8b23150

File tree

3 files changed

+25
-17
lines changed

3 files changed

+25
-17
lines changed

robot/impl/local_robot.go

+19-1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ type localRobot struct {
5858
cloudConnSvc icloud.ConnectionService
5959
logger logging.Logger
6060
activeBackgroundWorkers sync.WaitGroup
61+
62+
// reconfigurationLock manages access to the resource graph and nodes. If either may change, this lock should be taken.
63+
reconfigurationLock sync.Mutex
6164
// reconfigureWorkers tracks goroutines spawned by reconfiguration functions. we only
6265
// wait on this group in tests to prevent goleak-related failures. however, we do not
6366
// wait on this group outside of testing, since the related goroutines may be running
@@ -172,7 +175,9 @@ func (r *localRobot) Close(ctx context.Context) error {
172175
err = multierr.Combine(err, r.cloudConnSvc.Close(ctx))
173176
}
174177
if r.manager != nil {
178+
r.reconfigurationLock.Lock()
175179
err = multierr.Combine(err, r.manager.Close(ctx))
180+
r.reconfigurationLock.Unlock()
176181
}
177182
if r.packageManager != nil {
178183
err = multierr.Combine(err, r.packageManager.Close(ctx))
@@ -307,6 +312,7 @@ func (r *localRobot) completeConfigWorker() {
307312
trigger = "remote"
308313
r.logger.CDebugw(r.closeContext, "configuration attempt triggered by remote")
309314
}
315+
r.reconfigurationLock.Lock()
310316
anyChanges := r.manager.updateRemotesResourceNames(r.closeContext)
311317
if r.manager.anyResourcesNotConfigured() {
312318
anyChanges = true
@@ -316,6 +322,7 @@ func (r *localRobot) completeConfigWorker() {
316322
r.updateWeakDependents(r.closeContext)
317323
r.logger.CDebugw(r.closeContext, "configuration attempt completed with changes", "trigger", trigger)
318324
}
325+
r.reconfigurationLock.Unlock()
319326
}
320327
}
321328

@@ -440,6 +447,11 @@ func newWithResources(
440447
if err != nil {
441448
return nil, err
442449
}
450+
451+
// now that we're changing the resource graph, take the reconfigurationLock so
452+
// that other goroutines can't interleave
453+
r.reconfigurationLock.Lock()
454+
defer r.reconfigurationLock.Unlock()
443455
if err := r.manager.resources.AddNode(
444456
web.InternalServiceName,
445457
resource.NewConfiguredGraphNode(resource.Config{}, r.webSvc, builtinModel)); err != nil {
@@ -497,7 +509,7 @@ func newWithResources(
497509
}, r.activeBackgroundWorkers.Done)
498510
}
499511

500-
r.Reconfigure(ctx, cfg)
512+
r.reconfigure(ctx, cfg, false)
501513

502514
for name, res := range resources {
503515
node := resource.NewConfiguredGraphNode(resource.Config{}, res, unknownModel)
@@ -529,6 +541,8 @@ func New(
529541
func (r *localRobot) removeOrphanedResources(ctx context.Context,
530542
rNames []resource.Name,
531543
) {
544+
r.reconfigurationLock.Lock()
545+
defer r.reconfigurationLock.Unlock()
532546
r.manager.markResourcesRemoved(rNames, nil)
533547
if err := r.manager.removeMarkedAndClose(ctx, nil); err != nil {
534548
r.logger.CErrorw(ctx, "error removing and closing marked resources",
@@ -1096,6 +1110,8 @@ func dialRobotClient(
10961110
// a best effort to remove no longer in use parts, but if it fails to do so, they could
10971111
// possibly leak resources. The given config may be modified by Reconfigure.
10981112
func (r *localRobot) Reconfigure(ctx context.Context, newConfig *config.Config) {
1113+
r.reconfigurationLock.Lock()
1114+
defer r.reconfigurationLock.Unlock()
10991115
r.reconfigure(ctx, newConfig, false)
11001116
}
11011117

@@ -1361,6 +1377,8 @@ func (r *localRobot) restartSingleModule(ctx context.Context, mod *config.Module
13611377
Modified: &config.ModifiedConfigDiff{},
13621378
Removed: &config.Config{},
13631379
}
1380+
r.reconfigurationLock.Lock()
1381+
defer r.reconfigurationLock.Unlock()
13641382
// note: if !isRunning (i.e. the module is in config but it crashed), putting it in diff.Modified
13651383
// results in a no-op; we use .Added instead.
13661384
if isRunning {

robot/impl/local_robot_test.go

+5-3
Original file line numberDiff line numberDiff line change
@@ -4435,8 +4435,7 @@ func TestRemovingOfflineRemote(t *testing.T) {
44354435
// prevents that behavior and removes the remote correctly.
44364436
func TestRemovingOfflineRemotes(t *testing.T) {
44374437
// Close the robot to stop the background workers from processing any messages to triggerConfig
4438-
r := setupLocalRobot(t, context.Background(), &config.Config{}, logging.NewTestLogger(t))
4439-
r.Close(context.Background())
4438+
r := setupLocalRobot(t, context.Background(), &config.Config{}, logging.NewTestLogger(t), withDisableCompleteConfigWorker())
44404439
localRobot := r.(*localRobot)
44414440

44424441
// Create a context that we can cancel to simulate the remote connection timeout
@@ -4469,6 +4468,9 @@ func TestRemovingOfflineRemotes(t *testing.T) {
44694468
wg.Add(1)
44704469
go func() {
44714470
defer wg.Done()
4471+
// manually grab the lock as completeConfig doesn't grab a lock
4472+
localRobot.reconfigurationLock.Lock()
4473+
defer localRobot.reconfigurationLock.Unlock()
44724474
localRobot.manager.completeConfig(ctxCompleteConfig, localRobot, false)
44734475
}()
44744476

@@ -4487,7 +4489,7 @@ func TestRemovingOfflineRemotes(t *testing.T) {
44874489
// Ensure that the remote is not marked for removal while trying to connect to the remote
44884490
remote, ok := localRobot.manager.resources.Node(remoteName)
44894491
test.That(t, ok, test.ShouldBeTrue)
4490-
test.That(t, remote.MarkedForRemoval(), test.ShouldBeTrue)
4492+
test.That(t, remote.MarkedForRemoval(), test.ShouldBeFalse)
44914493

44924494
// Simulate a timeout by canceling the context while trying to connect to the remote
44934495
cancelCompleteConfig()

robot/impl/resource_manager.go

+1-13
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"os"
99
"reflect"
1010
"strings"
11-
"sync"
1211
"time"
1312

1413
"github.com/jhump/protoreflect/desc"
@@ -55,9 +54,7 @@ type resourceManager struct {
5554
opts resourceManagerOptions
5655
logger logging.Logger
5756

58-
// resourceGraphLock manages access to the resource graph and nodes. If either may change, this lock should be taken.
59-
resourceGraphLock sync.Mutex
60-
viz resource.Visualizer
57+
viz resource.Visualizer
6158
}
6259

6360
type resourceManagerOptions struct {
@@ -320,9 +317,6 @@ func (manager *resourceManager) updateRemoteResourceNames(
320317
}
321318

322319
func (manager *resourceManager) updateRemotesResourceNames(ctx context.Context) bool {
323-
manager.resourceGraphLock.Lock()
324-
defer manager.resourceGraphLock.Unlock()
325-
326320
anythingChanged := false
327321
for _, name := range manager.resources.Names() {
328322
gNode, _ := manager.resources.Node(name)
@@ -552,9 +546,7 @@ func (manager *resourceManager) removeMarkedAndClose(
552546
ctx context.Context,
553547
excludeFromClose map[resource.Name]struct{},
554548
) error {
555-
manager.resourceGraphLock.Lock()
556549
defer func() {
557-
defer manager.resourceGraphLock.Unlock()
558550
if err := manager.viz.SaveSnapshot(manager.resources); err != nil {
559551
manager.logger.Warnw("failed to save graph snapshot", "error", err)
560552
}
@@ -608,12 +600,10 @@ func (manager *resourceManager) completeConfig(
608600
lr *localRobot,
609601
forceSync bool,
610602
) {
611-
manager.resourceGraphLock.Lock()
612603
defer func() {
613604
if err := manager.viz.SaveSnapshot(manager.resources); err != nil {
614605
manager.logger.Warnw("failed to save graph snapshot", "error", err)
615606
}
616-
manager.resourceGraphLock.Unlock()
617607
}()
618608

619609
// first handle remotes since they may reveal unresolved dependencies
@@ -1127,8 +1117,6 @@ func (manager *resourceManager) updateResources(
11271117
ctx context.Context,
11281118
conf *config.Diff,
11291119
) error {
1130-
manager.resourceGraphLock.Lock()
1131-
defer manager.resourceGraphLock.Unlock()
11321120
var allErrs error
11331121

11341122
// modules are not added into the resource tree as they belong to the module manager

0 commit comments

Comments
 (0)