Skip to content

Commit 8a00314

Browse files
committed
Reconnect to Typha on connection failure.
- Add optional callback for reconnection-aware clients. - Adjust Typha discovery to reset after all Typhas have been tried. - Make the dedupe buffer reconnection-aware. It now - Stores off the keys that it had previously seen when it gets the OnTyphaConnectionRestarted() call. - Discards those seen keys as the resync progresses. - Synthesises deletions for KVs that weren't seen during the resync. - Recalculates the UpdateType when sending keys downstream so that the calculation graph sees a resync as a sequence of updates for existing keys. - Refactor the client so that it - Does one connection synchronously (including connection attempts to multiple Typha instances as before) - Reconnects in the background after a failure. - Sends WaitForDatastore/ResyncInProgress messages when it's doing a reconnection. - Re-uses a single connection attempt tracker so that we cycle through Typha instances on reconnect. - Various minor changes: - Add "done" channels to various components to avoid "log to testing.T after test finished" errors. - Add 32 bit random value to connection ID. Makes it a lot more greppable in logs.
1 parent a8fcd0a commit 8a00314

File tree

13 files changed

+913
-232
lines changed

13 files changed

+913
-232
lines changed

felix/daemon/daemon.go

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -591,15 +591,9 @@ configRetry:
591591
break
592592
}
593593
healthAggregator.Report(healthName, &health.HealthReport{Live: true, Ready: true})
594-
595-
supportsNodeResourceUpdates, err := typhaConnection.SupportsNodeResourceUpdates(10 * time.Second)
596-
if err != nil {
597-
time.Sleep(time.Second) // Avoid tight restart loop in case we didn't really wait 10s above.
598-
log.WithError(err).Fatal("Did not get hello message from Typha in time")
599-
return
600-
}
601-
log.Debugf("Typha supports node resource updates: %v", supportsNodeResourceUpdates)
602-
configParams.SetUseNodeResourceUpdates(supportsNodeResourceUpdates)
594+
// Up-to-date Typha client will refuse to connect unless Typha signals
595+
// that it supports node resource updates.
596+
configParams.SetUseNodeResourceUpdates(true)
603597

604598
go func() {
605599
typhaConnection.Finished.Wait()

libcalico-go/lib/backend/api/api.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,21 @@ const (
191191
UpdateTypeKVDeleted
192192
)
193193

194+
func (u UpdateType) String() string {
195+
switch u {
196+
case UpdateTypeKVUnknown:
197+
return "unknown"
198+
case UpdateTypeKVNew:
199+
return "new"
200+
case UpdateTypeKVUpdated:
201+
return "updated"
202+
case UpdateTypeKVDeleted:
203+
return "deleted"
204+
default:
205+
return fmt.Sprintf("Unknown<%v>", uint8(u))
206+
}
207+
}
208+
194209
// Interface can be implemented by anything that knows how to watch and report changes.
195210
type WatchInterface interface {
196211
// Stops watching. Will close the channel returned by ResultChan(). Releases

libcalico-go/lib/backend/syncersv1/dedupebuffer/dedupe_buffer.go

Lines changed: 121 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@ import (
1818
"container/list"
1919
"fmt"
2020
"sync"
21+
"time"
2122

2223
log "github.com/sirupsen/logrus"
2324

2425
"github.com/projectcalico/calico/libcalico-go/lib/backend/api"
2526
"github.com/projectcalico/calico/libcalico-go/lib/backend/model"
2627
"github.com/projectcalico/calico/libcalico-go/lib/set"
28+
"github.com/projectcalico/calico/typha/pkg/syncclient"
2729
)
2830

2931
// DedupeBuffer buffer implements the syncer callbacks API on its
@@ -50,7 +52,9 @@ type DedupeBuffer struct {
5052

5153
// liveResourceKeys Contains an entry for every key that we have sent to
5254
// the consumer and that we have not subsequently sent a deletion for.
53-
liveResourceKeys set.Set[string]
55+
liveResourceKeys set.Set[string]
56+
liveKeysNotSeenSinceReconnect set.Set[string]
57+
resyncStart time.Time
5458
// pendingUpdates is the queue of updates that we want to send to the
5559
// consumer. We use a linked list so that we can remove items from
5660
// the middle if they are deleted before making it off the queue.
@@ -69,11 +73,39 @@ func New() *DedupeBuffer {
6973
return d
7074
}
7175

76+
func (d *DedupeBuffer) OnTyphaConnectionRestarted() {
77+
d.lock.Lock()
78+
defer d.lock.Unlock()
79+
80+
// We're about to be sent a complete new snapshot of the data. Clear
81+
// our in-flight state and make a transient copy of the keys that we have
82+
// already sent so that we can figure out if any KVs were deleted while
83+
// we were disconnected.
84+
log.Info("Typha connection restarted, clearing pending update queue.")
85+
clear(d.keyToPendingUpdate)
86+
d.pendingUpdates = list.List{}
87+
if d.liveKeysNotSeenSinceReconnect == nil {
88+
// Not already doing a resync.
89+
d.resyncStart = time.Now()
90+
}
91+
d.liveKeysNotSeenSinceReconnect = d.liveResourceKeys.Copy()
92+
}
93+
7294
// OnStatusUpdated queues a status update to be sent to the sink.
7395
func (d *DedupeBuffer) OnStatusUpdated(status api.SyncStatus) {
7496
d.lock.Lock()
7597
defer d.lock.Unlock()
7698

99+
// Check if queue is empty before onInSyncAfterReconnection() since that
100+
// call may push things onto the queue.
101+
queueWasEmpty := d.pendingUpdates.Len() == 0
102+
103+
if status == api.InSync && d.liveKeysNotSeenSinceReconnect != nil {
104+
// We were processing a reconnection and now we're in sync. See if we
105+
// need to clean anything up.
106+
d.onInSyncAfterReconnection()
107+
}
108+
77109
// Statuses are idempotent so skip sending if the latest one in the queue
78110
// was the same.
79111
if d.mostRecentStatusReceived == status {
@@ -94,7 +126,6 @@ func (d *DedupeBuffer) OnStatusUpdated(status api.SyncStatus) {
94126
}
95127

96128
// Add the status to the queue.
97-
queueWasEmpty := d.pendingUpdates.Len() == 0
98129
d.pendingUpdates.PushBack(status)
99130
if queueWasEmpty {
100131
// Only need to signal when the first item goes on the queue.
@@ -146,40 +177,11 @@ func (d *DedupeBuffer) OnUpdatesKeysKnown(updates []api.Update, keys []string) {
146177
continue
147178
}
148179
}
149-
150-
if element, ok := d.keyToPendingUpdate[key]; ok {
151-
// Already got an in-flight update for this key.
152-
if u.Value == nil && !d.liveResourceKeys.Contains(key) {
153-
// This is a deletion, but the key in question never made it
154-
// off the queue, remove it entirely.
155-
if debug {
156-
log.WithField("key", key).Debug("Key deleted before being sent.")
157-
}
158-
delete(d.keyToPendingUpdate, key)
159-
d.pendingUpdates.Remove(element)
160-
} else {
161-
// Update to a key that's already on the queue, swap in the
162-
// most recent value.
163-
if debug {
164-
log.WithField("key", key).Debug("Key updated before being sent.")
165-
}
166-
usk := element.Value.(updateWithStringKey)
167-
usk.update = u
168-
element.Value = usk
169-
}
170-
} else {
171-
// No in-flight entry for this key. Add to queue and record that
172-
// it's in flight.
173-
if debug {
174-
log.WithField("key", key).Debug("No in flight value for key, adding to queue.")
175-
}
176-
element = d.pendingUpdates.PushBack(updateWithStringKey{
177-
key: key,
178-
update: u,
179-
})
180-
d.keyToPendingUpdate[key] = element
181-
d.peakPendingUpdatesLen = max(len(d.keyToPendingUpdate), d.peakPendingUpdatesLen)
180+
if d.liveKeysNotSeenSinceReconnect != nil {
181+
d.liveKeysNotSeenSinceReconnect.Discard(key)
182182
}
183+
184+
d.queueUpdate(key, u)
183185
}
184186
queueNowEmpty := d.pendingUpdates.Len() == 0
185187
if queueWasEmpty && !queueNowEmpty {
@@ -191,6 +193,57 @@ func (d *DedupeBuffer) OnUpdatesKeysKnown(updates []api.Update, keys []string) {
191193
}
192194
}
193195

196+
func (d *DedupeBuffer) queueUpdate(key string, u api.Update) {
197+
debug := log.IsLevelEnabled(log.DebugLevel)
198+
199+
if u.Value != nil {
200+
// A new KV or an update. Since we dedupe sequences of updates for the
201+
// same key, we need to recalculate the update type to make sense to the
202+
// downstream receiver. We do this even if the update is not on the
203+
// queue in order to handle resyncs with Typha.
204+
if d.liveResourceKeys.Contains(key) {
205+
u.UpdateType = api.UpdateTypeKVUpdated
206+
} else {
207+
u.UpdateType = api.UpdateTypeKVNew
208+
}
209+
}
210+
211+
if element, ok := d.keyToPendingUpdate[key]; ok {
212+
// Already got an in-flight update for this key.
213+
if u.Value == nil && !d.liveResourceKeys.Contains(key) {
214+
// This is a deletion, but the key in question never made it
215+
// off the queue, remove it entirely.
216+
if debug {
217+
log.WithField("key", key).Debug("Key deleted before being sent.")
218+
}
219+
delete(d.keyToPendingUpdate, key)
220+
d.pendingUpdates.Remove(element)
221+
} else {
222+
// Update to a key that's already on the queue, swap in the
223+
// most recent value.
224+
if debug {
225+
log.WithField("key", key).Debug("Key updated before being sent.")
226+
}
227+
228+
usk := element.Value.(updateWithStringKey)
229+
usk.update = u
230+
element.Value = usk
231+
}
232+
} else {
233+
// No in-flight entry for this key. Add to queue and record that
234+
// it's in flight.
235+
if debug {
236+
log.WithField("key", key).Debug("No in flight value for key, adding to queue.")
237+
}
238+
element = d.pendingUpdates.PushBack(updateWithStringKey{
239+
key: key,
240+
update: u,
241+
})
242+
d.keyToPendingUpdate[key] = element
243+
d.peakPendingUpdatesLen = max(len(d.keyToPendingUpdate), d.peakPendingUpdatesLen)
244+
}
245+
}
246+
194247
func (d *DedupeBuffer) SendToSinkForever(sink api.SyncerCallbacks) {
195248
d.lock.Lock()
196249
defer d.lock.Unlock()
@@ -314,4 +367,37 @@ func (d *DedupeBuffer) dropLockAndSendBatch(sink api.SyncerCallbacks, buf []any)
314367
}
315368
}
316369

370+
func (d *DedupeBuffer) onInSyncAfterReconnection() {
371+
defer func() {
372+
log.Infof("Resync with Typha complete; dropping resync-tracking state. Resync took %v.",
373+
time.Since(d.resyncStart).Round(time.Millisecond))
374+
d.liveKeysNotSeenSinceReconnect = nil
375+
}()
376+
377+
if d.liveKeysNotSeenSinceReconnect.Len() == 0 {
378+
return
379+
}
380+
381+
log.Infof("In sync with Typha, synthesizing deletions for %d "+
382+
"resources not seen during the resync.",
383+
d.liveKeysNotSeenSinceReconnect.Len())
384+
d.liveKeysNotSeenSinceReconnect.Iter(func(key string) error {
385+
parsedKey := model.KeyFromDefaultPath(key)
386+
if parsedKey == nil {
387+
// Not clear how this could happen since these keys came from the
388+
// set that we'd already parsed and passed downstream!
389+
log.WithField("key", key).Panic("Failed to parse key during reconnection to Typha.")
390+
}
391+
d.queueUpdate(key, api.Update{
392+
KVPair: model.KVPair{
393+
Key: parsedKey,
394+
Value: nil,
395+
},
396+
UpdateType: api.UpdateTypeKVDeleted,
397+
})
398+
return nil
399+
})
400+
}
401+
317402
var _ api.SyncerCallbacks = (*DedupeBuffer)(nil)
403+
var _ syncclient.RestartAwareCallbacks = (*DedupeBuffer)(nil)

0 commit comments

Comments
 (0)