diff --git a/dataplane/linux/int_dataplane.go b/dataplane/linux/int_dataplane.go index 8d37ca80e4..cc3ceced5d 100644 --- a/dataplane/linux/int_dataplane.go +++ b/dataplane/linux/int_dataplane.go @@ -29,7 +29,6 @@ import ( "github.com/projectcalico/felix/bpf/state" "github.com/projectcalico/felix/bpf/tc" - "github.com/projectcalico/felix/idalloc" "k8s.io/client-go/kubernetes" @@ -43,6 +42,8 @@ import ( cprometheus "github.com/projectcalico/libcalico-go/lib/prometheus" "github.com/projectcalico/libcalico-go/lib/set" + lclogutils "github.com/projectcalico/libcalico-go/lib/logutils" + "github.com/projectcalico/felix/bpf" "github.com/projectcalico/felix/bpf/arp" "github.com/projectcalico/felix/bpf/conntrack" @@ -60,7 +61,6 @@ import ( "github.com/projectcalico/felix/rules" "github.com/projectcalico/felix/throttle" "github.com/projectcalico/felix/wireguard" - lclogutils "github.com/projectcalico/libcalico-go/lib/logutils" ) const ( @@ -271,6 +271,8 @@ type InternalDataplane struct { endpointsSourceV4 endpointsSource ipsetsSourceV4 ipsetsSource callbacks *callbacks + + logAccumulator *LogAccumulator } const ( @@ -323,6 +325,7 @@ func NewIntDataplaneDriver(config Config) *InternalDataplane { ifaceAddrUpdates: make(chan *ifaceAddrsUpdate, 100), config: config, applyThrottle: throttle.New(10), + logAccumulator: NewLogAccumulator(), } dp.applyThrottle.Refill() // Allow the first apply() immediately. dp.ifaceMonitor.StateCallback = dp.onIfaceStateChange @@ -683,7 +686,8 @@ func NewIntDataplaneDriver(config Config) *InternalDataplane { dp.fromDataplane <- &proto.WireguardStatusUpdate{PublicKey: publicKey.String()} } return nil - }) + }, + dp.logAccumulator) dp.wireguardManager = newWireguardManager(cryptoRouteTableWireguard) dp.RegisterManager(dp.wireguardManager) // IPv4-only @@ -767,6 +771,19 @@ func NewIntDataplaneDriver(config Config) *InternalDataplane { dp.allIptablesTables = append(dp.allIptablesTables, dp.iptablesFilterTables...) dp.allIptablesTables = append(dp.allIptablesTables, dp.iptablesRawTables...) + for _, t := range dp.allIptablesTables { + t.OpReporter = dp.logAccumulator + } + for _, ips := range dp.ipSets { + ips.OpReporter = dp.logAccumulator + } + for _, r := range dp.routeTableSyncers() { + switch r := r.(type) { + case *routetable.RouteTable: + r.OpReporter = dp.logAccumulator + } + } + // Register that we will report liveness and readiness. if config.HealthAggregator != nil { log.Info("Registering to report health.") @@ -1450,7 +1467,7 @@ func (d *InternalDataplane) loopUpdatingDataplane() { log.Info("Dataplane updates no longer throttled") beingThrottled = false } - log.Info("Applying dataplane updates") + log.Debug("Applying dataplane updates") applyStart := time.Now() // Actually apply the changes to the dataplane. @@ -1464,13 +1481,16 @@ func (d *InternalDataplane) loopUpdatingDataplane() { // Dataplane is still dirty, record an error. countDataplaneSyncErrors.Inc() } - log.WithField("msecToApply", applyTime.Seconds()*1000.0).Info( + + d.logAccumulator.EndOfIteration(applyTime) + log.WithField("msecToApply", applyTime.Seconds()*1000.0).Debug( "Finished applying updates to dataplane.") if !d.doneFirstApply { log.WithField( "secsSinceStart", time.Since(processStartTime).Seconds(), ).Info("Completed first update to dataplane.") + d.logAccumulator.RecordOperation("first-update") d.doneFirstApply = true if d.config.PostInSyncCallback != nil { d.config.PostInSyncCallback() diff --git a/dataplane/linux/summary.go b/dataplane/linux/summary.go new file mode 100644 index 0000000000..6480a4e17e --- /dev/null +++ b/dataplane/linux/summary.go @@ -0,0 +1,106 @@ +// Copyright (c) 2020 Tigera, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package intdataplane + +import ( + "sort" + "strings" + "sync" + "time" + + log "github.com/sirupsen/logrus" + + "github.com/projectcalico/libcalico-go/lib/set" +) + +type DurationStat struct { + Name string + LogIfAbove time.Duration +} + +type LogAccumulator struct { + lock sync.Mutex + lastLogTime time.Time + + currentIteration *iteration + iterations []*iteration +} + +func (l *LogAccumulator) Reset() { + l.iterations = l.iterations[:0] +} + +type iteration struct { + Operations []string + Duration time.Duration +} + +func (i *iteration) RecordOperation(name string) { + i.Operations = append(i.Operations, name) +} + +func NewLogAccumulator() *LogAccumulator { + return &LogAccumulator{ + currentIteration: &iteration{}, + lastLogTime: time.Now(), + } +} + +func (l *LogAccumulator) RecordOperation(name string) { + l.lock.Lock() + defer l.lock.Unlock() + + l.currentIteration.RecordOperation(name) +} + +// EndOfIteration should be called at the end of the loop, it will trigger logging of noteworthy logs. +func (l *LogAccumulator) EndOfIteration(duration time.Duration) { + l.lock.Lock() + defer l.lock.Unlock() + + lastIteration := l.currentIteration + lastIteration.Duration = duration + l.iterations = append(l.iterations, lastIteration) + l.currentIteration = &iteration{} + if time.Since(l.lastLogTime) > time.Minute { + l.DoLog() + l.Reset() + l.lastLogTime = time.Now() + } +} + +func (l *LogAccumulator) DoLog() { + numUpdates := len(l.iterations) + allOps := set.New() + var longestIteration *iteration + var sumOfDurations time.Duration + for _, it := range l.iterations { + allOps.AddAll(it.Operations) + sumOfDurations += it.Duration + if longestIteration == nil || it.Duration > longestIteration.Duration { + longestIteration = it + } + } + if longestIteration == nil { + return + } + avgDuration := (sumOfDurations / time.Duration(numUpdates)).Round(time.Millisecond) + longestOps := longestIteration.Operations + sort.Strings(longestOps) + log.Infof("Summarising %d dataplane reconciliation loops over %v: avg=%v longest=%v (%v)", + numUpdates, time.Since(l.lastLogTime).Round(100*time.Millisecond), avgDuration, + longestIteration.Duration.Round(time.Millisecond), + strings.Join(longestOps, ",")) +} diff --git a/dataplane/linux/xdp_state.go b/dataplane/linux/xdp_state.go index 5652de5b51..c002e55126 100644 --- a/dataplane/linux/xdp_state.go +++ b/dataplane/linux/xdp_state.go @@ -169,7 +169,7 @@ func (x *xdpState) ResyncIfNeeded(ipsSourceV4 ipsetsSource) error { if i > 0 { log.Info("Retrying after an XDP update failure...") } - log.Info("Resyncing XDP state with dataplane.") + log.Debug("Resyncing XDP state with dataplane.") err = x.tryResync(newConvertingIPSetsSource(ipsSourceV4)) if err == nil { success = true @@ -558,7 +558,7 @@ func (s *xdpIPState) getIPSetMembers(setID string, ipsSource ipsetsSource) (set. func (s *xdpIPState) tryResync(common *xdpStateCommon, ipsSource ipsetsSource) error { resyncStart := time.Now() defer func() { - s.logCxt.WithField("resyncDuration", time.Since(resyncStart)).Info("Finished XDP resync.") + s.logCxt.WithField("resyncDuration", time.Since(resyncStart)).Debug("Finished XDP resync.") }() s.ipsetIDsToMembers.Clear() resyncState, err := s.newXDPResyncState(common.bpfLib, ipsSource, common.programTag, common.xdpModes) diff --git a/fv/named_ports_test.go b/fv/named_ports_test.go index 6d515d6c7a..41a4f20bd1 100644 --- a/fv/named_ports_test.go +++ b/fv/named_ports_test.go @@ -146,7 +146,6 @@ func describeNamedPortTests(testSourcePorts bool, protocol string) { }) AfterEach(func() { - if CurrentGinkgoTestDescription().Failed { log.Warn("Test failed, dumping diags...") utils.Run("docker", "logs", felix.Name) diff --git a/ipsets/ipset_defs.go b/ipsets/ipset_defs.go index 6670d1a601..f11cf52128 100644 --- a/ipsets/ipset_defs.go +++ b/ipsets/ipset_defs.go @@ -206,6 +206,15 @@ func (f IPFamily) IsValid() bool { } return false } +func (f IPFamily) Version() int { + switch f { + case IPFamilyV4: + return 4 + case IPFamilyV6: + return 6 + } + return 0 +} // IPSetMetadata contains the metadata for a particular IP set, such as its name, type and size. type IPSetMetadata struct { diff --git a/ipsets/ipsets.go b/ipsets/ipsets.go index cfcf0475a1..1345a0b6b5 100644 --- a/ipsets/ipsets.go +++ b/ipsets/ipsets.go @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2019 Tigera, Inc. All rights reserved. +// Copyright (c) 2017-2020 Tigera, Inc. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -67,6 +67,12 @@ type IPSets struct { // stderrCopy holds a copy of the the stderr emitted by ipset restore. It is reset after // each use. stderrCopy bytes.Buffer + + OpReporter OpReporter +} + +type OpReporter interface { + RecordOperation(name string) } func NewIPSets(ipVersionConfig *IPVersionConfig) *IPSets { @@ -220,7 +226,7 @@ func (s *IPSets) RemoveMembers(setID string, removedMembers []string) { // QueueResync forces a resync with the dataplane on the next ApplyUpdates() call. func (s *IPSets) QueueResync() { - s.logCxt.Info("Asked to resync with the dataplane on next update.") + s.logCxt.Debug("Asked to resync with the dataplane on next update.") s.resyncRequired = true } @@ -303,7 +309,10 @@ func (s *IPSets) ApplyUpdates() { if s.resyncRequired { // Compare our in-memory state against the dataplane and queue up // modifications to fix any inconsistencies. - s.logCxt.Info("Resyncing ipsets with dataplane.") + s.logCxt.Debug("Resyncing ipsets with dataplane.") + if s.OpReporter != nil { + s.OpReporter.RecordOperation(fmt.Sprint("resync-ipsets-v", s.IPVersionConfig.Family.Version())) + } numProblems, err := s.tryResync() if err != nil { s.logCxt.WithError(err).Warning("Failed to resync with dataplane") @@ -311,8 +320,8 @@ func (s *IPSets) ApplyUpdates() { continue } if numProblems > 0 { - s.logCxt.WithField("numProblems", numProblems).Info( - "Found inconsistencies in dataplane") + s.logCxt.WithField("numProblems", numProblems).Warn( + "Found inconsistencies in IP sets in dataplane") } s.resyncRequired = false } @@ -353,7 +362,7 @@ func (s *IPSets) tryResync() (numProblems int, err error) { s.logCxt.WithFields(log.Fields{ "resyncDuration": time.Since(resyncStart), "numInconsistenciesFound": numProblems, - }).Info("Finished resync") + }).Debug("Finished IPSets resync") }() // Start an 'ipset list' child process, which will emit output of the following form: @@ -622,6 +631,10 @@ func (s *IPSets) tryUpdates() error { return nil } + if s.OpReporter != nil { + s.OpReporter.RecordOperation(fmt.Sprint("update-ipsets-", s.IPVersionConfig.Family.Version())) + } + // Set up an ipset restore session. countNumIPSetCalls.Inc() cmd := s.newCmd("ipset", "restore") diff --git a/iptables/table.go b/iptables/table.go index 1125b5f994..6eeb8dea89 100644 --- a/iptables/table.go +++ b/iptables/table.go @@ -38,6 +38,10 @@ const ( minPostWriteInterval = 50 * time.Millisecond ) +type OpReporter interface { + RecordOperation(name string) +} + var ( // List of all the top-level kernel-created chains by iptables table. tableToKernelChains = map[string][]string{ @@ -275,6 +279,7 @@ type Table struct { lookPath func(file string) (string, error) onStillAlive func() + OpReporter OpReporter } type TableOptions struct { @@ -593,7 +598,10 @@ func (t *Table) loadDataplaneState() { t.featureDetector.RefreshFeatures() // Load the hashes from the dataplane. - t.logCxt.Info("Loading current iptables state and checking it is correct.") + t.logCxt.Debug("Loading current iptables state and checking it is correct.") + if t.OpReporter != nil { + t.OpReporter.RecordOperation(fmt.Sprintf("resync-%v-v%d", t.Name, t.IPVersion)) + } t.lastReadTime = t.timeNow() dataplaneHashes, dataplaneRules := t.getHashesAndRulesFromDataplane() @@ -931,7 +939,7 @@ func (t *Table) InvalidateDataplaneCache(reason string) { logCxt.Debug("Would invalidate dataplane cache but it was already invalid.") return } - logCxt.Info("Invalidating dataplane cache") + logCxt.Debug("Invalidating dataplane cache") t.inSyncWithDataPlane = false } @@ -1259,6 +1267,9 @@ func (t *Table) applyUpdates() error { } else { // Get the contents of the buffer ready to send to iptables-restore. Warning: for perf, this is directly // accessing the buffer's internal array; don't touch the buffer after this point. + if t.OpReporter != nil { + t.OpReporter.RecordOperation(fmt.Sprintf("update-%v-v%d", t.Name, t.IPVersion)) + } inputBytes := buf.GetBytesAndReset() if log.GetLevel() >= log.DebugLevel { diff --git a/k8sfv/pod_test.go b/k8sfv/pod_test.go index 6fb2d9550a..3237b59f05 100644 --- a/k8sfv/pod_test.go +++ b/k8sfv/pod_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2017 Tigera, Inc. All rights reserved. +// Copyright (c) 2017,2020 Tigera, Inc. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -21,8 +21,8 @@ import ( . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" log "github.com/sirupsen/logrus" - "k8s.io/client-go/kubernetes" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" ) var _ = Context("with a k8s clientset", func() { diff --git a/routerule/route_rule.go b/routerule/route_rule.go index 7a12620f90..3e69291e24 100644 --- a/routerule/route_rule.go +++ b/routerule/route_rule.go @@ -16,6 +16,7 @@ package routerule import ( "errors" + "fmt" "time" log "github.com/sirupsen/logrus" @@ -74,6 +75,12 @@ type RouteRules struct { // Testing shims, swapped with mock versions for UT newNetlinkHandle func() (HandleIface, error) + + OpReporter OpReporter +} + +type OpReporter interface { + RecordOperation(name string) } func New(ipVersion int, priority int, tableIndexSet set.Set, updateFunc, removeFunc RulesMatchFunc, netlinkTimeout time.Duration) (*RouteRules, error) { @@ -169,7 +176,7 @@ func (r *RouteRules) RemoveRule(rule *Rule) { } func (r *RouteRules) QueueResync() { - r.logCxt.Info("Queueing a resync of routing rules.") + r.logCxt.Debug("Queueing a resync of routing rules.") r.inSync = false } @@ -227,6 +234,10 @@ func (r *RouteRules) Apply() error { return nil } + if r.OpReporter != nil { + r.OpReporter.RecordOperation(fmt.Sprint("resync-rules-v", r.IPVersion)) + } + nl, err := r.getNetlinkHandle() if err != nil { r.logCxt.WithError(err).Error("Failed to connect to netlink, retrying...") diff --git a/routetable/route_table.go b/routetable/route_table.go index e511cad66b..e04cc57314 100644 --- a/routetable/route_table.go +++ b/routetable/route_table.go @@ -16,6 +16,7 @@ package routetable import ( "errors" + "fmt" "net" "reflect" "regexp" @@ -184,6 +185,12 @@ type RouteTable struct { addStaticARPEntry func(cidr ip.CIDR, destMAC net.HardwareAddr, ifaceName string) error conntrack conntrackIface time timeshim.Interface + + OpReporter OpReporter +} + +type OpReporter interface { + RecordOperation(name string) } func New( @@ -395,7 +402,7 @@ func (r *RouteTable) SetL2Routes(ifaceName string, targets []L2Target) { } func (r *RouteTable) QueueResync() { - r.logCxt.Info("Queueing a resync of routing table.") + r.logCxt.Debug("Queueing a resync of routing table.") r.reSync = true } @@ -441,6 +448,9 @@ func (r *RouteTable) closeNetlink() { func (r *RouteTable) Apply() error { if r.reSync { + if r.OpReporter != nil { + r.OpReporter.RecordOperation(fmt.Sprint("resync-routes-v", r.ipVersion)) + } listStartTime := time.Now() nl, err := r.getNetlink() diff --git a/wireguard/wireguard.go b/wireguard/wireguard.go index ec205b55a2..d2e711f2bb 100644 --- a/wireguard/wireguard.go +++ b/wireguard/wireguard.go @@ -24,13 +24,14 @@ import ( "github.com/vishvananda/netlink" "golang.zx2c4.com/wireguard/wgctrl/wgtypes" + "github.com/projectcalico/libcalico-go/lib/set" + "github.com/projectcalico/felix/ifacemonitor" "github.com/projectcalico/felix/ip" "github.com/projectcalico/felix/netlinkshim" "github.com/projectcalico/felix/routerule" "github.com/projectcalico/felix/routetable" "github.com/projectcalico/felix/timeshim" - "github.com/projectcalico/libcalico-go/lib/set" ) const ( @@ -153,6 +154,7 @@ type Wireguard struct { // Callback function used to notify of public key updates for the local nodeData statusCallback func(publicKey wgtypes.Key) error + opReporter routerule.OpReporter } func New( @@ -161,6 +163,7 @@ func New( netlinkTimeout time.Duration, deviceRouteProtocol int, statusCallback func(publicKey wgtypes.Key) error, + opReporter routerule.OpReporter, ) *Wireguard { return NewWithShims( hostname, @@ -173,6 +176,7 @@ func New( timeshim.RealTime(), deviceRouteProtocol, statusCallback, + opReporter, ) } @@ -188,6 +192,7 @@ func NewWithShims( timeShim timeshim.Interface, deviceRouteProtocol int, statusCallback func(publicKey wgtypes.Key) error, + opReporter routerule.OpReporter, ) *Wireguard { // Create routetable. We provide dummy callbacks for ARP and conntrack processing. rt := routetable.NewWithShims( @@ -199,9 +204,9 @@ func NewWithShims( func(cidr ip.CIDR, destMAC net.HardwareAddr, ifaceName string) error { return nil }, // addStaticARPEntry &noOpConnTrack{}, timeShim, - nil, //deviceRouteSourceAddress + nil, // deviceRouteSourceAddress deviceRouteProtocol, - true, //removeExternalRoutes + true, // removeExternalRoutes config.RoutingTableIndex, ) // Create routerule. @@ -220,6 +225,9 @@ func NewWithShims( // Wireguard is enabled, but could not create a routerule manager. This is unexpected. log.WithError(err).Panic("Unexpected error creating rule manager") } + if rr != nil { + rr.OpReporter = opReporter + } return &Wireguard{ hostname: hostname, @@ -236,6 +244,7 @@ func NewWithShims( statusCallback: statusCallback, localIPs: set.New(), localCIDRs: set.New(), + opReporter: opReporter, } } @@ -531,7 +540,10 @@ func (w *Wireguard) EndpointWireguardRemove(name string) { } func (w *Wireguard) QueueResync() { - log.Info("Queueing a resync of wireguard configuration") + log.Debug("Queueing a resync of wireguard configuration") + if w.opReporter != nil { + w.opReporter.RecordOperation("resync-wg") + } // Flag for resync to ensure everything is still configured correctly. // No need to resync the key. This will happen if the dataplane resync detects an inconsistency. diff --git a/wireguard/wireguard_test.go b/wireguard/wireguard_test.go index 025faf6472..0d5e04c440 100644 --- a/wireguard/wireguard_test.go +++ b/wireguard/wireguard_test.go @@ -172,6 +172,7 @@ var _ = Describe("Enable wireguard", func() { t, FelixRouteProtocol, s.status, + nil, ) rule = netlink.NewRule() @@ -1296,6 +1297,7 @@ var _ = Describe("Wireguard (disabled)", func() { t, FelixRouteProtocol, s.status, + nil, ) }) @@ -1471,6 +1473,7 @@ var _ = Describe("Wireguard (with no table index)", func() { t, FelixRouteProtocol, s.status, + nil, ) } })