Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mutex debug #866

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
e6eeef7
mutex_debug
wadey May 8, 2023
afde208
Merge remote-tracking branch 'origin/master' into mutex-debug
wadey May 9, 2023
3e5e48f
use mutex_debug during Github Actions run
wadey May 9, 2023
9105eba
also validate hostinfo locks
wadey May 9, 2023
90e9a8e
use delete
wadey May 9, 2023
a83f0ca
Merge remote-tracking branch 'origin/master' into mutex-debug
wadey May 9, 2023
e578977
keep track of what file/line the locks were grabbed on
wadey May 9, 2023
92c4245
Merge remote-tracking branch 'origin/master' into mutex-debug
wadey May 9, 2023
5cc43ea
Merge branch 'master' into mutex-debug
wadey Aug 21, 2023
4c89b3c
cleanup
wadey Aug 21, 2023
fdb7804
Merge remote-tracking branch 'origin/master' into mutex-debug
wadey Dec 17, 2023
5ce8279
update to work with the latest locks
wadey Dec 19, 2023
4d88c07
gofmt
wadey Dec 19, 2023
540a171
WIP more locks
wadey Dec 19, 2023
bcaefce
more types
wadey Dec 19, 2023
6f27f46
simplify
wadey Dec 19, 2023
26f7a9f
use terraform dag impl
wadey Dec 19, 2023
91ec6bb
Merge remote-tracking branch 'origin/master' into mutex-debug
wadey Dec 19, 2023
94dd14c
Merge remote-tracking branch 'origin/master' into mutex-debug
wadey Jan 31, 2024
1be8dc4
more
wadey Feb 5, 2024
0ccfad1
Merge remote-tracking branch 'origin/master' into mutex-debug
wadey Apr 11, 2024
c7f1bed
avoid deadlock in lighthouse queryWorker
wadey Apr 11, 2024
2ff26b2
need to hold lock during cacheCb
wadey Apr 11, 2024
f225164
chanDebug
wadey Apr 11, 2024
dffaaf3
Merge branch 'lighthouse-query-chan-lock' into mutex-debug
wadey Apr 11, 2024
2030cbf
Merge remote-tracking branch 'origin/master' into mutex-debug
wadey May 28, 2024
1704d7f
allow more locks
wadey May 28, 2024
77eced3
run smoke test with mutex_debug
wadey May 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/smoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
check-latest: true

- name: build
run: make bin-docker CGO_ENABLED=1 BUILD_ARGS=-race
run: make bin-docker CGO_ENABLED=1 BUILD_ARGS="-race -tags=mutex_debug"

- name: setup docker image
working-directory: ./.github/workflows/smoke
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
run: make test

- name: End 2 end
run: make e2evv
run: make e2e-mutex-debug TEST_LOGS=1 TEST_FLAGS=-v

- name: Build test mobile
run: make build-test-mobile
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ ALL = $(ALL_LINUX) \
e2e:
$(TEST_ENV) go test -tags=e2e_testing -count=1 $(TEST_FLAGS) ./e2e

e2e-mutex-debug:
$(TEST_ENV) go test -tags=mutex_debug,e2e_testing -count=1 $(TEST_FLAGS) ./e2e

e2ev: TEST_FLAGS = -v
e2ev: e2e

Expand Down Expand Up @@ -206,6 +209,7 @@ ifeq ($(words $(MAKECMDGOALS)),1)
@$(MAKE) service ${.DEFAULT_GOAL} --no-print-directory
endif

bin-docker: BUILD_ARGS = -tags=mutex_debug
bin-docker: bin build/linux-amd64/nebula build/linux-amd64/nebula-cert

smoke-docker: bin-docker
Expand Down
15 changes: 7 additions & 8 deletions connection_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package nebula
import (
"bytes"
"context"
"sync"
"time"

"github.com/rcrowley/go-metrics"
Expand All @@ -28,14 +27,14 @@ const (

type connectionManager struct {
in map[uint32]struct{}
inLock *sync.RWMutex
inLock syncRWMutex

out map[uint32]struct{}
outLock *sync.RWMutex
outLock syncRWMutex

// relayUsed holds which relay localIndexs are in use
relayUsed map[uint32]struct{}
relayUsedLock *sync.RWMutex
relayUsedLock syncRWMutex

hostMap *HostMap
trafficTimer *LockingTimerWheel[uint32]
Expand All @@ -60,12 +59,12 @@ func newConnectionManager(ctx context.Context, l *logrus.Logger, intf *Interface
nc := &connectionManager{
hostMap: intf.hostMap,
in: make(map[uint32]struct{}),
inLock: &sync.RWMutex{},
inLock: newSyncRWMutex("connection-manager-in"),
out: make(map[uint32]struct{}),
outLock: &sync.RWMutex{},
outLock: newSyncRWMutex("connection-manager-out"),
relayUsed: make(map[uint32]struct{}),
relayUsedLock: &sync.RWMutex{},
trafficTimer: NewLockingTimerWheel[uint32](time.Millisecond*500, max),
relayUsedLock: newSyncRWMutex("connection-manager-relay-used"),
trafficTimer: NewLockingTimerWheel[uint32]("connection-manager-timer", time.Millisecond*500, max),
intf: intf,
pendingDeletion: make(map[uint32]struct{}),
checkInterval: checkInterval,
Expand Down
4 changes: 2 additions & 2 deletions connection_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package nebula
import (
"crypto/rand"
"encoding/json"
"sync"
"sync/atomic"

"github.com/flynn/noise"
Expand All @@ -23,7 +22,7 @@ type ConnectionState struct {
initiator bool
messageCounter atomic.Uint64
window *Bits
writeLock sync.Mutex
writeLock syncMutex
}

func NewConnectionState(l *logrus.Logger, cipher string, certState *CertState, initiator bool, pattern noise.HandshakePattern, psk []byte, pskStage int) *ConnectionState {
Expand Down Expand Up @@ -71,6 +70,7 @@ func NewConnectionState(l *logrus.Logger, cipher string, certState *CertState, i
initiator: initiator,
window: b,
myCert: certState.Certificate,
writeLock: newSyncMutex("connection-state-write"),
}

return ci
Expand Down
8 changes: 4 additions & 4 deletions dns_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"net"
"strconv"
"strings"
"sync"

"github.com/miekg/dns"
"github.com/sirupsen/logrus"
Expand All @@ -20,15 +19,16 @@ var dnsServer *dns.Server
var dnsAddr string

type dnsRecords struct {
sync.RWMutex
syncRWMutex
dnsMap map[string]string
hostMap *HostMap
}

func newDnsRecords(hostMap *HostMap) *dnsRecords {
return &dnsRecords{
dnsMap: make(map[string]string),
hostMap: hostMap,
syncRWMutex: newSyncRWMutex("dns-records"),
dnsMap: make(map[string]string),
hostMap: hostMap,
}
}

Expand Down
4 changes: 2 additions & 2 deletions firewall.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"reflect"
"strconv"
"strings"
"sync"
"time"

"github.com/rcrowley/go-metrics"
Expand Down Expand Up @@ -73,7 +72,7 @@ type firewallMetrics struct {
}

type FirewallConntrack struct {
sync.Mutex
syncMutex

Conns map[firewall.Packet]*conn
TimerWheel *TimerWheel[firewall.Packet]
Expand Down Expand Up @@ -162,6 +161,7 @@ func NewFirewall(l *logrus.Logger, tcpTimeout, UDPTimeout, defaultTimeout time.D

return &Firewall{
Conntrack: &FirewallConntrack{
syncMutex: newSyncMutex("firewall-conntrack"),
Conns: make(map[firewall.Packet]*conn),
TimerWheel: NewTimerWheel[firewall.Packet](min, max),
},
Expand Down
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ require (
dario.cat/mergo v1.0.0
github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be
github.com/armon/go-radix v1.0.0
github.com/clarkmcc/go-dag v0.0.0-20220908000337-9c3ba5b365fc
github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432
github.com/flynn/noise v1.1.0
github.com/gogo/protobuf v1.3.2
Expand All @@ -21,6 +22,7 @@ require (
github.com/skip2/go-qrcode v0.0.0-20200617195104-da1b6568686e
github.com/songgao/water v0.0.0-20200317203138-2b4b6d7c09d8
github.com/stretchr/testify v1.9.0
github.com/timandy/routine v1.1.1
github.com/vishvananda/netlink v1.2.1-beta.2
golang.org/x/crypto v0.23.0
golang.org/x/exp v0.0.0-20230725093048-515e97ebf090
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/clarkmcc/go-dag v0.0.0-20220908000337-9c3ba5b365fc h1:6e91sWiDE69Jl0WUsY/LvTCBPRBe6b2j8H7W96JGJ4s=
github.com/clarkmcc/go-dag v0.0.0-20220908000337-9c3ba5b365fc/go.mod h1:RGIcF96ORCYAsdz60Ou9mPBNa4+DjoQFS8nelPniFoY=
github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432 h1:M5QgkYacWj0Xs8MhpIK/5uwU02icXpEoSo9sM2aRCps=
github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432/go.mod h1:xwIwAxMvYnVrGJPe2FKx5prTrnAjGOD8zvDOnxnrrkM=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down Expand Up @@ -133,6 +135,8 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/timandy/routine v1.1.1 h1:6/Z7qLFZj3GrzuRksBFzIG8YGUh8CLhjnnMePBQTrEI=
github.com/timandy/routine v1.1.1/go.mod h1:OZHPOKSvqL/ZvqXFkNZyit0xIVelERptYXdAHH00adQ=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/vishvananda/netlink v1.2.1-beta.2 h1:Llsql0lnQEbHj0I1OuKyp8otXp0r3q0mPkuhwHfStVs=
Expand Down
2 changes: 2 additions & 0 deletions handshake_ix.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,15 @@ func ixHandshakeStage1(f *Interface, addr *udp.Addr, via *ViaSender, packet []by
}

hostinfo := &HostInfo{
syncRWMutex: newSyncRWMutex("hostinfo"),
ConnectionState: ci,
localIndexId: myIndex,
remoteIndexId: hs.Details.InitiatorIndex,
vpnIp: vpnIp,
HandshakePacket: make(map[uint8][]byte, 0),
lastHandshakeTime: hs.Details.Time,
relayState: RelayState{
syncRWMutex: newSyncRWMutex("relay-state"),
relays: map[iputil.VpnIp]struct{}{},
relayForByIp: map[iputil.VpnIp]*Relay{},
relayForByIdx: map[uint32]*Relay{},
Expand Down
11 changes: 7 additions & 4 deletions handshake_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"encoding/binary"
"errors"
"net"
"sync"
"time"

"github.com/rcrowley/go-metrics"
Expand Down Expand Up @@ -44,7 +43,7 @@ type HandshakeConfig struct {

type HandshakeManager struct {
// Mutex for interacting with the vpnIps and indexes maps
sync.RWMutex
syncRWMutex

vpnIps map[iputil.VpnIp]*HandshakeHostInfo
indexes map[uint32]*HandshakeHostInfo
Expand All @@ -65,7 +64,7 @@ type HandshakeManager struct {
}

type HandshakeHostInfo struct {
sync.Mutex
syncMutex

startTime time.Time // Time that we first started trying with this handshake
ready bool // Is the handshake ready
Expand Down Expand Up @@ -103,14 +102,15 @@ func (hh *HandshakeHostInfo) cachePacket(l *logrus.Logger, t header.MessageType,

func NewHandshakeManager(l *logrus.Logger, mainHostMap *HostMap, lightHouse *LightHouse, outside udp.Conn, config HandshakeConfig) *HandshakeManager {
return &HandshakeManager{
syncRWMutex: newSyncRWMutex("handshake-manager"),
vpnIps: map[iputil.VpnIp]*HandshakeHostInfo{},
indexes: map[uint32]*HandshakeHostInfo{},
mainHostMap: mainHostMap,
lightHouse: lightHouse,
outside: outside,
config: config,
trigger: make(chan iputil.VpnIp, config.triggerBuffer),
OutboundHandshakeTimer: NewLockingTimerWheel[iputil.VpnIp](config.tryInterval, hsTimeout(config.retries, config.tryInterval)),
OutboundHandshakeTimer: NewLockingTimerWheel[iputil.VpnIp]("handshake-manager-timer", config.tryInterval, hsTimeout(config.retries, config.tryInterval)),
messageMetrics: config.messageMetrics,
metricInitiated: metrics.GetOrRegisterCounter("handshake_manager.initiated", nil),
metricTimedOut: metrics.GetOrRegisterCounter("handshake_manager.timed_out", nil),
Expand Down Expand Up @@ -385,16 +385,19 @@ func (hm *HandshakeManager) StartHandshake(vpnIp iputil.VpnIp, cacheCb func(*Han
}

hostinfo := &HostInfo{
syncRWMutex: newSyncRWMutex("hostinfo"),
vpnIp: vpnIp,
HandshakePacket: make(map[uint8][]byte, 0),
relayState: RelayState{
syncRWMutex: newSyncRWMutex("relay-state"),
relays: map[iputil.VpnIp]struct{}{},
relayForByIp: map[iputil.VpnIp]*Relay{},
relayForByIdx: map[uint32]*Relay{},
},
}

hh := &HandshakeHostInfo{
syncMutex: newSyncMutex("handshake-hostinfo"),
hostinfo: hostinfo,
startTime: time.Now(),
}
Expand Down
7 changes: 4 additions & 3 deletions hostmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package nebula
import (
"errors"
"net"
"sync"
"sync/atomic"
"time"

Expand Down Expand Up @@ -53,7 +52,7 @@ type Relay struct {
}

type HostMap struct {
sync.RWMutex //Because we concurrently read and write to our maps
syncRWMutex //Because we concurrently read and write to our maps
Indexes map[uint32]*HostInfo
Relays map[uint32]*HostInfo // Maps a Relay IDX to a Relay HostInfo object
RemoteIndexes map[uint32]*HostInfo
Expand All @@ -67,7 +66,7 @@ type HostMap struct {
// struct, make a copy of an existing value, edit the fileds in the copy, and
// then store a pointer to the new copy in both realyForBy* maps.
type RelayState struct {
sync.RWMutex
syncRWMutex

relays map[iputil.VpnIp]struct{} // Set of VpnIp's of Hosts to use as relays to access this peer
relayForByIp map[iputil.VpnIp]*Relay // Maps VpnIps of peers for which this HostInfo is a relay to some Relay info
Expand Down Expand Up @@ -197,6 +196,7 @@ func (rs *RelayState) InsertRelay(ip iputil.VpnIp, idx uint32, r *Relay) {
}

type HostInfo struct {
syncRWMutex
remote *udp.Addr
remotes *RemoteList
promoteCounter atomic.Uint32
Expand Down Expand Up @@ -271,6 +271,7 @@ func NewHostMapFromConfig(l *logrus.Logger, vpnCIDR *net.IPNet, c *config.C) *Ho

func newHostMap(l *logrus.Logger, vpnCIDR *net.IPNet) *HostMap {
return &HostMap{
syncRWMutex: newSyncRWMutex("hostmap"),
Indexes: map[uint32]*HostInfo{},
Relays: map[uint32]*HostInfo{},
RemoteIndexes: map[uint32]*HostInfo{},
Expand Down
7 changes: 5 additions & 2 deletions lighthouse.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"fmt"
"net"
"net/netip"
"sync"
"sync/atomic"
"time"

Expand All @@ -33,7 +32,7 @@ type netIpAndPort struct {

type LightHouse struct {
//TODO: We need a timer wheel to kick out vpnIps that haven't reported in a long time
sync.RWMutex //Because we concurrently read and write to our maps
syncRWMutex //Because we concurrently read and write to our maps
ctx context.Context
amLighthouse bool
myVpnIp iputil.VpnIp
Expand Down Expand Up @@ -103,6 +102,7 @@ func NewLightHouseFromConfig(ctx context.Context, l *logrus.Logger, c *config.C,

ones, _ := myVpnNet.Mask.Size()
h := LightHouse{
syncRWMutex: newSyncRWMutex("lighthouse"),
ctx: ctx,
amLighthouse: amLighthouse,
myVpnIp: iputil.Ip2VpnIp(myVpnNet.IP),
Expand Down Expand Up @@ -468,6 +468,7 @@ func (lh *LightHouse) QueryServer(ip iputil.VpnIp) {
return
}

chanDebugSend("lighthouse-query-chan")
lh.queryChan <- ip
}

Expand Down Expand Up @@ -750,6 +751,8 @@ func (lh *LightHouse) startQueryWorker() {
nb := make([]byte, 12, 12)
out := make([]byte, mtu)

chanDebugRecv("lighthouse-query-chan")

for {
select {
case <-lh.ctx.Done():
Expand Down
Loading
Loading