Skip to content

Commit

Permalink
improvements to avoid potential segmentation fault (#4)
Browse files Browse the repository at this point in the history
* prevent seg fault when etcd client can't be created in member health check

* retry creating the temp cli to shutdown the server and refactor for cyclo
  • Loading branch information
charless-splunk authored Apr 5, 2019
1 parent b944585 commit bd65609
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 42 deletions.
14 changes: 11 additions & 3 deletions embetcd/memberInfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,7 @@ func (m *Members) Get(member *membership.Member) (info *Member) {

// create the member if it doesn't
if !ok {
client, _ := NewClient(clientv3.Config{Endpoints: member.ClientURLs})
info = &Member{
Client: client,
Discovered: time.Now(),
LastHealth: time.Time{},
}
Expand All @@ -80,6 +78,13 @@ func (m *Members) Get(member *membership.Member) (info *Member) {
m.members[uint64(member.ID)] = info
}

// Try creating the client to the node if it doesn't exist
// If we get repeated errors creating the client to the member,
// then the member will eventually fail it's health check and be removed.
if info.Client == nil {
info.Client, _ = NewClient(clientv3.Config{Endpoints: member.ClientURLs})
}

// update the info with the incoming membership
info.Member = member

Expand All @@ -94,7 +99,10 @@ func (m *Members) Get(member *membership.Member) (info *Member) {
// Remove a member from the memberInfo list
func (m *Members) Remove(id uint64) {
if member, ok := m.members[id]; ok {
member.Client.Close()
// only close the client if it exists
if member.Client != nil {
member.Client.Close()
}
delete(m.members, id)
}
}
Expand Down
93 changes: 54 additions & 39 deletions embetcd/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ const (
// before they're subject to health checks
DefaultStartUpGracePeriod = time.Second * 60
// DefaultShutdownTimeout is the default time to wait for the server to shutdown cleanly
DefaultShutdownTimeout = time.Minute * 1
DefaultShutdownTimeout = time.Second * 60
// DefaultDialTimeout is the default etcd dial timeout
DefaultDialTimeout = time.Second * 5
// DefaultAutoSyncInterval is the default etcd autosync interval
DefaultAutoSyncInterval = time.Second * 1
DefaultAutoSyncInterval = time.Second * 5
)

// setupClusterNamespace configures the client with the EtcdClusterNamespace prefix
Expand Down Expand Up @@ -535,20 +535,10 @@ func (s *Server) waitForShutdown(ctx context.Context, done chan struct{}) (err e
return
}

// removeSelfFromCluster removes this server from it's cluster
func (s *Server) removeSelfFromCluster(ctx context.Context) (err error) {
members := s.Server.Cluster().Members()

if len(members) > 1 {
endpoints := make([]string, 0, len(members))
for _, member := range members {
if len(member.ClientURLs) > 0 {
endpoints = append(endpoints, member.ClientURLs...)
}
}

// use a temporary client to try removing ourselves from the cluster
var tempcli *Client
//getNamespacedClient returns an etcd client with the cluster's namespace
func (s *Server) getNamespacedClient(ctx context.Context, endpoints []string) (tempcli *Client, err error) {
for ctx.Err() == nil {
// try to create a client to the cluster to remove ourselves
tempcli, err = NewClient(cli.Config{
Endpoints: endpoints,
DialTimeout: DurationOrDefault(s.config.DialTimeout, DefaultDialTimeout),
Expand All @@ -557,38 +547,63 @@ func (s *Server) removeSelfFromCluster(ctx context.Context) (err error) {
Context: ctx,
})

// setup the cluster namespace if the client was created successfully
if err == nil {
setupClusterNamespace(tempcli)
defer tempcli.Close()
break
}
}
return tempcli, err
}

// loop while the context hasn't closed
for ctx.Err() == nil {

// create a child context with its own timeout
timeout, cancel := context.WithTimeout(ctx, DurationOrDefault(s.config.DialTimeout, DefaultDialTimeout))

// use the temporary client to try removing ourselves from the cluster
var unlock func(context.Context) error
if unlock, err = tempcli.Lock(timeout, s.Server.Cfg.Name); err == nil {
_, err = tempcli.MemberRemove(ctx, uint64(s.Server.ID()))
unlock(timeout)
// mask the member not found err because it could mean a cluster clean up routine cleaned us up already
if err == nil || err == rpctypes.ErrMemberNotFound {
err = nil
cancel()
break
}
}
// removeSelfFromCluster removes this server from it's cluster
func (s *Server) removeSelfFromCluster(ctx context.Context) (err error) {
members := s.Server.Cluster().Members()
// just return if we're the last member ...or somehow there are no members
if len(members) < 2 {
return err
}

endpoints := make([]string, 0, len(members))
for _, member := range members {
if len(member.ClientURLs) > 0 {
endpoints = append(endpoints, member.ClientURLs...)
}
}

// wait for the until timeout to try again
<-timeout.Done()
// use a temporary client to try removing ourselves from the cluster
var tempcli *Client
if tempcli, err = s.getNamespacedClient(ctx, endpoints); tempcli != nil {
defer tempcli.Close()
}

// cancel the timeout context
cancel()
// loop while the context hasn't closed
for ctx.Err() == nil {

// create a child context with its own timeout
timeout, cancel := context.WithTimeout(ctx, DurationOrDefault(s.config.DialTimeout, DefaultDialTimeout))

// use the temporary client to try removing ourselves from the cluster
var unlock func(context.Context) error
if unlock, err = tempcli.Lock(timeout, s.Server.Cfg.Name); err == nil {
_, err = tempcli.MemberRemove(ctx, uint64(s.Server.ID()))
unlock(timeout)
// mask the member not found err because it could mean a cluster clean up routine cleaned us up already
if err == nil || err == rpctypes.ErrMemberNotFound {
err = nil
cancel()
break
}
}

// wait for the until timeout to try again
<-timeout.Done()

// cancel the timeout context
cancel()

}

return err
}

Expand Down

0 comments on commit bd65609

Please sign in to comment.