From bd656097e7bf63c1afaa699e803f990dc9eb181e Mon Sep 17 00:00:00 2001
From: Charlie Smith
Date: Fri, 5 Apr 2019 12:04:56 -0400
Subject: [PATCH] improvements to avoid potential segmentation fault (#4)

* prevent seg fault when etcd client can't be created in member health check

* retry creating the temp cli to shutdown the server and refactor for cyclo
---
 embetcd/memberInfo.go | 14 +++++--
 embetcd/server.go     | 93 +++++++++++++++++++++++++------------------
 2 files changed, 65 insertions(+), 42 deletions(-)

diff --git a/embetcd/memberInfo.go b/embetcd/memberInfo.go
index 978aad3..633418b 100644
--- a/embetcd/memberInfo.go
+++ b/embetcd/memberInfo.go
@@ -69,9 +69,7 @@ func (m *Members) Get(member *membership.Member) (info *Member) {
 
 	// create the member if it doesn't
 	if !ok {
-		client, _ := NewClient(clientv3.Config{Endpoints: member.ClientURLs})
 		info = &Member{
-			Client:     client,
 			Discovered: time.Now(),
 			LastHealth: time.Time{},
 		}
@@ -80,6 +78,13 @@ func (m *Members) Get(member *membership.Member) (info *Member) {
 		m.members[uint64(member.ID)] = info
 	}
 
+	// Try creating the client to the node if it doesn't exist.
+	// If we get repeated errors creating the client to the member,
+	// then the member will eventually fail its health check and be removed.
+	if info.Client == nil {
+		info.Client, _ = NewClient(clientv3.Config{Endpoints: member.ClientURLs})
+	}
+
 	// update the info with the incoming membership
 	info.Member = member
 
@@ -94,7 +99,10 @@ func (m *Members) Get(member *membership.Member) (info *Member) {
 // Remove a member from the memberInfo list
 func (m *Members) Remove(id uint64) {
 	if member, ok := m.members[id]; ok {
-		member.Client.Close()
+		// only close the client if it exists
+		if member.Client != nil {
+			member.Client.Close()
+		}
 		delete(m.members, id)
 	}
 }
diff --git a/embetcd/server.go b/embetcd/server.go
index 188e9db..d43ca0a 100644
--- a/embetcd/server.go
+++ b/embetcd/server.go
@@ -29,11 +29,11 @@ const (
 	// before they're subject to health checks
 	DefaultStartUpGracePeriod = time.Second * 60
 	// DefaultShutdownTimeout is the default time to wait for the server to shutdown cleanly
-	DefaultShutdownTimeout = time.Minute * 1
+	DefaultShutdownTimeout = time.Second * 60
 	// DefaultDialTimeout is the default etcd dial timeout
 	DefaultDialTimeout = time.Second * 5
 	// DefaultAutoSyncInterval is the default etcd autosync interval
-	DefaultAutoSyncInterval = time.Second * 1
+	DefaultAutoSyncInterval = time.Second * 5
 )
 
 // setupClusterNamespace configures the client with the EtcdClusterNamespace prefix
@@ -535,20 +535,10 @@ func (s *Server) waitForShutdown(ctx context.Context, done chan struct{}) (err e
 	return
 }
 
-// removeSelfFromCluster removes this server from it's cluster
-func (s *Server) removeSelfFromCluster(ctx context.Context) (err error) {
-	members := s.Server.Cluster().Members()
-
-	if len(members) > 1 {
-		endpoints := make([]string, 0, len(members))
-		for _, member := range members {
-			if len(member.ClientURLs) > 0 {
-				endpoints = append(endpoints, member.ClientURLs...)
-			}
-		}
-
-		// use a temporary client to try removing ourselves from the cluster
-		var tempcli *Client
+// getNamespacedClient returns an etcd client with the cluster's namespace applied
+func (s *Server) getNamespacedClient(ctx context.Context, endpoints []string) (tempcli *Client, err error) {
+	for ctx.Err() == nil {
+		// try to create a client to the cluster
 		tempcli, err = NewClient(cli.Config{
 			Endpoints:        endpoints,
 			DialTimeout:      DurationOrDefault(s.config.DialTimeout, DefaultDialTimeout),
@@ -557,38 +547,63 @@ func (s *Server) removeSelfFromCluster(ctx context.Context) (err error) {
 			Context:          ctx,
 		})
 
+		// setup the cluster namespace if the client was created successfully
 		if err == nil {
 			setupClusterNamespace(tempcli)
-			defer tempcli.Close()
+			break
 		}
+	}
+	return tempcli, err
+}
 
-		// loop while the context hasn't closed
-		for ctx.Err() == nil {
-
-			// create a child context with its own timeout
-			timeout, cancel := context.WithTimeout(ctx, DurationOrDefault(s.config.DialTimeout, DefaultDialTimeout))
-
-			// use the temporary client to try removing ourselves from the cluster
-			var unlock func(context.Context) error
-			if unlock, err = tempcli.Lock(timeout, s.Server.Cfg.Name); err == nil {
-				_, err = tempcli.MemberRemove(ctx, uint64(s.Server.ID()))
-				unlock(timeout)
-				// mask the member not found err because it could mean a cluster clean up routine cleaned us up already
-				if err == nil || err == rpctypes.ErrMemberNotFound {
-					err = nil
-					cancel()
-					break
-				}
-			}
+// removeSelfFromCluster removes this server from its cluster
+func (s *Server) removeSelfFromCluster(ctx context.Context) (err error) {
+	members := s.Server.Cluster().Members()
+	// just return if we're the last member, or if somehow there are no members
+	if len(members) < 2 {
+		return err
+	}
+
+	endpoints := make([]string, 0, len(members))
+	for _, member := range members {
+		if len(member.ClientURLs) > 0 {
+			endpoints = append(endpoints, member.ClientURLs...)
+		}
+	}
 
-			// wait for the until timeout to try again
-			<-timeout.Done()
+	// use a temporary client to try removing ourselves from the cluster
+	var tempcli *Client
+	if tempcli, err = s.getNamespacedClient(ctx, endpoints); tempcli != nil {
+		defer tempcli.Close()
+	}
 
-			// cancel the timeout context
-			cancel()
+	// loop while the context hasn't closed
+	for ctx.Err() == nil {
+		// create a child context with its own timeout
+		timeout, cancel := context.WithTimeout(ctx, DurationOrDefault(s.config.DialTimeout, DefaultDialTimeout))
+
+		// use the temporary client to try removing ourselves from the cluster
+		var unlock func(context.Context) error
+		if unlock, err = tempcli.Lock(timeout, s.Server.Cfg.Name); err == nil {
+			_, err = tempcli.MemberRemove(ctx, uint64(s.Server.ID()))
+			unlock(timeout)
+			// mask the member not found err because it could mean a cluster cleanup routine cleaned us up already
+			if err == nil || err == rpctypes.ErrMemberNotFound {
+				err = nil
+				cancel()
+				break
+			}
 		}
+
+		// wait until the timeout expires before trying again
+		<-timeout.Done()
+
+		// cancel the timeout context
+		cancel()
+	}
+	return err
 }
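
The sketch below illustrates, outside the embetcd wrappers, the same create-with-retry and nil-guard pattern this patch applies, written directly against the upstream clientv3 package; the import path, endpoint URL, timeout values, and the newClientWithRetry helper name are illustrative assumptions and are not part of this repository.

package main

import (
	"context"
	"log"
	"time"

	"go.etcd.io/etcd/clientv3"
)

// newClientWithRetry keeps trying to create an etcd client until it succeeds or
// the parent context is cancelled, so callers never dereference a nil client.
func newClientWithRetry(ctx context.Context, endpoints []string) (*clientv3.Client, error) {
	for ctx.Err() == nil {
		c, err := clientv3.New(clientv3.Config{
			Endpoints:   endpoints,
			DialTimeout: 5 * time.Second,
			Context:     ctx,
		})
		if err == nil {
			return c, nil
		}
		// brief pause so a hard failure doesn't turn into a busy loop
		time.Sleep(time.Second)
	}
	return nil, ctx.Err()
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	c, err := newClientWithRetry(ctx, []string{"http://127.0.0.1:2379"})
	if err != nil {
		log.Printf("could not create etcd client: %v", err)
		return
	}
	// guard Close behind a nil check, mirroring the guard added to Members.Remove above
	if c != nil {
		defer c.Close()
	}
}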