@@ -31,7 +31,6 @@ import (
31
31
"golang.org/x/exp/slices"
32
32
"golang.org/x/sync/errgroup"
33
33
"golang.org/x/xerrors"
34
- "storj.io/drpc"
35
34
"tailscale.com/net/speedtest"
36
35
"tailscale.com/tailcfg"
37
36
"tailscale.com/types/netlogtype"
@@ -94,7 +93,9 @@ type Options struct {
94
93
}
95
94
96
95
type Client interface {
97
- ConnectRPC (ctx context.Context ) (drpc.Conn , error )
96
+ ConnectRPC23 (ctx context.Context ) (
97
+ proto.DRPCAgentClient23 , tailnetproto.DRPCTailnetClient23 , error ,
98
+ )
98
99
RewriteDERPMap (derpMap * tailcfg.DERPMap )
99
100
}
100
101
@@ -410,7 +411,7 @@ func (t *trySingleflight) Do(key string, fn func()) {
410
411
fn ()
411
412
}
412
413
413
- func (a * agent ) reportMetadata (ctx context.Context , conn drpc. Conn ) error {
414
+ func (a * agent ) reportMetadata (ctx context.Context , aAPI proto. DRPCAgentClient23 ) error {
414
415
tickerDone := make (chan struct {})
415
416
collectDone := make (chan struct {})
416
417
ctx , cancel := context .WithCancel (ctx )
@@ -572,7 +573,6 @@ func (a *agent) reportMetadata(ctx context.Context, conn drpc.Conn) error {
572
573
reportTimeout = 30 * time .Second
573
574
reportError = make (chan error , 1 )
574
575
reportInFlight = false
575
- aAPI = proto .NewDRPCAgentClient (conn )
576
576
)
577
577
578
578
for {
@@ -627,8 +627,7 @@ func (a *agent) reportMetadata(ctx context.Context, conn drpc.Conn) error {
627
627
628
628
// reportLifecycle reports the current lifecycle state once. All state
629
629
// changes are reported in order.
630
- func (a * agent ) reportLifecycle (ctx context.Context , conn drpc.Conn ) error {
631
- aAPI := proto .NewDRPCAgentClient (conn )
630
+ func (a * agent ) reportLifecycle (ctx context.Context , aAPI proto.DRPCAgentClient23 ) error {
632
631
for {
633
632
select {
634
633
case <- a .lifecycleUpdate :
@@ -710,8 +709,7 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
710
709
// fetchServiceBannerLoop fetches the service banner on an interval. It will
711
710
// not be fetched immediately; the expectation is that it is primed elsewhere
712
711
// (and must be done before the session actually starts).
713
- func (a * agent ) fetchServiceBannerLoop (ctx context.Context , conn drpc.Conn ) error {
714
- aAPI := proto .NewDRPCAgentClient (conn )
712
+ func (a * agent ) fetchServiceBannerLoop (ctx context.Context , aAPI proto.DRPCAgentClient23 ) error {
715
713
ticker := time .NewTicker (a .announcementBannersRefreshInterval )
716
714
defer ticker .Stop ()
717
715
for {
@@ -737,7 +735,7 @@ func (a *agent) fetchServiceBannerLoop(ctx context.Context, conn drpc.Conn) erro
737
735
}
738
736
739
737
func (a * agent ) run () (retErr error ) {
740
- // This allows the agent to refresh it's token if necessary.
738
+ // This allows the agent to refresh its token if necessary.
741
739
// For instance identity this is required, since the instance
742
740
// may not have re-provisioned, but a new agent ID was created.
743
741
sessionToken , err := a .exchangeToken (a .hardCtx )
@@ -747,12 +745,12 @@ func (a *agent) run() (retErr error) {
747
745
a .sessionToken .Store (& sessionToken )
748
746
749
747
// ConnectRPC23 returns the dRPC clients we use for the Agent and Tailnet v2+ APIs
750
- conn , err := a .client .ConnectRPC (a .hardCtx )
748
+ aAPI , tAPI , err := a .client .ConnectRPC23 (a .hardCtx )
751
749
if err != nil {
752
750
return err
753
751
}
754
752
defer func () {
755
- cErr := conn .Close ()
753
+ cErr := aAPI . DRPCConn () .Close ()
756
754
if cErr != nil {
757
755
// Log the error returned by Close (cErr), not the enclosing function's err.
a.logger.Debug(a.hardCtx, "error closing drpc connection", slog.Error(cErr))
758
756
}
@@ -761,11 +759,10 @@ func (a *agent) run() (retErr error) {
761
759
// A lot of routines need the agent API / tailnet API connection. We run them in their own
762
760
// goroutines in parallel, but errors in any routine will cause them all to exit so we can
763
761
// redial the coder server and retry.
764
- connMan := newAPIConnRoutineManager (a .gracefulCtx , a .hardCtx , a .logger , conn )
762
+ connMan := newAPIConnRoutineManager (a .gracefulCtx , a .hardCtx , a .logger , aAPI , tAPI )
765
763
766
- connMan .start ("init notification banners" , gracefulShutdownBehaviorStop ,
767
- func (ctx context.Context , conn drpc.Conn ) error {
768
- aAPI := proto .NewDRPCAgentClient (conn )
764
+ connMan .startAgentAPI ("init notification banners" , gracefulShutdownBehaviorStop ,
765
+ func (ctx context.Context , aAPI proto.DRPCAgentClient23 ) error {
769
766
bannersProto , err := aAPI .GetAnnouncementBanners (ctx , & proto.GetAnnouncementBannersRequest {})
770
767
if err != nil {
771
768
return xerrors .Errorf ("fetch service banner: %w" , err )
@@ -781,9 +778,9 @@ func (a *agent) run() (retErr error) {
781
778
782
779
// sending logs gets gracefulShutdownBehaviorRemain because we want to send logs generated by
783
780
// shutdown scripts.
784
- connMan .start ("send logs" , gracefulShutdownBehaviorRemain ,
785
- func (ctx context.Context , conn drpc. Conn ) error {
786
- err := a .logSender .SendLoop (ctx , proto . NewDRPCAgentClient ( conn ) )
781
+ connMan .startAgentAPI ("send logs" , gracefulShutdownBehaviorRemain ,
782
+ func (ctx context.Context , aAPI proto. DRPCAgentClient23 ) error {
783
+ err := a .logSender .SendLoop (ctx , aAPI )
787
784
if xerrors .Is (err , agentsdk .LogLimitExceededError ) {
788
785
// we don't want this error to tear down the API connection and propagate to the
789
786
// other routines that use the API. The LogSender has already dropped a warning
@@ -795,10 +792,10 @@ func (a *agent) run() (retErr error) {
795
792
796
793
// part of graceful shut down is reporting the final lifecycle states, e.g "ShuttingDown" so the
797
794
// lifecycle reporting has to be via gracefulShutdownBehaviorRemain
798
- connMan .start ("report lifecycle" , gracefulShutdownBehaviorRemain , a .reportLifecycle )
795
+ connMan .startAgentAPI ("report lifecycle" , gracefulShutdownBehaviorRemain , a .reportLifecycle )
799
796
800
797
// metadata reporting can cease as soon as we start gracefully shutting down
801
- connMan .start ("report metadata" , gracefulShutdownBehaviorStop , a .reportMetadata )
798
+ connMan .startAgentAPI ("report metadata" , gracefulShutdownBehaviorStop , a .reportMetadata )
802
799
803
800
// channels to sync goroutines below
804
801
// handle manifest
@@ -819,55 +816,55 @@ func (a *agent) run() (retErr error) {
819
816
networkOK := newCheckpoint (a .logger )
820
817
manifestOK := newCheckpoint (a .logger )
821
818
822
- connMan .start ("handle manifest" , gracefulShutdownBehaviorStop , a .handleManifest (manifestOK ))
819
+ connMan .startAgentAPI ("handle manifest" , gracefulShutdownBehaviorStop , a .handleManifest (manifestOK ))
823
820
824
- connMan .start ("app health reporter" , gracefulShutdownBehaviorStop ,
825
- func (ctx context.Context , conn drpc. Conn ) error {
821
+ connMan .startAgentAPI ("app health reporter" , gracefulShutdownBehaviorStop ,
822
+ func (ctx context.Context , aAPI proto. DRPCAgentClient23 ) error {
826
823
if err := manifestOK .wait (ctx ); err != nil {
827
824
return xerrors .Errorf ("no manifest: %w" , err )
828
825
}
829
826
manifest := a .manifest .Load ()
830
827
NewWorkspaceAppHealthReporter (
831
- a .logger , manifest .Apps , agentsdk .AppHealthPoster (proto . NewDRPCAgentClient ( conn ) ),
828
+ a .logger , manifest .Apps , agentsdk .AppHealthPoster (aAPI ),
832
829
)(ctx )
833
830
return nil
834
831
})
835
832
836
- connMan .start ("create or update network" , gracefulShutdownBehaviorStop ,
833
+ connMan .startAgentAPI ("create or update network" , gracefulShutdownBehaviorStop ,
837
834
a .createOrUpdateNetwork (manifestOK , networkOK ))
838
835
839
- connMan .start ("coordination" , gracefulShutdownBehaviorStop ,
840
- func (ctx context.Context , conn drpc. Conn ) error {
836
+ connMan .startTailnetAPI ("coordination" , gracefulShutdownBehaviorStop ,
837
+ func (ctx context.Context , tAPI tailnetproto. DRPCTailnetClient23 ) error {
841
838
if err := networkOK .wait (ctx ); err != nil {
842
839
return xerrors .Errorf ("no network: %w" , err )
843
840
}
844
- return a .runCoordinator (ctx , conn , a .network )
841
+ return a .runCoordinator (ctx , tAPI , a .network )
845
842
},
846
843
)
847
844
848
- connMan .start ("derp map subscriber" , gracefulShutdownBehaviorStop ,
849
- func (ctx context.Context , conn drpc. Conn ) error {
845
+ connMan .startTailnetAPI ("derp map subscriber" , gracefulShutdownBehaviorStop ,
846
+ func (ctx context.Context , tAPI tailnetproto. DRPCTailnetClient23 ) error {
850
847
if err := networkOK .wait (ctx ); err != nil {
851
848
return xerrors .Errorf ("no network: %w" , err )
852
849
}
853
- return a .runDERPMapSubscriber (ctx , conn , a .network )
850
+ return a .runDERPMapSubscriber (ctx , tAPI , a .network )
854
851
})
855
852
856
- connMan .start ("fetch service banner loop" , gracefulShutdownBehaviorStop , a .fetchServiceBannerLoop )
853
+ connMan .startAgentAPI ("fetch service banner loop" , gracefulShutdownBehaviorStop , a .fetchServiceBannerLoop )
857
854
858
- connMan .start ("stats report loop" , gracefulShutdownBehaviorStop , func (ctx context.Context , conn drpc. Conn ) error {
855
+ connMan .startAgentAPI ("stats report loop" , gracefulShutdownBehaviorStop , func (ctx context.Context , aAPI proto. DRPCAgentClient23 ) error {
859
856
if err := networkOK .wait (ctx ); err != nil {
860
857
return xerrors .Errorf ("no network: %w" , err )
861
858
}
862
- return a .statsReporter .reportLoop (ctx , proto . NewDRPCAgentClient ( conn ) )
859
+ return a .statsReporter .reportLoop (ctx , aAPI )
863
860
})
864
861
865
862
return connMan .wait ()
866
863
}
867
864
868
865
// handleManifest returns a function that fetches and processes the manifest
869
- func (a * agent ) handleManifest (manifestOK * checkpoint ) func (ctx context.Context , conn drpc. Conn ) error {
870
- return func (ctx context.Context , conn drpc. Conn ) error {
866
+ func (a * agent ) handleManifest (manifestOK * checkpoint ) func (ctx context.Context , aAPI proto. DRPCAgentClient23 ) error {
867
+ return func (ctx context.Context , aAPI proto. DRPCAgentClient23 ) error {
871
868
var (
872
869
sentResult = false
873
870
err error
@@ -877,7 +874,6 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
877
874
manifestOK .complete (err )
878
875
}
879
876
}()
880
- aAPI := proto .NewDRPCAgentClient (conn )
881
877
mp , err := aAPI .GetManifest (ctx , & proto.GetManifestRequest {})
882
878
if err != nil {
883
879
return xerrors .Errorf ("fetch metadata: %w" , err )
@@ -977,8 +973,8 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
977
973
978
974
// createOrUpdateNetwork waits for the manifest to be set using manifestOK, then creates or updates
979
975
// the tailnet using the information in the manifest
980
- func (a * agent ) createOrUpdateNetwork (manifestOK , networkOK * checkpoint ) func (context.Context , drpc. Conn ) error {
981
- return func (ctx context.Context , _ drpc. Conn ) (retErr error ) {
976
+ func (a * agent ) createOrUpdateNetwork (manifestOK , networkOK * checkpoint ) func (context.Context , proto. DRPCAgentClient23 ) error {
977
+ return func (ctx context.Context , _ proto. DRPCAgentClient23 ) (retErr error ) {
982
978
if err := manifestOK .wait (ctx ); err != nil {
983
979
return xerrors .Errorf ("no manifest: %w" , err )
984
980
}
@@ -1325,9 +1321,8 @@ func (a *agent) createTailnet(ctx context.Context, agentID uuid.UUID, derpMap *t
1325
1321
1326
1322
// runCoordinator runs a coordinator and returns whether a reconnect
1327
1323
// should occur.
1328
- func (a * agent ) runCoordinator (ctx context.Context , conn drpc. Conn , network * tailnet.Conn ) error {
1324
+ func (a * agent ) runCoordinator (ctx context.Context , tClient tailnetproto. DRPCTailnetClient23 , network * tailnet.Conn ) error {
1329
1325
defer a .logger .Debug (ctx , "disconnected from coordination RPC" )
1330
- tClient := tailnetproto .NewDRPCTailnetClient (conn )
1331
1326
// we run the RPC on the hardCtx so that we have a chance to send the disconnect message if we
1332
1327
// gracefully shut down.
1333
1328
coordinate , err := tClient .Coordinate (a .hardCtx )
@@ -1373,11 +1368,10 @@ func (a *agent) runCoordinator(ctx context.Context, conn drpc.Conn, network *tai
1373
1368
}
1374
1369
1375
1370
// runDERPMapSubscriber streams DERP map updates over the Tailnet API and returns if a reconnect should occur.
1376
- func (a * agent ) runDERPMapSubscriber (ctx context.Context , conn drpc. Conn , network * tailnet.Conn ) error {
1371
+ func (a * agent ) runDERPMapSubscriber (ctx context.Context , tClient tailnetproto. DRPCTailnetClient23 , network * tailnet.Conn ) error {
1377
1372
defer a .logger .Debug (ctx , "disconnected from derp map RPC" )
1378
1373
ctx , cancel := context .WithCancel (ctx )
1379
1374
defer cancel ()
1380
- tClient := tailnetproto .NewDRPCTailnetClient (conn )
1381
1375
stream , err := tClient .StreamDERPMaps (ctx , & tailnetproto.StreamDERPMapsRequest {})
1382
1376
if err != nil {
1383
1377
return xerrors .Errorf ("stream DERP Maps: %w" , err )
@@ -1981,13 +1975,17 @@ const (
1981
1975
1982
1976
type apiConnRoutineManager struct {
1983
1977
logger slog.Logger
1984
- conn drpc.Conn
1978
+ aAPI proto.DRPCAgentClient23
1979
+ tAPI tailnetproto.DRPCTailnetClient23
1985
1980
eg * errgroup.Group
1986
1981
stopCtx context.Context
1987
1982
remainCtx context.Context
1988
1983
}
1989
1984
1990
- func newAPIConnRoutineManager (gracefulCtx , hardCtx context.Context , logger slog.Logger , conn drpc.Conn ) * apiConnRoutineManager {
1985
+ func newAPIConnRoutineManager (
1986
+ gracefulCtx , hardCtx context.Context , logger slog.Logger ,
1987
+ aAPI proto.DRPCAgentClient23 , tAPI tailnetproto.DRPCTailnetClient23 ,
1988
+ ) * apiConnRoutineManager {
1991
1989
// routines that remain in operation during graceful shutdown use the remainCtx. They'll still
1992
1990
// exit if the errgroup hits an error, which usually means a problem with the conn.
1993
1991
eg , remainCtx := errgroup .WithContext (hardCtx )
@@ -2007,17 +2005,60 @@ func newAPIConnRoutineManager(gracefulCtx, hardCtx context.Context, logger slog.
2007
2005
stopCtx := eitherContext (remainCtx , gracefulCtx )
2008
2006
return & apiConnRoutineManager {
2009
2007
logger : logger ,
2010
- conn : conn ,
2008
+ aAPI : aAPI ,
2009
+ tAPI : tAPI ,
2011
2010
eg : eg ,
2012
2011
stopCtx : stopCtx ,
2013
2012
remainCtx : remainCtx ,
2014
2013
}
2015
2014
}
2016
2015
2017
- func (a * apiConnRoutineManager ) start (name string , b gracefulShutdownBehavior , f func (context.Context , drpc.Conn ) error ) {
2016
+ // startAgentAPI starts a routine that uses the Agent API. cf. startTailnetAPI, which is the same
2017
+ // but for Tailnet.
2018
+ func (a * apiConnRoutineManager ) startAgentAPI (
2019
+ name string , behavior gracefulShutdownBehavior ,
2020
+ f func (context.Context , proto.DRPCAgentClient23 ) error ,
2021
+ ) {
2022
+ logger := a .logger .With (slog .F ("name" , name ))
2023
+ var ctx context.Context
2024
+ switch behavior {
2025
+ case gracefulShutdownBehaviorStop :
2026
+ ctx = a .stopCtx
2027
+ case gracefulShutdownBehaviorRemain :
2028
+ ctx = a .remainCtx
2029
+ default :
2030
+ panic ("unknown behavior" )
2031
+ }
2032
+ a .eg .Go (func () error {
2033
+ logger .Debug (ctx , "starting agent routine" )
2034
+ err := f (ctx , a .aAPI )
2035
+ if xerrors .Is (err , context .Canceled ) && ctx .Err () != nil {
2036
+ logger .Debug (ctx , "swallowing context canceled" )
2037
+ // Don't propagate context canceled errors to the error group, because we don't want the
2038
+ // graceful context being canceled to halt the work of routines with
2039
+ // gracefulShutdownBehaviorRemain. Note that we check both that the error is
2040
+ // context.Canceled and that *our* context is currently canceled, because when Coderd
2041
+ // unilaterally closes the API connection (for example if the build is outdated), it can
2042
+ // sometimes show up as context.Canceled in our RPC calls.
2043
+ return nil
2044
+ }
2045
+ logger .Debug (ctx , "routine exited" , slog .Error (err ))
2046
+ if err != nil {
2047
+ return xerrors .Errorf ("error in routine %s: %w" , name , err )
2048
+ }
2049
+ return nil
2050
+ })
2051
+ }
2052
+
2053
+ // startTailnetAPI starts a routine that uses the Tailnet API. cf. startAgentAPI, which is the same
2054
+ // but for the Agent API.
2055
+ func (a * apiConnRoutineManager ) startTailnetAPI (
2056
+ name string , behavior gracefulShutdownBehavior ,
2057
+ f func (context.Context , tailnetproto.DRPCTailnetClient23 ) error ,
2058
+ ) {
2018
2059
logger := a .logger .With (slog .F ("name" , name ))
2019
2060
var ctx context.Context
2020
- switch b {
2061
+ switch behavior {
2021
2062
case gracefulShutdownBehaviorStop :
2022
2063
ctx = a .stopCtx
2023
2064
case gracefulShutdownBehaviorRemain :
@@ -2026,8 +2067,8 @@ func (a *apiConnRoutineManager) start(name string, b gracefulShutdownBehavior, f
2026
2067
panic ("unknown behavior" )
2027
2068
}
2028
2069
a .eg .Go (func () error {
2029
- logger .Debug (ctx , "starting routine" )
2030
- err := f (ctx , a .conn )
2070
+ logger .Debug (ctx , "starting tailnet routine" )
2071
+ err := f (ctx , a .tAPI )
2031
2072
if xerrors .Is (err , context .Canceled ) && ctx .Err () != nil {
2032
2073
logger .Debug (ctx , "swallowing context canceled" )
2033
2074
// Don't propagate context canceled errors to the error group, because we don't want the
0 commit comments