From fa37bf5a423121ba60f7d3c6f8136f02447a602c Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 3 Oct 2024 12:56:04 +0200 Subject: [PATCH 01/20] feat: PRT Block Hash Retry Archive --- protocol/chainlib/chain_message.go | 4 + protocol/chainlib/chainlib.go | 1 + protocol/chainlib/chainlib_mock.go | 8 ++ .../lavasession/consumer_session_manager.go | 25 ++-- .../consumer_session_manager_test.go | 50 +++---- protocol/lavasession/consumer_types.go | 9 +- .../end_to_end_lavasession_test.go | 8 +- protocol/lavasession/router_key.go | 25 +++- .../lavasession/single_consumer_session.go | 19 +-- protocol/lavasession/used_providers.go | 135 ++++++++++++------ protocol/lavasession/used_providers_test.go | 27 ++-- .../consumer_relay_state_machine.go | 30 +++- .../consumer_relay_state_machine_test.go | 11 +- protocol/rpcconsumer/relay_processor.go | 10 +- protocol/rpcconsumer/relay_processor_test.go | 50 +++---- protocol/rpcconsumer/rpcconsumer_server.go | 17 +-- 16 files changed, 264 insertions(+), 165 deletions(-) diff --git a/protocol/chainlib/chain_message.go b/protocol/chainlib/chain_message.go index 66f89c2555..dba23733cc 100644 --- a/protocol/chainlib/chain_message.go +++ b/protocol/chainlib/chain_message.go @@ -38,6 +38,10 @@ type baseChainMessageContainer struct { resultErrorParsingMethod func(data []byte, httpStatusCode int) (hasError bool, errorMessage string) } +func (bcnc *baseChainMessageContainer) GetRequestedBlocksHashes() []string { + return bcnc.requestedBlockHashes +} + func (bcnc *baseChainMessageContainer) SubscriptionIdExtractor(reply *rpcclient.JsonrpcMessage) string { return bcnc.msg.SubscriptionIdExtractor(reply) } diff --git a/protocol/chainlib/chainlib.go b/protocol/chainlib/chainlib.go index 44b8184405..5fe487a317 100644 --- a/protocol/chainlib/chainlib.go +++ b/protocol/chainlib/chainlib.go @@ -88,6 +88,7 @@ type ChainMessage interface { SetForceCacheRefresh(force bool) bool CheckResponseError(data []byte, httpStatusCode int) (hasError bool, errorMessage string) GetRawRequestHash() ([]byte, error) + GetRequestedBlocksHashes() []string ChainMessageForSend } diff --git a/protocol/chainlib/chainlib_mock.go b/protocol/chainlib/chainlib_mock.go index 8ba6c64410..6ee05a7f5c 100644 --- a/protocol/chainlib/chainlib_mock.go +++ b/protocol/chainlib/chainlib_mock.go @@ -282,6 +282,14 @@ func (m *MockChainMessage) EXPECT() *MockChainMessageMockRecorder { return m.recorder } +// AppendHeader mocks base method. +func (m *MockChainMessage) GetRequestedBlocksHashes() []string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetRequestedBlocksHashes") + ret0, _ := ret[0].([]string) + return ret0 +} + // AppendHeader mocks base method. 
func (m *MockChainMessage) AppendHeader(metadata []types.Metadata) { m.ctrl.T.Helper() diff --git a/protocol/lavasession/consumer_session_manager.go b/protocol/lavasession/consumer_session_manager.go index 7fef3af42e..bac3bc324e 100644 --- a/protocol/lavasession/consumer_session_manager.go +++ b/protocol/lavasession/consumer_session_manager.go @@ -431,8 +431,8 @@ func (csm *ConsumerSessionManager) GetSessions(ctx context.Context, cuNeededForS } return nil, utils.LavaFormatError("failed getting sessions from used Providers", nil, utils.LogAttr("usedProviders", usedProviders), utils.LogAttr("endpoint", csm.rpcEndpoint)) } - defer func() { usedProviders.AddUsed(consumerSessionMap, errRet) }() - initUnwantedProviders := usedProviders.GetUnwantedProvidersToSend() + defer func() { usedProviders.AddUsed(consumerSessionMap, extensions, errRet) }() + initUnwantedProviders := usedProviders.GetUnwantedProvidersToSend(extensions) extensionNames := common.GetExtensionNames(extensions) // if pairing list is empty we reset the state. @@ -528,7 +528,7 @@ func (csm *ConsumerSessionManager) GetSessions(ctx context.Context, cuNeededForS if MaxComputeUnitsExceededError.Is(err) { tempIgnoredProviders.providers[providerAddress] = struct{}{} // We must unlock the consumer session before continuing. - consumerSession.Free(nil) + consumerSession.Free(nil, extensions) continue } else { utils.LavaFormatFatal("Unsupported Error", err) @@ -696,7 +696,7 @@ func (csm *ConsumerSessionManager) tryGetConsumerSessionWithProviderFromBlockedP consumerSessionsWithProvider := csm.pairing[providerAddress] // Add to ignored (no matter what) ignoredProviders.providers[providerAddress] = struct{}{} - usedProviders.AddUnwantedAddresses(providerAddress) // add the address to our unwanted providers to avoid infinite recursion + usedProviders.AddUnwantedAddresses(providerAddress, extensions) // add the address to our unwanted providers to avoid infinite recursion // validate this provider has enough cu to be used if err := consumerSessionsWithProvider.validateComputeUnits(cuNeededForSession, virtualEpoch); err != nil { @@ -893,9 +893,9 @@ func (csm *ConsumerSessionManager) blockProvider(address string, reportProvider } // Report session failure, mark it as blocked from future usages, report if timeout happened. -func (csm *ConsumerSessionManager) OnSessionFailure(consumerSession *SingleConsumerSession, errorReceived error) error { +func (csm *ConsumerSessionManager) OnSessionFailure(consumerSession *SingleConsumerSession, errorReceived error, extensions []*spectypes.Extension) error { // consumerSession must be locked when getting here. - if err := consumerSession.VerifyLock(); err != nil { + if err := consumerSession.VerifyLock(extensions); err != nil { return sdkerrors.Wrapf(err, "OnSessionFailure, consumerSession.lock must be locked before accessing this method, additional info:") } // redemptionSession = true, if we got this provider from the blocked provider list. @@ -964,7 +964,7 @@ func (csm *ConsumerSessionManager) OnSessionFailure(consumerSession *SingleConsu parentConsumerSessionsWithProvider := consumerSession.Parent // must read this pointer before unlocking csm.updateMetricsManager(consumerSession, time.Duration(0), false) // finished with consumerSession here can unlock. 
- consumerSession.Free(errorReceived) // we unlock before we change anything in the parent ConsumerSessionsWithProvider + consumerSession.Free(errorReceived, extensions) // we unlock before we change anything in the parent ConsumerSessionsWithProvider err := parentConsumerSessionsWithProvider.decreaseUsedComputeUnits(cuToDecrease) // change the cu in parent if err != nil { @@ -1018,9 +1018,10 @@ func (csm *ConsumerSessionManager) OnSessionDone( numOfProviders int, providersCount uint64, isHangingApi bool, + extensions []*spectypes.Extension, ) error { // release locks, update CU, relaynum etc.. - if err := consumerSession.VerifyLock(); err != nil { + if err := consumerSession.VerifyLock(extensions); err != nil { return sdkerrors.Wrapf(err, "OnSessionDone, consumerSession.lock must be locked before accessing this method") } @@ -1034,7 +1035,7 @@ func (csm *ConsumerSessionManager) OnSessionDone( defer func() { go csm.validateAndReturnBlockedProviderToValidAddressesList(providerAddress) }() } - defer consumerSession.Free(nil) // we need to be locked here, if we didn't get it locked we try lock anyway + defer consumerSession.Free(nil, extensions) // we need to be locked here, if we didn't get it locked we try lock anyway consumerSession.CuSum += consumerSession.LatestRelayCu // add CuSum to current cu usage. consumerSession.LatestRelayCu = 0 // reset cu just in case consumerSession.ConsecutiveErrors = []error{} @@ -1101,12 +1102,12 @@ func (csm *ConsumerSessionManager) GetAtomicPairingAddressesLength() uint64 { } // On a successful Subscribe relay -func (csm *ConsumerSessionManager) OnSessionDoneIncreaseCUOnly(consumerSession *SingleConsumerSession, latestServicedBlock int64) error { - if err := consumerSession.VerifyLock(); err != nil { +func (csm *ConsumerSessionManager) OnSessionDoneIncreaseCUOnly(consumerSession *SingleConsumerSession, latestServicedBlock int64, extensions []*spectypes.Extension) error { + if err := consumerSession.VerifyLock(extensions); err != nil { return sdkerrors.Wrapf(err, "OnSessionDoneIncreaseRelayAndCu consumerSession.lock must be locked before accessing this method") } - defer consumerSession.Free(nil) // we need to be locked here, if we didn't get it locked we try lock anyway + defer consumerSession.Free(nil, extensions) // we need to be locked here, if we didn't get it locked we try lock anyway consumerSession.LatestBlock = latestServicedBlock consumerSession.CuSum += consumerSession.LatestRelayCu // add CuSum to current cu usage. 
consumerSession.LatestRelayCu = 0 // reset cu just in case diff --git a/protocol/lavasession/consumer_session_manager_test.go b/protocol/lavasession/consumer_session_manager_test.go index b22f0c2e61..598c750407 100644 --- a/protocol/lavasession/consumer_session_manager_test.go +++ b/protocol/lavasession/consumer_session_manager_test.go @@ -83,7 +83,7 @@ func TestHappyFlow(t *testing.T) { require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, cs.Session.LatestRelayCu, cuForFirstRequest) - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) require.Equal(t, cs.Session.CuSum, cuForFirstRequest) require.Equal(t, cs.Session.LatestRelayCu, latestRelayCuAfterDone) @@ -346,7 +346,7 @@ func TestSecondChanceRecoveryFlow(t *testing.T) { _, expectedProviderAddress := css[pairingList[0].PublicLavaAddress] require.True(t, expectedProviderAddress) for _, sessionInfo := range css { - csm.OnSessionFailure(sessionInfo.Session, fmt.Errorf("testError")) + csm.OnSessionFailure(sessionInfo.Session, fmt.Errorf("testError"), nil) } _, ok := csm.secondChanceGivenToAddresses[pairingList[0].PublicLavaAddress] if ok { @@ -399,7 +399,7 @@ func TestSecondChanceRecoveryFlow(t *testing.T) { _, expectedProviderAddress := css[pairingList[0].PublicLavaAddress] require.True(t, expectedProviderAddress) for _, sessionInfo := range css { - csm.OnSessionFailure(sessionInfo.Session, fmt.Errorf("testError")) + csm.OnSessionFailure(sessionInfo.Session, fmt.Errorf("testError"), nil) require.Equal(t, BlockedProviderSessionUnusedStatus, csm.pairing[pairingList[0].PublicLavaAddress].blockedAndUsedWithChanceForRecoveryStatus) } if _, ok := csm.reportedProviders.addedToPurgeAndReport[pairingList[0].PublicLavaAddress]; ok { @@ -416,7 +416,7 @@ func runOnSessionDoneForConsumerSessionMap(t *testing.T, css ConsumerSessionsMap require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, cs.Session.LatestRelayCu, cuForFirstRequest) - err := csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err := csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) require.Equal(t, cs.Session.CuSum, cuForFirstRequest) require.Equal(t, cs.Session.LatestRelayCu, latestRelayCuAfterDone) @@ -430,7 +430,7 @@ func runOnSessionFailureForConsumerSessionMap(t *testing.T, css ConsumerSessions require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, cs.Session.LatestRelayCu, cuForFirstRequest) - err := csm.OnSessionFailure(cs.Session, fmt.Errorf("testError")) + err := csm.OnSessionFailure(cs.Session, fmt.Errorf("testError"), nil) require.NoError(t, err) } } @@ -448,7 +448,7 @@ func TestHappyFlowVirtualEpoch(t *testing.T) { require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, 
cs.Session.LatestRelayCu, maxCuForVirtualEpoch*(virtualEpoch+1)) - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, maxCuForVirtualEpoch*(virtualEpoch+1), time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, maxCuForVirtualEpoch*(virtualEpoch+1), time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) require.Equal(t, cs.Session.CuSum, maxCuForVirtualEpoch*(virtualEpoch+1)) require.Equal(t, cs.Session.LatestRelayCu, latestRelayCuAfterDone) @@ -484,7 +484,7 @@ func TestPairingReset(t *testing.T) { require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, cs.Session.LatestRelayCu, cuForFirstRequest) - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) require.Equal(t, cs.Session.CuSum, cuForFirstRequest) require.Equal(t, cs.Session.LatestRelayCu, latestRelayCuAfterDone) @@ -509,7 +509,7 @@ func TestPairingResetWithFailures(t *testing.T) { require.NoError(t, err) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, nil) + err = csm.OnSessionFailure(cs.Session, nil, nil) require.NoError(t, err) // fail test. } } @@ -545,7 +545,7 @@ func TestPairingResetWithMultipleFailures(t *testing.T) { require.NoError(t, err) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, nil) + err = csm.OnSessionFailure(cs.Session, nil, nil) require.NoError(t, err) } @@ -573,7 +573,7 @@ func TestPairingResetWithMultipleFailures(t *testing.T) { require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, cs.Session.LatestRelayCu, cuForFirstRequest) - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) require.Equal(t, cs.Session.CuSum, cuForFirstRequest) require.Equal(t, cs.Session.LatestRelayCu, latestRelayCuAfterDone) @@ -619,7 +619,7 @@ func TestSuccessAndFailureOfSessionWithUpdatePairingsInTheMiddle(t *testing.T) { require.Equal(t, epoch, csm.currentEpoch) if rand.Intn(2) > 0 { - err = csm.OnSessionDone(cs, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) require.Equal(t, cs.CuSum, cuForFirstRequest) require.Equal(t, cs.LatestRelayCu, latestRelayCuAfterDone) @@ -627,7 
+627,7 @@ func TestSuccessAndFailureOfSessionWithUpdatePairingsInTheMiddle(t *testing.T) { require.Equal(t, cs.LatestBlock, servicedBlockNumber) sessionListData[j] = SessTestData{cuSum: cuForFirstRequest, relayNum: 1} } else { - err = csm.OnSessionFailure(cs, nil) + err = csm.OnSessionFailure(cs, nil, nil) require.NoError(t, err) require.Equal(t, cs.CuSum, uint64(0)) require.Equal(t, cs.RelayNum, relayNumberAfterFirstFail) @@ -653,14 +653,14 @@ func TestSuccessAndFailureOfSessionWithUpdatePairingsInTheMiddle(t *testing.T) { for j := numberOfAllowedSessionsPerConsumer / 2; j < numberOfAllowedSessionsPerConsumer; j++ { cs := sessionList[j].cs if rand.Intn(2) > 0 { - err = csm.OnSessionDone(cs, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) require.Equal(t, sessionListData[j].cuSum+cuForFirstRequest, cs.CuSum) require.Equal(t, cs.LatestRelayCu, latestRelayCuAfterDone) require.Equal(t, cs.RelayNum, sessionListData[j].relayNum+1) require.Equal(t, cs.LatestBlock, servicedBlockNumber) } else { - err = csm.OnSessionFailure(cs, nil) + err = csm.OnSessionFailure(cs, nil, nil) require.NoError(t, err) require.Equal(t, sessionListData[j].cuSum, cs.CuSum) require.Equal(t, cs.RelayNum, sessionListData[j].relayNum+1) @@ -676,7 +676,7 @@ func successfulSession(ctx context.Context, csm *ConsumerSessionManager, t *test for _, cs := range css { require.NotNil(t, cs) time.Sleep(time.Duration((rand.Intn(500) + 1)) * time.Millisecond) - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) ch <- p } @@ -689,7 +689,7 @@ func failedSession(ctx context.Context, csm *ConsumerSessionManager, t *testing. for _, cs := range css { require.NotNil(t, cs) time.Sleep(time.Duration((rand.Intn(500) + 1)) * time.Millisecond) - err = csm.OnSessionFailure(cs.Session, fmt.Errorf("nothing special")) + err = csm.OnSessionFailure(cs.Session, fmt.Errorf("nothing special"), nil) require.NoError(t, err) ch <- p } @@ -808,7 +808,7 @@ func TestSessionFailureAndGetReportedProviders(t *testing.T) { require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, cs.Session.LatestRelayCu, cuForFirstRequest) - err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError) + err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError, nil) require.NoError(t, err) require.Equal(t, cs.Session.Parent.UsedComputeUnits, cuSumOnFailure) require.Equal(t, cs.Session.CuSum, cuSumOnFailure) @@ -845,7 +845,7 @@ func TestSessionFailureEpochMisMatch(t *testing.T) { err = csm.UpdateAllProviders(secondEpochHeight, pairingList) // update the providers again. 
require.NoError(t, err) - err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError) + err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError, nil) require.NoError(t, err) } } @@ -945,7 +945,7 @@ func TestPairingWithAddons(t *testing.T) { css, err := csm.GetSessions(ctx, cuForFirstRequest, NewUsedProviders(nil), servicedBlockNumber, addon, nil, common.NO_STATE, 0) // get a session require.NoError(t, err, i) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError) + err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError, nil) require.NoError(t, err) } utils.LavaFormatDebug("length!", utils.Attribute{Key: "length", Value: len(csm.getValidAddresses(addon, nil))}, utils.Attribute{Key: "valid addresses", Value: csm.getValidAddresses(addon, nil)}) @@ -957,7 +957,7 @@ func TestPairingWithAddons(t *testing.T) { css, err := csm.GetSessions(ctx, cuForFirstRequest, NewUsedProviders(nil), servicedBlockNumber, addon, nil, common.NO_STATE, 0) // get a session require.NoError(t, err) for _, cs := range css { - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) } }) @@ -1020,7 +1020,7 @@ func TestPairingWithExtensions(t *testing.T) { css, err := csm.GetSessions(ctx, cuForFirstRequest, NewUsedProviders(nil), servicedBlockNumber, extensionOpt.addon, extensionsList, common.NO_STATE, 0) // get a session require.NoError(t, err, i) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError) + err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError, nil) require.NoError(t, err) } utils.LavaFormatDebug("length!", utils.Attribute{Key: "length", Value: len(csm.getValidAddresses(extensionOpt.addon, extensionOpt.extensions))}, utils.Attribute{Key: "valid addresses", Value: csm.getValidAddresses(extensionOpt.addon, extensionOpt.extensions)}) @@ -1032,7 +1032,7 @@ func TestPairingWithExtensions(t *testing.T) { css, err := csm.GetSessions(ctx, cuForFirstRequest, NewUsedProviders(nil), servicedBlockNumber, extensionOpt.addon, extensionsList, common.NO_STATE, 0) // get a session require.NoError(t, err) for _, cs := range css { - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) } }) @@ -1068,11 +1068,11 @@ func TestPairingWithStateful(t *testing.T) { require.NoError(t, err) require.Equal(t, allProviders, len(css)) for _, cs := range css { - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, 
cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), numberOfProviders, numberOfProviders, false, nil) require.NoError(t, err) } usedProviders := NewUsedProviders(nil) - usedProviders.RemoveUsed(providerAddresses[0], nil) + usedProviders.RemoveUsed(providerAddresses[0], nil, nil) css, err = csm.GetSessions(ctx, cuForFirstRequest, usedProviders, servicedBlockNumber, addon, nil, common.CONSISTENCY_SELECT_ALL_PROVIDERS, 0) // get a session require.NoError(t, err) require.Equal(t, allProviders-1, len(css)) @@ -1090,7 +1090,7 @@ func TestMaximumBlockedSessionsErrorsInPairingListEmpty(t *testing.T) { css, err := csm.GetSessions(ctx, cuForFirstRequest, NewUsedProviders(nil), servicedBlockNumber, "", nil, common.NO_STATE, 0) // get a session require.NoError(t, err) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, errors.Join(BlockProviderError, SessionOutOfSyncError)) + err = csm.OnSessionFailure(cs.Session, errors.Join(BlockProviderError, SessionOutOfSyncError), nil) require.NoError(t, err) } } diff --git a/protocol/lavasession/consumer_types.go b/protocol/lavasession/consumer_types.go index 67c8b7b259..04a39dc390 100644 --- a/protocol/lavasession/consumer_types.go +++ b/protocol/lavasession/consumer_types.go @@ -13,6 +13,7 @@ import ( "github.com/lavanet/lava/v3/utils/rand" pairingtypes "github.com/lavanet/lava/v3/x/pairing/types" planstypes "github.com/lavanet/lava/v3/x/plans/types" + spectypes "github.com/lavanet/lava/v3/x/spec/types" "google.golang.org/grpc" "google.golang.org/grpc/connectivity" ) @@ -51,11 +52,11 @@ var ( ) type UsedProvidersInf interface { - RemoveUsed(providerAddress string, err error) + RemoveUsed(providerAddress string, extensions []*spectypes.Extension, err error) TryLockSelection(context.Context) error - AddUsed(ConsumerSessionsMap, error) - GetUnwantedProvidersToSend() map[string]struct{} - AddUnwantedAddresses(address string) + AddUsed(ConsumerSessionsMap, []*spectypes.Extension, error) + GetUnwantedProvidersToSend(extensions []*spectypes.Extension) map[string]struct{} + AddUnwantedAddresses(address string, extensions []string) CurrentlyUsed() int } diff --git a/protocol/lavasession/end_to_end_lavasession_test.go b/protocol/lavasession/end_to_end_lavasession_test.go index 0b434855b6..e2e85546e3 100644 --- a/protocol/lavasession/end_to_end_lavasession_test.go +++ b/protocol/lavasession/end_to_end_lavasession_test.go @@ -57,7 +57,7 @@ func TestHappyFlowE2EEmergency(t *testing.T) { skippedRelays++ - err := csm.OnSessionFailure(cs.Session, nil) + err := csm.OnSessionFailure(cs.Session, nil, nil) require.NoError(t, err) err = psm.OnSessionFailure(sps, cs.Session.RelayNum-skippedRelays) @@ -72,7 +72,7 @@ func TestHappyFlowE2EEmergency(t *testing.T) { err = psm.OnSessionDone(sps, cs.Session.RelayNum-skippedRelays) require.NoError(t, err) - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, maxCuForVirtualEpoch, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), 1, 1, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, maxCuForVirtualEpoch, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), 1, 1, false, nil) require.NoError(t, err) } @@ -124,7 +124,7 @@ func TestHappyFlowEmergencyInConsumer(t *testing.T) { require.NoError(t, err) // Consumer Side: - err = csm.OnSessionFailure(cs.Session, nil) + err = csm.OnSessionFailure(cs.Session, nil, nil) require.NoError(t, err) require.Equal(t, cs.Session.CuSum, 
maxCuForVirtualEpoch) require.Equal(t, cs.Session.LatestRelayCu, latestRelayCuAfterDone) @@ -195,7 +195,7 @@ func prepareSessionsWithFirstRelay(t *testing.T, cuForFirstRequest uint64) (*Con require.NoError(t, err) // Consumer Side: - err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), 1, 1, false) + err = csm.OnSessionDone(cs.Session, servicedBlockNumber, cuForFirstRequest, time.Millisecond, cs.Session.CalculateExpectedLatency(2*time.Millisecond), (servicedBlockNumber - 1), 1, 1, false, nil) require.NoError(t, err) require.Equal(t, cs.Session.CuSum, cuForFirstRequest) require.Equal(t, cs.Session.LatestRelayCu, latestRelayCuAfterDone) diff --git a/protocol/lavasession/router_key.go b/protocol/lavasession/router_key.go index 291e543235..b4ac807cc6 100644 --- a/protocol/lavasession/router_key.go +++ b/protocol/lavasession/router_key.go @@ -4,6 +4,8 @@ import ( "sort" "strconv" "strings" + + spectypes "github.com/lavanet/lava/v3/x/spec/types" ) const ( @@ -18,18 +20,31 @@ func (rk *RouterKey) ApplyMethodsRoute(routeNum int) RouterKey { return RouterKey(string(*rk) + methodRouteSep + additionalPath) } +func newRouterKeyInner(uniqueExtensions map[string]struct{}) RouterKey { + uniqueExtensionsSlice := []string{} + for addon := range uniqueExtensions { // we are sorting this anyway so we don't have to keep order + uniqueExtensionsSlice = append(uniqueExtensionsSlice, addon) + } + sort.Strings(uniqueExtensionsSlice) + return RouterKey(sep + strings.Join(uniqueExtensionsSlice, sep) + sep) +} + func NewRouterKey(extensions []string) RouterKey { // make sure addons have no repetitions uniqueExtensions := map[string]struct{}{} for _, extension := range extensions { uniqueExtensions[extension] = struct{}{} } - uniqueExtensionsSlice := []string{} - for addon := range uniqueExtensions { // we are sorting this anyway so we don't have to keep order - uniqueExtensionsSlice = append(uniqueExtensionsSlice, addon) + return newRouterKeyInner(uniqueExtensions) +} + +func NewRouterKeyFromExtensions(extensions []*spectypes.Extension) RouterKey { + // make sure addons have no repetitions + uniqueExtensions := map[string]struct{}{} + for _, extension := range extensions { + uniqueExtensions[extension.Name] = struct{}{} } - sort.Strings(uniqueExtensionsSlice) - return RouterKey(sep + strings.Join(uniqueExtensionsSlice, sep) + sep) + return newRouterKeyInner(uniqueExtensions) } func GetEmptyRouterKey() RouterKey { diff --git a/protocol/lavasession/single_consumer_session.go b/protocol/lavasession/single_consumer_session.go index 7aa0dc1304..f4e5efb1f8 100644 --- a/protocol/lavasession/single_consumer_session.go +++ b/protocol/lavasession/single_consumer_session.go @@ -9,6 +9,7 @@ import ( sdk "github.com/cosmos/cosmos-sdk/types" "github.com/lavanet/lava/v3/utils" pairingtypes "github.com/lavanet/lava/v3/x/pairing/types" + spectypes "github.com/lavanet/lava/v3/x/spec/types" ) type SingleConsumerSession struct { @@ -25,7 +26,7 @@ type SingleConsumerSession struct { BlockListed bool // if session lost sync we blacklist it. 
ConsecutiveErrors []error errorsCount uint64 - relayProcessor UsedProvidersInf + usedProviders UsedProvidersInf providerUniqueId string StaticProvider bool } @@ -111,14 +112,14 @@ func (scs *SingleConsumerSession) SetUsageForSession(cuNeededForSession uint64, scs.QoSInfo.LastExcellenceQoSReport = qoSExcellenceReport scs.QoSInfo.LastExcellenceQoSReportRaw = rawQoSExcellenceReport } - scs.relayProcessor = usedProviders + scs.usedProviders = usedProviders return nil } -func (scs *SingleConsumerSession) Free(err error) { - if scs.relayProcessor != nil { - scs.relayProcessor.RemoveUsed(scs.Parent.PublicLavaAddress, err) - scs.relayProcessor = nil +func (scs *SingleConsumerSession) Free(err error, extensions []*spectypes.Extension) { + if scs.usedProviders != nil { + scs.usedProviders.RemoveUsed(scs.Parent.PublicLavaAddress, extensions, err) + scs.usedProviders = nil } scs.EndpointConnection.decreaseSessionUsingConnection() scs.lock.Unlock() @@ -130,7 +131,7 @@ func (session *SingleConsumerSession) TryUseSession() (blocked bool, ok bool) { session.lock.Unlock() return true, false } - if session.relayProcessor != nil { + if session.usedProviders != nil { utils.LavaFormatError("session misuse detected, usedProviders isn't nil, missing Free call, blocking", nil, utils.LogAttr("session", session.SessionId)) session.BlockListed = true session.lock.Unlock() @@ -143,10 +144,10 @@ func (session *SingleConsumerSession) TryUseSession() (blocked bool, ok bool) { } // Verify the consumerSession is locked when getting to this function, if its not locked throw an error -func (consumerSession *SingleConsumerSession) VerifyLock() error { +func (consumerSession *SingleConsumerSession) VerifyLock(extensions []*spectypes.Extension) error { if consumerSession.lock.TryLock() { // verify. // if we managed to lock throw an error for misuse. - defer consumerSession.Free(nil) + defer consumerSession.Free(nil, extensions) // if failed to lock we should block session as it seems like a very rare case. consumerSession.BlockListed = true // block this session from future usages utils.LavaFormatError("Verify Lock failed on session Failure, blocking session", nil, utils.LogAttr("consumerSession", consumerSession)) diff --git a/protocol/lavasession/used_providers.go b/protocol/lavasession/used_providers.go index dcf95d951e..0215dcd439 100644 --- a/protocol/lavasession/used_providers.go +++ b/protocol/lavasession/used_providers.go @@ -6,6 +6,7 @@ import ( "time" "github.com/lavanet/lava/v3/utils" + spectypes "github.com/lavanet/lava/v3/x/spec/types" ) const MaximumNumberOfSelectionLockAttempts = 500 @@ -25,22 +26,34 @@ func NewUsedProviders(blockedProviders BlockedProvidersInf) *UsedProviders { } } return &UsedProviders{ - providers: map[string]struct{}{}, - unwantedProviders: unwantedProviders, - blockOnSyncLoss: map[string]struct{}{}, - erroredProviders: map[string]struct{}{}, + uniqueUsedProviders: map[RouterKey]*UniqueUsedProviders{NewRouterKey([]string{}): &UniqueUsedProviders{ + providers: map[string]struct{}{}, + unwantedProviders: unwantedProviders, + blockOnSyncLoss: map[string]struct{}{}, + erroredProviders: map[string]struct{}{}, + }}, + // we keep the original unwanted providers so when we create more unique used providers + // we can reuse it as its the user's instructions. + originalUnwantedProviders: unwantedProviders, } } +// unique used providers are specific for an extension router key. 
+// meaning each extension router key has a different used providers struct
+type UniqueUsedProviders struct {
+	providers         map[string]struct{}
+	unwantedProviders map[string]struct{}
+	erroredProviders  map[string]struct{} // providers who returned protocol errors (used to debug relays for now)
+	blockOnSyncLoss   map[string]struct{}
+}
+
 type UsedProviders struct {
-	lock                sync.RWMutex
-	providers           map[string]struct{}
-	selecting           bool
-	unwantedProviders   map[string]struct{}
-	erroredProviders    map[string]struct{} // providers who returned protocol errors (used to debug relays for now)
-	blockOnSyncLoss     map[string]struct{}
-	sessionsLatestBatch int
-	batchNumber         int
+	lock                      sync.RWMutex
+	uniqueUsedProviders       map[RouterKey]*UniqueUsedProviders
+	originalUnwantedProviders map[string]struct{}
+	selecting                 bool
+	sessionsLatestBatch       int
+	batchNumber               int
 }
 
 func (up *UsedProviders) CurrentlyUsed() int {
@@ -50,7 +63,11 @@ func (up *UsedProviders) CurrentlyUsed() int {
 	}
 	up.lock.RLock()
 	defer up.lock.RUnlock()
-	return len(up.providers)
+	currentlyUsed := 0
+	for _, uniqueUsedProviders := range up.uniqueUsedProviders {
+		currentlyUsed += len(uniqueUsedProviders.providers)
+	}
+	return currentlyUsed
 }
 
 func (up *UsedProviders) SessionsLatestBatch() int {
@@ -81,8 +98,10 @@ func (up *UsedProviders) CurrentlyUsedAddresses() []string {
 	up.lock.RLock()
 	defer up.lock.RUnlock()
 	addresses := []string{}
-	for addr := range up.providers {
-		addresses = append(addresses, addr)
+	for _, uniqueUsedProviders := range up.uniqueUsedProviders {
+		for addr := range uniqueUsedProviders.providers {
+			addresses = append(addresses, addr)
+		}
 	}
 	return addresses
 }
@@ -95,46 +114,69 @@ func (up *UsedProviders) UnwantedAddresses() []string {
 	up.lock.RLock()
 	defer up.lock.RUnlock()
 	addresses := []string{}
-	for addr := range up.unwantedProviders {
-		addresses = append(addresses, addr)
+	for _, uniqueUsedProviders := range up.uniqueUsedProviders {
+		for addr := range uniqueUsedProviders.unwantedProviders {
+			addresses = append(addresses, addr)
+		}
 	}
 	return addresses
 }
 
-func (up *UsedProviders) AddUnwantedAddresses(address string) {
+// Use when locked. Checks whether a router key already exists in the unique used providers map;
+// if it does, return it. If it doesn't,
+// create a new instance and return it.
+func (up *UsedProviders) createOrUseUniqueUsedProvidersForKey(key RouterKey) *UniqueUsedProviders {
+	uniqueUsedProviders, ok := up.uniqueUsedProviders[key]
+	if !ok {
+		uniqueUsedProviders = &UniqueUsedProviders{
+			providers:         map[string]struct{}{},
+			unwantedProviders: up.originalUnwantedProviders,
+			blockOnSyncLoss:   map[string]struct{}{},
+			erroredProviders:  map[string]struct{}{},
+		}
+		up.uniqueUsedProviders[key] = uniqueUsedProviders
+	}
+	return uniqueUsedProviders
+}
+
+func (up *UsedProviders) AddUnwantedAddresses(address string, extensions []string) {
 	if up == nil {
 		utils.LavaFormatError("UsedProviders.AddUnwantedAddresses is nil, misuse detected", nil)
 		return
 	}
+	routerKey := NewRouterKey(extensions)
 	up.lock.Lock()
 	defer up.lock.Unlock()
-	up.unwantedProviders[address] = struct{}{}
+	uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey)
+	uniqueUsedProviders.unwantedProviders[address] = struct{}{}
 }
 
-func (up *UsedProviders) RemoveUsed(provider string, err error) {
+func (up *UsedProviders) RemoveUsed(provider string, extensions []*spectypes.Extension, err error) {
 	if up == nil {
 		return
 	}
+	routerKey := NewRouterKeyFromExtensions(extensions)
 	up.lock.Lock()
 	defer up.lock.Unlock()
+	uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey)
+
 	if err != nil {
-		up.erroredProviders[provider] = struct{}{}
+		uniqueUsedProviders.erroredProviders[provider] = struct{}{}
 		if shouldRetryWithThisError(err) {
-			_, ok := up.blockOnSyncLoss[provider]
+			_, ok := uniqueUsedProviders.blockOnSyncLoss[provider]
 			if !ok && IsSessionSyncLoss(err) {
-				up.blockOnSyncLoss[provider] = struct{}{}
+				uniqueUsedProviders.blockOnSyncLoss[provider] = struct{}{}
 				utils.LavaFormatWarning("Identified SyncLoss in provider, allowing retry", err, utils.Attribute{Key: "address", Value: provider})
 			} else {
-				up.setUnwanted(provider)
+				up.setUnwanted(uniqueUsedProviders, provider)
 			}
 		} else {
-			up.setUnwanted(provider)
+			up.setUnwanted(uniqueUsedProviders, provider)
 		}
 	} else {
 		// we got a valid response from this provider, no reason to keep using it
-		up.setUnwanted(provider)
+		up.setUnwanted(uniqueUsedProviders, provider)
 	}
-	delete(up.providers, provider)
+	delete(uniqueUsedProviders.providers, provider)
 }
 
 func (up *UsedProviders) ClearUnwanted() {
@@ -144,20 +186,24 @@ func (up *UsedProviders) ClearUnwanted() {
 	up.lock.Lock()
 	defer up.lock.Unlock()
 	// this is nil safe
-	up.unwantedProviders = map[string]struct{}{}
+	for _, uniqueUsedProviders := range up.uniqueUsedProviders {
+		uniqueUsedProviders.unwantedProviders = map[string]struct{}{}
+	}
 }
 
-func (up *UsedProviders) AddUsed(sessions ConsumerSessionsMap, err error) {
+func (up *UsedProviders) AddUsed(sessions ConsumerSessionsMap, extensions []*spectypes.Extension, err error) {
 	if up == nil {
 		return
 	}
+	routerKey := NewRouterKeyFromExtensions(extensions)
 	up.lock.Lock()
 	defer up.lock.Unlock()
+	uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey)
 	// this is nil safe
 	if len(sessions) > 0 && err == nil {
 		up.sessionsLatestBatch = 0
 		for provider := range sessions { // the key for ConsumerSessionsMap is the provider public address
-			up.providers[provider] = struct{}{}
+			uniqueUsedProviders.providers[provider] = struct{}{}
 			up.sessionsLatestBatch++
 		}
 		// increase batch number
@@ -167,11 +213,8 @@
 }
 
 // called when already locked.
-func (up *UsedProviders) setUnwanted(provider string) { - if up == nil { - return - } - up.unwantedProviders[provider] = struct{}{} +func (up *UsedProviders) setUnwanted(uniqueUsedProviders *UniqueUsedProviders, provider string) { + uniqueUsedProviders.unwantedProviders[provider] = struct{}{} } func (up *UsedProviders) TryLockSelection(ctx context.Context) error { @@ -206,28 +249,32 @@ func (up *UsedProviders) tryLockSelection() bool { return false } -func (up *UsedProviders) GetErroredProviders() map[string]struct{} { +func (up *UsedProviders) GetErroredProviders(extensions []*spectypes.Extension) map[string]struct{} { if up == nil { return map[string]struct{}{} } - up.lock.RLock() - defer up.lock.RUnlock() - return up.erroredProviders + routerKey := NewRouterKeyFromExtensions(extensions) + up.lock.Lock() + defer up.lock.Unlock() + uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey) + return uniqueUsedProviders.erroredProviders } -func (up *UsedProviders) GetUnwantedProvidersToSend() map[string]struct{} { +func (up *UsedProviders) GetUnwantedProvidersToSend(extensions []*spectypes.Extension) map[string]struct{} { if up == nil { return map[string]struct{}{} } - up.lock.RLock() - defer up.lock.RUnlock() + routerKey := NewRouterKeyFromExtensions(extensions) + up.lock.Lock() + defer up.lock.Unlock() + uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey) unwantedProvidersToSend := map[string]struct{}{} // block the currently used providers - for provider := range up.providers { + for provider := range uniqueUsedProviders.providers { unwantedProvidersToSend[provider] = struct{}{} } // block providers that we have a response for - for provider := range up.unwantedProviders { + for provider := range uniqueUsedProviders.unwantedProviders { unwantedProvidersToSend[provider] = struct{}{} } return unwantedProvidersToSend diff --git a/protocol/lavasession/used_providers_test.go b/protocol/lavasession/used_providers_test.go index 30f3c7a641..aa4cb87a2e 100644 --- a/protocol/lavasession/used_providers_test.go +++ b/protocol/lavasession/used_providers_test.go @@ -7,6 +7,7 @@ import ( "time" "github.com/gogo/status" + spectypes "github.com/lavanet/lava/v3/x/spec/types" "github.com/stretchr/testify/require" "google.golang.org/grpc/codes" ) @@ -20,35 +21,35 @@ func TestUsedProviders(t *testing.T) { require.False(t, canUseAgain) require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) - unwanted := usedProviders.GetUnwantedProvidersToSend() + unwanted := usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) require.Len(t, unwanted, 0) consumerSessionsMap := ConsumerSessionsMap{"test": &SessionInfo{}, "test2": &SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) canUseAgain = usedProviders.tryLockSelection() require.True(t, canUseAgain) - unwanted = usedProviders.GetUnwantedProvidersToSend() + unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) require.Len(t, unwanted, 2) require.Equal(t, 2, usedProviders.CurrentlyUsed()) canUseAgain = usedProviders.tryLockSelection() require.False(t, canUseAgain) consumerSessionsMap = ConsumerSessionsMap{"test3": &SessionInfo{}, "test4": &SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) - unwanted = usedProviders.GetUnwantedProvidersToSend() + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + unwanted = 
usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) require.Len(t, unwanted, 4) require.Equal(t, 4, usedProviders.CurrentlyUsed()) // one provider gives a retry - usedProviders.RemoveUsed("test", status.Error(codes.Code(SessionOutOfSyncError.ABCICode()), "")) + usedProviders.RemoveUsed("test", nil, status.Error(codes.Code(SessionOutOfSyncError.ABCICode()), "")) require.Equal(t, 3, usedProviders.CurrentlyUsed()) - unwanted = usedProviders.GetUnwantedProvidersToSend() + unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) require.Len(t, unwanted, 3) // one provider gives a result - usedProviders.RemoveUsed("test2", nil) - unwanted = usedProviders.GetUnwantedProvidersToSend() + usedProviders.RemoveUsed("test2", nil, nil) + unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) require.Len(t, unwanted, 3) require.Equal(t, 2, usedProviders.CurrentlyUsed()) // one provider gives an error - usedProviders.RemoveUsed("test3", fmt.Errorf("bad")) - unwanted = usedProviders.GetUnwantedProvidersToSend() + usedProviders.RemoveUsed("test3", nil, fmt.Errorf("bad")) + unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) require.Len(t, unwanted, 3) require.Equal(t, 1, usedProviders.CurrentlyUsed()) canUseAgain = usedProviders.tryLockSelection() @@ -68,13 +69,13 @@ func TestUsedProvidersAsync(t *testing.T) { go func() { time.Sleep(time.Millisecond * 10) consumerSessionsMap := ConsumerSessionsMap{"test": &SessionInfo{}, "test2": &SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) }() ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*100) defer cancel() canUseAgain := usedProviders.TryLockSelection(ctx) require.Nil(t, canUseAgain) - unwanted := usedProviders.GetUnwantedProvidersToSend() + unwanted := usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) require.Len(t, unwanted, 2) require.Equal(t, 2, usedProviders.CurrentlyUsed()) }) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index b7fd41f683..bf8c0fc4ab 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -2,6 +2,7 @@ package rpcconsumer import ( context "context" + "sync/atomic" "time" "github.com/lavanet/lava/v3/protocol/chainlib" @@ -13,7 +14,6 @@ import ( type RelayStateMachine interface { GetProtocolMessage() chainlib.ProtocolMessage - ShouldRetry(numberOfRetriesLaunched int) bool GetDebugState() bool GetRelayTaskChannel() chan RelayStateSendInstructions UpdateBatch(err error) @@ -23,7 +23,6 @@ type RelayStateMachine interface { } type ConsumerRelaySender interface { - sendRelayToProvider(ctx context.Context, protocolMessage chainlib.ProtocolMessage, relayProcessor *RelayProcessor, analytics *metrics.RelayMetrics) (errRet error) getProcessingTimeout(chainMessage chainlib.ChainMessage) (processingTimeout time.Duration, relayTimeout time.Duration) GetChainIdAndApiInterface() (string, string) } @@ -84,7 +83,19 @@ func (crsm *ConsumerRelayStateMachine) GetSelection() Selection { return crsm.selection } -func (crsm *ConsumerRelayStateMachine) ShouldRetry(numberOfRetriesLaunched int) bool { +func (crsm *ConsumerRelayStateMachine) shouldRetryOnResult(numberOfRetriesLaunched int, numberOfNodeErrors uint64) bool { + shouldRetry := crsm.shouldRetryInner(numberOfRetriesLaunched) + if shouldRetry { + // 
retry archive logic + if len(crsm.GetProtocolMessage().GetRequestedBlocksHashes()) > 0 && numberOfNodeErrors > 0 { + // we had node error, and we have a hash parsed. + + } + } + return crsm.shouldRetryInner(numberOfRetriesLaunched) +} + +func (crsm *ConsumerRelayStateMachine) shouldRetryInner(numberOfRetriesLaunched int) bool { if numberOfRetriesLaunched >= MaximumNumberOfTickerRelayRetries { return false } @@ -92,6 +103,10 @@ func (crsm *ConsumerRelayStateMachine) ShouldRetry(numberOfRetriesLaunched int) return crsm.selection != BestResult } +func (crsm *ConsumerRelayStateMachine) shouldRetryTicker(numberOfRetriesLaunched int) bool { + return crsm.shouldRetryInner(numberOfRetriesLaunched) +} + func (crsm *ConsumerRelayStateMachine) GetDebugState() bool { return crsm.debugRelays } @@ -124,12 +139,15 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() chan RelayStateSend processingCtx, processingCtxCancel := context.WithTimeout(crsm.ctx, processingTimeout) defer processingCtxCancel() + numberOfNodeErrorsAtomic := atomic.Uint64{} readResultsFromProcessor := func() { // ProcessResults is reading responses while blocking until the conditions are met utils.LavaFormatTrace("[StateMachine] Waiting for results", utils.LogAttr("batch", crsm.usedProviders.BatchNumber())) crsm.parentRelayProcessor.WaitForResults(processingCtx) // Decide if we need to resend or not - if crsm.parentRelayProcessor.HasRequiredNodeResults() { + metRequiredNodeResults, numberOfNodeErrors := crsm.parentRelayProcessor.HasRequiredNodeResults() + numberOfNodeErrorsAtomic.Store(uint64(numberOfNodeErrors)) + if metRequiredNodeResults { gotResults <- true } else { gotResults <- false @@ -193,7 +211,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() chan RelayStateSend return } // If should retry == true, send a new batch. (success == false) - if crsm.ShouldRetry(crsm.usedProviders.BatchNumber()) { + if crsm.shouldRetryOnResult(crsm.usedProviders.BatchNumber(), numberOfNodeErrorsAtomic.Load()) { utils.LavaFormatTrace("[StateMachine] success := <-gotResults - crsm.ShouldRetry(batchNumber)", utils.LogAttr("batch", crsm.usedProviders.BatchNumber())) relayTaskChannel <- RelayStateSendInstructions{protocolMessage: crsm.GetProtocolMessage()} } else { @@ -202,7 +220,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() chan RelayStateSend go readResultsFromProcessor() case <-startNewBatchTicker.C: // Only trigger another batch for non BestResult relays or if we didn't pass the retry limit. 
- if crsm.ShouldRetry(crsm.usedProviders.BatchNumber()) { + if crsm.shouldRetryTicker(crsm.usedProviders.BatchNumber()) { utils.LavaFormatTrace("[StateMachine] ticker triggered", utils.LogAttr("batch", crsm.usedProviders.BatchNumber())) relayTaskChannel <- RelayStateSendInstructions{protocolMessage: crsm.GetProtocolMessage()} // Add ticker launch metrics diff --git a/protocol/rpcconsumer/consumer_relay_state_machine_test.go b/protocol/rpcconsumer/consumer_relay_state_machine_test.go index c42d003714..9cbc6d346a 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine_test.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine_test.go @@ -72,27 +72,28 @@ func TestConsumerStateMachineHappyFlow(t *testing.T) { switch taskNumber { case 0: require.False(t, task.IsDone()) - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) relayProcessor.UpdateBatch(nil) sendProtocolError(relayProcessor, "lava@test", time.Millisecond*1, fmt.Errorf("bad")) case 1: require.False(t, task.IsDone()) - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) relayProcessor.UpdateBatch(nil) sendNodeError(relayProcessor, "lava2@test", time.Millisecond*1) case 2: require.False(t, task.IsDone()) - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) relayProcessor.UpdateBatch(nil) sendNodeError(relayProcessor, "lava2@test", time.Millisecond*1) case 3: require.False(t, task.IsDone()) - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) relayProcessor.UpdateBatch(nil) sendSuccessResp(relayProcessor, "lava4@test", time.Millisecond*1) case 4: require.True(t, task.IsDone()) - require.True(t, relayProcessor.HasRequiredNodeResults()) + results, _ := relayProcessor.HasRequiredNodeResults() + require.True(t, results) returnedResult, err := relayProcessor.ProcessingResult() require.NoError(t, err) require.Equal(t, string(returnedResult.Reply.Data), "ok") diff --git a/protocol/rpcconsumer/relay_processor.go b/protocol/rpcconsumer/relay_processor.go index 75c408985f..a4faccb014 100644 --- a/protocol/rpcconsumer/relay_processor.go +++ b/protocol/rpcconsumer/relay_processor.go @@ -214,9 +214,9 @@ func (rp *RelayProcessor) shouldRetryRelay(resultsCount int, hashErr error, node return true } -func (rp *RelayProcessor) HasRequiredNodeResults() bool { +func (rp *RelayProcessor) HasRequiredNodeResults() (bool, int) { if rp == nil { - return false + return false, 0 } rp.lock.RLock() defer rp.lock.RUnlock() @@ -236,17 +236,17 @@ func (rp *RelayProcessor) HasRequiredNodeResults() bool { go rp.metricsInf.SetNodeErrorRecoveredSuccessfullyMetric(chainId, apiInterface, strconv.Itoa(nodeErrors)) } } - return true + return true, nodeErrors } if rp.selection == Quorum { // We need a quorum of all node results if nodeErrors+resultsCount >= rp.requiredSuccesses { // Retry on node error flow: - return rp.shouldRetryRelay(resultsCount, hashErr, nodeErrors, hash) + return rp.shouldRetryRelay(resultsCount, hashErr, nodeErrors, hash), nodeErrors } } // on BestResult we want to retry if there is no success - return false + return false, nodeErrors } func (rp *RelayProcessor) handleResponse(response *relayResponse) { diff --git a/protocol/rpcconsumer/relay_processor_test.go b/protocol/rpcconsumer/relay_processor_test.go index 7bd4f85151..59e2957f5b 100644 --- 
a/protocol/rpcconsumer/relay_processor_test.go +++ b/protocol/rpcconsumer/relay_processor_test.go @@ -41,7 +41,7 @@ var ( func sendSuccessResp(relayProcessor *RelayProcessor, provider string, delay time.Duration) { time.Sleep(delay) - relayProcessor.GetUsedProviders().RemoveUsed(provider, nil) + relayProcessor.GetUsedProviders().RemoveUsed(provider, nil, nil) response := &relayResponse{ relayResult: common.RelayResult{ Request: &pairingtypes.RelayRequest{ @@ -59,7 +59,7 @@ func sendSuccessResp(relayProcessor *RelayProcessor, provider string, delay time func sendProtocolError(relayProcessor *RelayProcessor, provider string, delay time.Duration, err error) { time.Sleep(delay) - relayProcessor.GetUsedProviders().RemoveUsed(provider, err) + relayProcessor.GetUsedProviders().RemoveUsed(provider, nil, err) response := &relayResponse{ relayResult: common.RelayResult{ Request: &pairingtypes.RelayRequest{ @@ -77,7 +77,7 @@ func sendProtocolError(relayProcessor *RelayProcessor, provider string, delay ti func sendNodeError(relayProcessor *RelayProcessor, provider string, delay time.Duration) { time.Sleep(delay) - relayProcessor.GetUsedProviders().RemoveUsed(provider, nil) + relayProcessor.GetUsedProviders().RemoveUsed(provider, nil, nil) response := &relayResponse{ relayResult: common.RelayResult{ Request: &pairingtypes.RelayRequest{ @@ -123,7 +123,7 @@ func TestRelayProcessorHappyFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*10) defer cancel() go sendSuccessResp(relayProcessor, "lava@test", time.Millisecond*5) @@ -179,14 +179,14 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) // check first reply go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) err = relayProcessor.WaitForResults(context.Background()) require.NoError(t, err) resultsOk := relayProcessor.HasResults() require.True(t, resultsOk) - requiredNodeResults := relayProcessor.HasRequiredNodeResults() + requiredNodeResults, _ := relayProcessor.HasRequiredNodeResults() require.False(t, requiredNodeResults) // check first retry go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) @@ -194,7 +194,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.NoError(t, err) resultsOk = relayProcessor.HasResults() require.True(t, resultsOk) - requiredNodeResults = relayProcessor.HasRequiredNodeResults() + requiredNodeResults, _ = relayProcessor.HasRequiredNodeResults() require.False(t, requiredNodeResults) // check first second retry @@ -203,7 +203,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.NoError(t, err) resultsOk = relayProcessor.HasResults() require.True(t, resultsOk) - requiredNodeResults = relayProcessor.HasRequiredNodeResults() + requiredNodeResults, _ = relayProcessor.HasRequiredNodeResults() 
require.True(t, requiredNodeResults) // 2nd relay, same inputs @@ -222,14 +222,14 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap = lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) // check first reply, this time we have hash in map, so we don't retry node errors. go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) err = relayProcessor.WaitForResults(context.Background()) require.NoError(t, err) resultsOk = relayProcessor.HasResults() require.True(t, resultsOk) - requiredNodeResults = relayProcessor.HasRequiredNodeResults() + requiredNodeResults, _ = relayProcessor.HasRequiredNodeResults() require.True(t, requiredNodeResults) // 3nd relay, different inputs @@ -248,14 +248,14 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap = lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) // check first reply, this time we have hash in map, so we don't retry node errors. go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) err = relayProcessor.WaitForResults(context.Background()) require.NoError(t, err) resultsOk = relayProcessor.HasResults() require.True(t, resultsOk) - requiredNodeResults = relayProcessor.HasRequiredNodeResults() + requiredNodeResults, _ = relayProcessor.HasRequiredNodeResults() // check our hashing mechanism works with different inputs require.False(t, requiredNodeResults) @@ -273,7 +273,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap = lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) // check first reply, this time we have hash in map, so we don't retry node errors. hash, err := relayProcessor.getInputMsgInfoHashString() require.NoError(t, err) @@ -283,7 +283,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.NoError(t, err) resultsOk = relayProcessor.HasResults() require.True(t, resultsOk) - requiredNodeResults = relayProcessor.HasRequiredNodeResults() + requiredNodeResults, _ = relayProcessor.HasRequiredNodeResults() require.True(t, requiredNodeResults) // A way for us to break early from sleep, just waiting up to 5 seconds and breaking as soon as the value we expect is there. 
@@ -325,14 +325,14 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) // check first reply go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) err = relayProcessor.WaitForResults(context.Background()) require.NoError(t, err) resultsOk := relayProcessor.HasResults() require.True(t, resultsOk) - requiredNodeResults := relayProcessor.HasRequiredNodeResults() + requiredNodeResults, _ := relayProcessor.HasRequiredNodeResults() require.True(t, requiredNodeResults) relayCountOnNodeError = 2 }) @@ -365,7 +365,7 @@ func TestRelayProcessorTimeout(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) go func() { time.Sleep(time.Millisecond * 5) ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*10) @@ -374,7 +374,7 @@ func TestRelayProcessorTimeout(t *testing.T) { require.NoError(t, ctx.Err()) require.Nil(t, canUse) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test3": &lavasession.SessionInfo{}, "lava@test4": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) }() go sendSuccessResp(relayProcessor, "lava@test", time.Millisecond*20) ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*200) @@ -418,7 +418,7 @@ func TestRelayProcessorRetry(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendSuccessResp(relayProcessor, "lava@test2", time.Millisecond*20) @@ -463,7 +463,7 @@ func TestRelayProcessorRetryNodeError(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendNodeError(relayProcessor, "lava@test2", time.Millisecond*20) @@ -508,7 +508,7 @@ func TestRelayProcessorStatefulApi(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava4@test": &lavasession.SessionInfo{}, "lava3@test": &lavasession.SessionInfo{}, "lava@test": &lavasession.SessionInfo{}, "lava2@test": &lavasession.SessionInfo{}} - 
usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendNodeError(relayProcessor, "lava2@test", time.Millisecond*20) go sendNodeError(relayProcessor, "lava3@test", time.Millisecond*25) @@ -519,14 +519,14 @@ func TestRelayProcessorStatefulApi(t *testing.T) { err := relayProcessor.WaitForResults(ctx) require.NoError(t, err) // Decide if we need to resend or not - if relayProcessor.HasRequiredNodeResults() { + if results, _ := relayProcessor.HasRequiredNodeResults(); results { break } time.Sleep(5 * time.Millisecond) } resultsOk := relayProcessor.HasResults() require.True(t, resultsOk) - resultsOk = relayProcessor.HasRequiredNodeResults() + resultsOk, _ = relayProcessor.HasRequiredNodeResults() require.True(t, resultsOk) protocolErrors := relayProcessor.ProtocolErrors() require.Equal(t, uint64(1), protocolErrors) @@ -563,7 +563,7 @@ func TestRelayProcessorStatefulApiErr(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava4@test": &lavasession.SessionInfo{}, "lava3@test": &lavasession.SessionInfo{}, "lava@test": &lavasession.SessionInfo{}, "lava2@test": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendNodeError(relayProcessor, "lava2@test", time.Millisecond*20) go sendNodeError(relayProcessor, "lava3@test", time.Millisecond*25) @@ -610,7 +610,7 @@ func TestRelayProcessorLatest(t *testing.T) { require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, nil) + usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendSuccessResp(relayProcessor, "lava@test2", time.Millisecond*20) diff --git a/protocol/rpcconsumer/rpcconsumer_server.go b/protocol/rpcconsumer/rpcconsumer_server.go index af11a2d952..3422dcb5aa 100644 --- a/protocol/rpcconsumer/rpcconsumer_server.go +++ b/protocol/rpcconsumer/rpcconsumer_server.go @@ -602,8 +602,8 @@ func (rpccs *RPCConsumerServer) sendRelayToProvider( if rpccs.debugRelays { utils.LavaFormatDebug("[Before Send] returned the following sessions", utils.LogAttr("sessions", sessions), - utils.LogAttr("usedProviders.GetUnwantedProvidersToSend", usedProviders.GetUnwantedProvidersToSend()), - utils.LogAttr("usedProviders.GetErroredProviders", usedProviders.GetErroredProviders()), + utils.LogAttr("usedProviders.GetUnwantedProvidersToSend", usedProviders.GetUnwantedProvidersToSend(extensions)), + utils.LogAttr("usedProviders.GetErroredProviders", usedProviders.GetErroredProviders(extensions)), utils.LogAttr("addons", addon), utils.LogAttr("extensions", extensions), utils.LogAttr("AllowSessionDegradation", relayProcessor.GetAllowSessionDegradation()), @@ -721,7 +721,7 @@ func (rpccs *RPCConsumerServer) sendRelayToProvider( } time.Sleep(backOffDuration) // sleep before releasing this singleConsumerSession // relay failed need to fail the session advancement - errReport := 
rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, origErr) + errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, origErr, extensions) if errReport != nil { utils.LavaFormatError("failed relay onSessionFailure errored", errReport, utils.Attribute{Key: "GUID", Value: goroutineCtx}, utils.Attribute{Key: "original error", Value: origErr.Error()}) } @@ -756,7 +756,7 @@ func (rpccs *RPCConsumerServer) sendRelayToProvider( ) } - errResponse = rpccs.consumerSessionManager.OnSessionDone(singleConsumerSession, latestBlock, chainlib.GetComputeUnits(protocolMessage), relayLatency, singleConsumerSession.CalculateExpectedLatency(expectedRelayTimeoutForQOS), expectedBH, numOfProviders, pairingAddressesLen, protocolMessage.GetApi().Category.HangingApi) // session done successfully + errResponse = rpccs.consumerSessionManager.OnSessionDone(singleConsumerSession, latestBlock, chainlib.GetComputeUnits(protocolMessage), relayLatency, singleConsumerSession.CalculateExpectedLatency(expectedRelayTimeoutForQOS), expectedBH, numOfProviders, pairingAddressesLen, protocolMessage.GetApi().Category.HangingApi, extensions) // session done successfully isNodeError, _ := protocolMessage.CheckResponseError(localRelayResult.Reply.Data, localRelayResult.StatusCode) localRelayResult.IsNodeError = isNodeError if rpccs.cache.CacheActive() && rpcclient.ValidateStatusCodes(localRelayResult.StatusCode, true) == nil { @@ -1003,8 +1003,9 @@ func (rpccs *RPCConsumerServer) relaySubscriptionInner(ctx context.Context, hash ) replyServer, err := endpointClient.RelaySubscribe(ctx, relayResult.Request) + var extensions []*spectypes.Extension // currently no extensions for subscription, so it will be nil. if err != nil { - errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, err) + errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, err, extensions) if errReport != nil { return utils.LavaFormatError("subscribe relay failed onSessionFailure errored", errReport, utils.LogAttr("GUID", ctx), @@ -1018,7 +1019,7 @@ func (rpccs *RPCConsumerServer) relaySubscriptionInner(ctx context.Context, hash reply, err := rpccs.getFirstSubscriptionReply(ctx, hashedParams, replyServer) if err != nil { - errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, err) + errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, err, extensions) if errReport != nil { return utils.LavaFormatError("subscribe relay failed onSessionFailure errored", errReport, utils.LogAttr("GUID", ctx), @@ -1037,7 +1038,7 @@ func (rpccs *RPCConsumerServer) relaySubscriptionInner(ctx context.Context, hash relayResult.ReplyServer = replyServer relayResult.Reply = reply latestBlock := relayResult.Reply.LatestBlock - err = rpccs.consumerSessionManager.OnSessionDoneIncreaseCUOnly(singleConsumerSession, latestBlock) + err = rpccs.consumerSessionManager.OnSessionDoneIncreaseCUOnly(singleConsumerSession, latestBlock, extensions) return err } @@ -1344,7 +1345,7 @@ func (rpccs *RPCConsumerServer) appendHeadersToRelayResult(ctx context.Context, directiveHeaders := protocolMessage.GetDirectiveHeaders() _, debugRelays := directiveHeaders[common.LAVA_DEBUG_RELAY] if debugRelays { - erroredProviders := relayProcessor.GetUsedProviders().GetErroredProviders() + erroredProviders := relayProcessor.GetUsedProviders().GetErroredProviders(protocolMessage.GetExtensions()) if len(erroredProviders) > 0 { erroredProvidersArray := make([]string, 
len(erroredProviders)) idx := 0 From b220be90996dd98c0cac8fc21326bdb1e08271f2 Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 3 Oct 2024 13:11:00 +0200 Subject: [PATCH 02/20] fix deref --- protocol/lavasession/used_providers.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/protocol/lavasession/used_providers.go b/protocol/lavasession/used_providers.go index 0215dcd439..53f68d4328 100644 --- a/protocol/lavasession/used_providers.go +++ b/protocol/lavasession/used_providers.go @@ -128,12 +128,13 @@ func (up *UsedProviders) UnwantedAddresses() []string { func (up *UsedProviders) createOrUseUniqueUsedProvidersForKey(key RouterKey) *UniqueUsedProviders { uniqueUsedProviders, ok := up.uniqueUsedProviders[key] if !ok { - up.uniqueUsedProviders[key] = &UniqueUsedProviders{ + uniqueUsedProviders = &UniqueUsedProviders{ providers: map[string]struct{}{}, unwantedProviders: up.originalUnwantedProviders, blockOnSyncLoss: map[string]struct{}{}, erroredProviders: map[string]struct{}{}, } + up.uniqueUsedProviders[key] = uniqueUsedProviders } return uniqueUsedProviders } From f62f4881dd2a36a0723844e273819fe149f3994a Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 3 Oct 2024 13:25:41 +0200 Subject: [PATCH 03/20] feat: PRT-Block-Hash-Cache-after-redesign-part-2 --- .../consumer_relay_state_machine.go | 55 +++++++++++++------ protocol/rpcconsumer/relay_processor.go | 3 +- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index bf8c0fc4ab..5f56a9de8f 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -7,6 +7,7 @@ import ( "github.com/lavanet/lava/v3/protocol/chainlib" common "github.com/lavanet/lava/v3/protocol/common" + "github.com/lavanet/lava/v3/protocol/lavaprotocol" lavasession "github.com/lavanet/lava/v3/protocol/lavasession" "github.com/lavanet/lava/v3/protocol/metrics" "github.com/lavanet/lava/v3/utils" @@ -19,7 +20,13 @@ type RelayStateMachine interface { UpdateBatch(err error) GetSelection() Selection GetUsedProviders() *lavasession.UsedProviders - SetRelayProcessor(relayProcessor *RelayProcessor) + SetResultsChecker(resultsChecker ResultsCheckerInf) + SetRelayRetriesManager(relayRetriesManager *lavaprotocol.RelayRetriesManager) +} + +type ResultsCheckerInf interface { + WaitForResults(ctx context.Context) error + HasRequiredNodeResults() (bool, int) } type ConsumerRelaySender interface { @@ -32,16 +39,17 @@ type tickerMetricSetterInf interface { } type ConsumerRelayStateMachine struct { - ctx context.Context // same context as user context. - relaySender ConsumerRelaySender - parentRelayProcessor *RelayProcessor - protocolMessage chainlib.ProtocolMessage // only one should make changes to protocol message is ConsumerRelayStateMachine. - analytics *metrics.RelayMetrics // first relay metrics - selection Selection - debugRelays bool - tickerMetricSetter tickerMetricSetterInf - batchUpdate chan error - usedProviders *lavasession.UsedProviders + ctx context.Context // same context as user context. + relaySender ConsumerRelaySender + resultsChecker ResultsCheckerInf + protocolMessage chainlib.ProtocolMessage // only one should make changes to protocol message is ConsumerRelayStateMachine. 
+ analytics *metrics.RelayMetrics // first relay metrics + selection Selection + debugRelays bool + tickerMetricSetter tickerMetricSetterInf + batchUpdate chan error + usedProviders *lavasession.UsedProviders + relayRetriesManager *lavaprotocol.RelayRetriesManager } func NewRelayStateMachine( @@ -71,8 +79,12 @@ func NewRelayStateMachine( } } -func (crsm *ConsumerRelayStateMachine) SetRelayProcessor(relayProcessor *RelayProcessor) { - crsm.parentRelayProcessor = relayProcessor +func (crsm *ConsumerRelayStateMachine) SetRelayRetriesManager(relayRetriesManager *lavaprotocol.RelayRetriesManager) { + crsm.relayRetriesManager = relayRetriesManager +} + +func (crsm *ConsumerRelayStateMachine) SetResultsChecker(resultsChecker ResultsCheckerInf) { + crsm.resultsChecker = resultsChecker } func (crsm *ConsumerRelayStateMachine) GetUsedProviders() *lavasession.UsedProviders { @@ -87,9 +99,18 @@ func (crsm *ConsumerRelayStateMachine) shouldRetryOnResult(numberOfRetriesLaunch shouldRetry := crsm.shouldRetryInner(numberOfRetriesLaunched) if shouldRetry { // retry archive logic - if len(crsm.GetProtocolMessage().GetRequestedBlocksHashes()) > 0 && numberOfNodeErrors > 0 { - // we had node error, and we have a hash parsed. + hashes := crsm.GetProtocolMessage().GetRequestedBlocksHashes() + if len(hashes) > 0 && numberOfNodeErrors > 0 { + // iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. + // if we are familiar with all, we don't want to allow archive. + for _, hash := range hashes { + if !crsm.relayRetriesManager.CheckHashInCache(hash) { + // if we didn't find the hash in the cache we can try archive relay. + break + } + } + // we had node error, and we have a hash parsed. } } return crsm.shouldRetryInner(numberOfRetriesLaunched) @@ -143,9 +164,9 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() chan RelayStateSend readResultsFromProcessor := func() { // ProcessResults is reading responses while blocking until the conditions are met utils.LavaFormatTrace("[StateMachine] Waiting for results", utils.LogAttr("batch", crsm.usedProviders.BatchNumber())) - crsm.parentRelayProcessor.WaitForResults(processingCtx) + crsm.resultsChecker.WaitForResults(processingCtx) // Decide if we need to resend or not - metRequiredNodeResults, numberOfNodeErrors := crsm.parentRelayProcessor.HasRequiredNodeResults() + metRequiredNodeResults, numberOfNodeErrors := crsm.resultsChecker.HasRequiredNodeResults() numberOfNodeErrorsAtomic.Store(uint64(numberOfNodeErrors)) if metRequiredNodeResults { gotResults <- true diff --git a/protocol/rpcconsumer/relay_processor.go b/protocol/rpcconsumer/relay_processor.go index a4faccb014..8626c1e3f9 100644 --- a/protocol/rpcconsumer/relay_processor.go +++ b/protocol/rpcconsumer/relay_processor.go @@ -86,7 +86,8 @@ func NewRelayProcessor( selection: relayStateMachine.GetSelection(), usedProviders: relayStateMachine.GetUsedProviders(), } - relayProcessor.RelayStateMachine.SetRelayProcessor(relayProcessor) + relayProcessor.RelayStateMachine.SetResultsChecker(relayProcessor) + relayProcessor.RelayStateMachine.SetRelayRetriesManager(relayRetriesManager) return relayProcessor } From a347619bb299111bfeb66cb973dc34b803d06f18 Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 3 Oct 2024 13:27:55 +0200 Subject: [PATCH 04/20] fix lint --- protocol/lavasession/used_providers.go | 2 +- protocol/rpcconsumer/consumer_relay_state_machine.go | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git 
a/protocol/lavasession/used_providers.go b/protocol/lavasession/used_providers.go index 53f68d4328..fe4aa9931b 100644 --- a/protocol/lavasession/used_providers.go +++ b/protocol/lavasession/used_providers.go @@ -26,7 +26,7 @@ func NewUsedProviders(blockedProviders BlockedProvidersInf) *UsedProviders { } } return &UsedProviders{ - uniqueUsedProviders: map[RouterKey]*UniqueUsedProviders{NewRouterKey([]string{}): &UniqueUsedProviders{ + uniqueUsedProviders: map[RouterKey]*UniqueUsedProviders{NewRouterKey([]string{}): { providers: map[string]struct{}{}, unwantedProviders: unwantedProviders, blockOnSyncLoss: map[string]struct{}{}, diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index bf8c0fc4ab..a56c5c735d 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -85,14 +85,8 @@ func (crsm *ConsumerRelayStateMachine) GetSelection() Selection { func (crsm *ConsumerRelayStateMachine) shouldRetryOnResult(numberOfRetriesLaunched int, numberOfNodeErrors uint64) bool { shouldRetry := crsm.shouldRetryInner(numberOfRetriesLaunched) - if shouldRetry { - // retry archive logic - if len(crsm.GetProtocolMessage().GetRequestedBlocksHashes()) > 0 && numberOfNodeErrors > 0 { - // we had node error, and we have a hash parsed. - - } - } - return crsm.shouldRetryInner(numberOfRetriesLaunched) + // archive functionality will be added here. + return shouldRetry } func (crsm *ConsumerRelayStateMachine) shouldRetryInner(numberOfRetriesLaunched int) bool { From b6803e9519a3a6facd8a3f902fbe5d82979201fd Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 3 Oct 2024 17:57:25 +0200 Subject: [PATCH 05/20] WIP --- .../consumer_relay_state_machine.go | 36 ++++++++++--------- .../consumer_relay_state_machine_test.go | 5 --- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index 5f56a9de8f..c4975d5cc9 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -95,28 +95,34 @@ func (crsm *ConsumerRelayStateMachine) GetSelection() Selection { return crsm.selection } -func (crsm *ConsumerRelayStateMachine) shouldRetryOnResult(numberOfRetriesLaunched int, numberOfNodeErrors uint64) bool { - shouldRetry := crsm.shouldRetryInner(numberOfRetriesLaunched) +func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, numberOfNodeErrors uint64) bool { + shouldRetry := crsm.retryCondition(numberOfRetriesLaunched) if shouldRetry { // retry archive logic hashes := crsm.GetProtocolMessage().GetRequestedBlocksHashes() if len(hashes) > 0 && numberOfNodeErrors > 0 { - // iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. - // if we are familiar with all, we don't want to allow archive. - for _, hash := range hashes { - if !crsm.relayRetriesManager.CheckHashInCache(hash) { - // if we didn't find the hash in the cache we can try archive relay. + // launch archive only on the first retry attempt. + if numberOfRetriesLaunched == 1 { + // iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. + // if we are familiar with all, we don't want to allow archive. 
+ for _, hash := range hashes { + if !crsm.relayRetriesManager.CheckHashInCache(hash) { + // if we didn't find the hash in the cache we can try archive relay. - break + break + } } + // we had node error, and we have a hash parsed. + } else { + // return to original protocol message. + } - // we had node error, and we have a hash parsed. } } - return crsm.shouldRetryInner(numberOfRetriesLaunched) + return shouldRetry } -func (crsm *ConsumerRelayStateMachine) shouldRetryInner(numberOfRetriesLaunched int) bool { +func (crsm *ConsumerRelayStateMachine) retryCondition(numberOfRetriesLaunched int) bool { if numberOfRetriesLaunched >= MaximumNumberOfTickerRelayRetries { return false } @@ -124,10 +130,6 @@ func (crsm *ConsumerRelayStateMachine) shouldRetryInner(numberOfRetriesLaunched return crsm.selection != BestResult } -func (crsm *ConsumerRelayStateMachine) shouldRetryTicker(numberOfRetriesLaunched int) bool { - return crsm.shouldRetryInner(numberOfRetriesLaunched) -} - func (crsm *ConsumerRelayStateMachine) GetDebugState() bool { return crsm.debugRelays } @@ -232,7 +234,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() chan RelayStateSend return } // If should retry == true, send a new batch. (success == false) - if crsm.shouldRetryOnResult(crsm.usedProviders.BatchNumber(), numberOfNodeErrorsAtomic.Load()) { + if crsm.shouldRetry(crsm.usedProviders.BatchNumber(), numberOfNodeErrorsAtomic.Load()) { utils.LavaFormatTrace("[StateMachine] success := <-gotResults - crsm.ShouldRetry(batchNumber)", utils.LogAttr("batch", crsm.usedProviders.BatchNumber())) relayTaskChannel <- RelayStateSendInstructions{protocolMessage: crsm.GetProtocolMessage()} } else { @@ -241,7 +243,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() chan RelayStateSend go readResultsFromProcessor() case <-startNewBatchTicker.C: // Only trigger another batch for non BestResult relays or if we didn't pass the retry limit. 
- if crsm.shouldRetryTicker(crsm.usedProviders.BatchNumber()) { + if crsm.shouldRetry(crsm.usedProviders.BatchNumber(), numberOfNodeErrorsAtomic.Load()) { utils.LavaFormatTrace("[StateMachine] ticker triggered", utils.LogAttr("batch", crsm.usedProviders.BatchNumber())) relayTaskChannel <- RelayStateSendInstructions{protocolMessage: crsm.GetProtocolMessage()} // Add ticker launch metrics diff --git a/protocol/rpcconsumer/consumer_relay_state_machine_test.go b/protocol/rpcconsumer/consumer_relay_state_machine_test.go index 9cbc6d346a..e27cfda967 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine_test.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine_test.go @@ -10,7 +10,6 @@ import ( "github.com/lavanet/lava/v3/protocol/chainlib" "github.com/lavanet/lava/v3/protocol/chainlib/extensionslib" lavasession "github.com/lavanet/lava/v3/protocol/lavasession" - "github.com/lavanet/lava/v3/protocol/metrics" spectypes "github.com/lavanet/lava/v3/x/spec/types" "github.com/stretchr/testify/require" ) @@ -20,10 +19,6 @@ type ConsumerRelaySenderMock struct { tickerValue time.Duration } -func (crsm *ConsumerRelaySenderMock) sendRelayToProvider(ctx context.Context, protocolMessage chainlib.ProtocolMessage, relayProcessor *RelayProcessor, analytics *metrics.RelayMetrics) (errRet error) { - return crsm.retValue -} - func (crsm *ConsumerRelaySenderMock) getProcessingTimeout(chainMessage chainlib.ChainMessage) (processingTimeout time.Duration, relayTimeout time.Duration) { if crsm.tickerValue != 0 { return time.Second * 50000, crsm.tickerValue From 63cd0b1255473a1285f7f42a68068196246e89c6 Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 3 Oct 2024 19:00:05 +0200 Subject: [PATCH 06/20] Part 2 complete? --- .../extensionslib/extension_parser.go | 4 +- .../consumer_relay_state_machine.go | 92 +++++++++++++------ 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/protocol/chainlib/extensionslib/extension_parser.go b/protocol/chainlib/extensionslib/extension_parser.go index 47fc08922f..322f49aba5 100644 --- a/protocol/chainlib/extensionslib/extension_parser.go +++ b/protocol/chainlib/extensionslib/extension_parser.go @@ -4,6 +4,8 @@ import ( spectypes "github.com/lavanet/lava/v3/x/spec/types" ) +const ArchiveExtension = "archive" + type ExtensionInfo struct { ExtensionOverride []string LatestBlock uint64 @@ -77,7 +79,7 @@ func (ep *ExtensionParser) ExtensionParsing(addon string, extensionsChainMessage func NewExtensionParserRule(extension *spectypes.Extension) ExtensionParserRule { switch extension.Name { - case "archive": + case ArchiveExtension: return ArchiveParserRule{extension: extension} default: // unsupported rule diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index c4975d5cc9..eb635d94f4 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -5,7 +5,10 @@ import ( "sync/atomic" "time" + slices "github.com/lavanet/lava/v3/utils/lavaslices" + "github.com/lavanet/lava/v3/protocol/chainlib" + "github.com/lavanet/lava/v3/protocol/chainlib/extensionslib" common "github.com/lavanet/lava/v3/protocol/common" "github.com/lavanet/lava/v3/protocol/lavaprotocol" lavasession "github.com/lavanet/lava/v3/protocol/lavasession" @@ -39,17 +42,19 @@ type tickerMetricSetterInf interface { } type ConsumerRelayStateMachine struct { - ctx context.Context // same context as user context. 
- relaySender ConsumerRelaySender - resultsChecker ResultsCheckerInf - protocolMessage chainlib.ProtocolMessage // only one should make changes to protocol message is ConsumerRelayStateMachine. - analytics *metrics.RelayMetrics // first relay metrics - selection Selection - debugRelays bool - tickerMetricSetter tickerMetricSetterInf - batchUpdate chan error - usedProviders *lavasession.UsedProviders - relayRetriesManager *lavaprotocol.RelayRetriesManager + ctx context.Context // same context as user context. + relaySender ConsumerRelaySender + resultsChecker ResultsCheckerInf + protocolMessage chainlib.ProtocolMessage // only one should make changes to protocol message is ConsumerRelayStateMachine. + originalProtocolMessage chainlib.ProtocolMessage + appliedArchiveExtension bool + analytics *metrics.RelayMetrics // first relay metrics + selection Selection + debugRelays bool + tickerMetricSetter tickerMetricSetterInf + batchUpdate chan error + usedProviders *lavasession.UsedProviders + relayRetriesManager *lavaprotocol.RelayRetriesManager } func NewRelayStateMachine( @@ -67,15 +72,16 @@ func NewRelayStateMachine( } return &ConsumerRelayStateMachine{ - ctx: ctx, - usedProviders: usedProviders, - relaySender: relaySender, - protocolMessage: protocolMessage, - analytics: analytics, - selection: selection, - debugRelays: debugRelays, - tickerMetricSetter: tickerMetricSetter, - batchUpdate: make(chan error, MaximumNumberOfTickerRelayRetries), + ctx: ctx, + usedProviders: usedProviders, + relaySender: relaySender, + protocolMessage: protocolMessage, + originalProtocolMessage: protocolMessage, + analytics: analytics, + selection: selection, + debugRelays: debugRelays, + tickerMetricSetter: tickerMetricSetter, + batchUpdate: make(chan error, MaximumNumberOfTickerRelayRetries), } } @@ -95,27 +101,55 @@ func (crsm *ConsumerRelayStateMachine) GetSelection() Selection { return crsm.selection } +// Should retry implements the logic for when to send another relay. +// As well as the decision of changing the protocol message, +// into different extensions or addons based on certain conditions func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, numberOfNodeErrors uint64) bool { shouldRetry := crsm.retryCondition(numberOfRetriesLaunched) if shouldRetry { - // retry archive logic + // Retry archive logic hashes := crsm.GetProtocolMessage().GetRequestedBlocksHashes() if len(hashes) > 0 && numberOfNodeErrors > 0 { - // launch archive only on the first retry attempt. + // Launch archive only on the first retry attempt. if numberOfRetriesLaunched == 1 { - // iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. - // if we are familiar with all, we don't want to allow archive. + // Iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. + // If we are familiar with all, we don't want to allow archive. for _, hash := range hashes { if !crsm.relayRetriesManager.CheckHashInCache(hash) { - // if we didn't find the hash in the cache we can try archive relay. - + // If we didn't find the hash in the cache we can try archive relay. + privateData := crsm.protocolMessage.RelayPrivateData() + // Create a new array of extensions. validate it doesn't already have archive in it. + // If it does just break. if it doesn't add it + extensions := append([]string{}, privateData.Extensions...) + if slices.Contains(extensions, extensionslib.ArchiveExtension) { + break // Do nothing its already archive. 
+ } + extensions = append(extensions, extensionslib.ArchiveExtension) + // We need to set archive. + // Create a new relay private data containing the extension. + relayRequestData := lavaprotocol.NewRelayData(crsm.ctx, privateData.ConnectionType, privateData.ApiUrl, privateData.Data, privateData.SeenBlock, privateData.RequestBlock, privateData.ApiInterface, privateData.Metadata, privateData.Addon, extensions) + userData := crsm.protocolMessage.GetUserData() + // Creating an archive protocol message, and set it to current portocol message + crsm.protocolMessage = chainlib.NewProtocolMessage(crsm.protocolMessage, crsm.protocolMessage.GetDirectiveHeaders(), relayRequestData, userData.DappId, userData.ConsumerIp) + // for future batches. + crsm.appliedArchiveExtension = true break } } - // we had node error, and we have a hash parsed. + // We had node error, and we have a hash parsed. } else { - // return to original protocol message. - + // Validate the following. + // 1. That we have applied archive + // 2. That we had more than one node error (meaning the 2nd was a successful archive [node error] 100%) + if crsm.appliedArchiveExtension && numberOfNodeErrors >= 2 { + // We know we have applied archive and failed. + // 1. We can remove the archive, return to the original protocol message, + // 2. Set all hashes as irrelevant for future queries. + crsm.protocolMessage = crsm.originalProtocolMessage + for _, hash := range hashes { + crsm.relayRetriesManager.AddHashToCache(hash) + } + } } } } From f18b9fc936b181e5a5163dfa1605b75b9ea924ec Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 3 Oct 2024 19:46:46 +0200 Subject: [PATCH 07/20] fix lint --- .../consumer_relay_state_machine.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index eb635d94f4..a54ca72733 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -137,19 +137,19 @@ func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, } } // We had node error, and we have a hash parsed. - } else { + } else if crsm.appliedArchiveExtension && numberOfNodeErrors >= 2 { // Validate the following. // 1. That we have applied archive // 2. That we had more than one node error (meaning the 2nd was a successful archive [node error] 100%) - if crsm.appliedArchiveExtension && numberOfNodeErrors >= 2 { - // We know we have applied archive and failed. - // 1. We can remove the archive, return to the original protocol message, - // 2. Set all hashes as irrelevant for future queries. - crsm.protocolMessage = crsm.originalProtocolMessage - for _, hash := range hashes { - crsm.relayRetriesManager.AddHashToCache(hash) - } + // Now - + // We know we have applied archive and failed. + // 1. We can remove the archive, return to the original protocol message, + // 2. Set all hashes as irrelevant for future queries. + crsm.protocolMessage = crsm.originalProtocolMessage + for _, hash := range hashes { + crsm.relayRetriesManager.AddHashToCache(hash) } + crsm.appliedArchiveExtension = false // so we don't get here again } } } From 1ff62454b541ccdd3d27ded8ab8a649d354de411 Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Sun, 6 Oct 2024 19:48:52 +0200 Subject: [PATCH 08/20] apply archive only on 2nd relay. 
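The archive retry gating that PATCH 05 through PATCH 08 iterate on can be read as a single decision: escalate to the archive extension only when the relay returned node errors, the request referenced block hashes, and at least one of those hashes is not already known to fail even on archive. Below is a minimal, compilable Go sketch of that decision; hashCache and decideArchive are illustrative names, not the repository's API (in-tree this logic lives in ConsumerRelayStateMachine.shouldRetry backed by the lavaprotocol RelayRetriesManager cache).

package sketch

type hashCache struct{ failed map[string]struct{} }

func (c *hashCache) known(hash string) bool { _, ok := c.failed[hash]; return ok }
func (c *hashCache) add(hash string)        { c.failed[hash] = struct{}{} }

// decideArchive reports whether the next retry should be upgraded to the archive
// extension, and records exhausted hashes once an archive attempt has also failed.
func decideArchive(cache *hashCache, hashes []string, nodeErrors uint64, retriesLaunched int, archiveApplied bool) bool {
	if len(hashes) == 0 || nodeErrors == 0 {
		return false
	}
	if archiveApplied {
		// Archive was already tried and still produced node errors: remember the
		// hashes so future relays for the same blocks skip the escalation.
		for _, h := range hashes {
			cache.add(h)
		}
		return false
	}
	if retriesLaunched != 1 {
		// Escalate on exactly one attempt; PATCH 08 gates on the second retry,
		// PATCH 09 returns to the first, which is what this sketch uses.
		return false
	}
	for _, h := range hashes {
		if !cache.known(h) {
			return true // at least one hash is not cached as hopeless, archive may help
		}
	}
	return false
}

Note that the cache is only fed with hashes that failed even under archive, so a block that simply was not served by the first provider does not get blacklisted.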
--- protocol/rpcconsumer/consumer_relay_state_machine.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index a54ca72733..e78be5e7c0 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -109,9 +109,9 @@ func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, if shouldRetry { // Retry archive logic hashes := crsm.GetProtocolMessage().GetRequestedBlocksHashes() - if len(hashes) > 0 && numberOfNodeErrors > 0 { - // Launch archive only on the first retry attempt. - if numberOfRetriesLaunched == 1 { + if len(hashes) > 0 && numberOfNodeErrors > 1 { + // Launch archive only on the second retry attempt. + if numberOfRetriesLaunched == 2 { // Iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. // If we are familiar with all, we don't want to allow archive. for _, hash := range hashes { @@ -131,6 +131,8 @@ func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, userData := crsm.protocolMessage.GetUserData() // Creating an archive protocol message, and set it to current portocol message crsm.protocolMessage = chainlib.NewProtocolMessage(crsm.protocolMessage, crsm.protocolMessage.GetDirectiveHeaders(), relayRequestData, userData.DappId, userData.ConsumerIp) + // TODO: need to set cu for the extension. + // for future batches. crsm.appliedArchiveExtension = true break From f0864a50b2c9c64f1e5614f47c739702c9a9cc6d Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Mon, 7 Oct 2024 11:24:46 +0200 Subject: [PATCH 09/20] using parse msg to create a new protocol message with archive. --- protocol/chainlib/chainlib.go | 1 - protocol/chainlib/chainlib_mock.go | 4 +- .../chainlib/consumer_websocket_manager.go | 2 +- .../consumer_ws_subscription_manager.go | 2 +- .../consumer_relay_state_machine.go | 37 ++++++++++++------- .../consumer_relay_state_machine_test.go | 13 +++++++ protocol/rpcconsumer/rpcconsumer_server.go | 3 +- 7 files changed, 42 insertions(+), 20 deletions(-) diff --git a/protocol/chainlib/chainlib.go b/protocol/chainlib/chainlib.go index 5fe487a317..d64a95a50c 100644 --- a/protocol/chainlib/chainlib.go +++ b/protocol/chainlib/chainlib.go @@ -124,7 +124,6 @@ type RelaySender interface { connectionType string, dappID string, consumerIp string, - analytics *metrics.RelayMetrics, metadata []pairingtypes.Metadata, ) (ProtocolMessage, error) SendParsedRelay( diff --git a/protocol/chainlib/chainlib_mock.go b/protocol/chainlib/chainlib_mock.go index 6ee05a7f5c..bfcb947127 100644 --- a/protocol/chainlib/chainlib_mock.go +++ b/protocol/chainlib/chainlib_mock.go @@ -714,9 +714,9 @@ func (mr *MockRelaySenderMockRecorder) CreateDappKey(userData interface{}) *gomo } // ParseRelay mocks base method. 
-func (m *MockRelaySender) ParseRelay(ctx context.Context, url, req, connectionType, dappID, consumerIp string, analytics *metrics.RelayMetrics, metadata []types.Metadata) (ProtocolMessage, error) { +func (m *MockRelaySender) ParseRelay(ctx context.Context, url, req, connectionType, dappID, consumerIp string, metadata []types.Metadata) (ProtocolMessage, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "ParseRelay", ctx, url, req, connectionType, dappID, consumerIp, analytics, metadata) + ret := m.ctrl.Call(m, "ParseRelay", ctx, url, req, connectionType, dappID, consumerIp, metadata) ret0, _ := ret[0].(ProtocolMessage) ret1, _ := ret[1].(error) return ret0, ret1 diff --git a/protocol/chainlib/consumer_websocket_manager.go b/protocol/chainlib/consumer_websocket_manager.go index 748fd7e5b3..a5ebe2fcbf 100644 --- a/protocol/chainlib/consumer_websocket_manager.go +++ b/protocol/chainlib/consumer_websocket_manager.go @@ -214,7 +214,7 @@ func (cwm *ConsumerWebsocketManager) ListenToMessages() { metricsData := metrics.NewRelayAnalytics(dappID, cwm.chainId, cwm.apiInterface) - protocolMessage, err := cwm.relaySender.ParseRelay(webSocketCtx, "", string(msg), cwm.connectionType, dappID, userIp, metricsData, nil) + protocolMessage, err := cwm.relaySender.ParseRelay(webSocketCtx, "", string(msg), cwm.connectionType, dappID, userIp, nil) if err != nil { utils.LavaFormatDebug("ws manager could not parse message", utils.LogAttr("message", msg), utils.LogAttr("err", err)) formatterMsg := logger.AnalyzeWebSocketErrorAndGetFormattedMessage(websocketConn.LocalAddr().String(), err, msgSeed, msg, cwm.apiInterface, time.Since(startTime)) diff --git a/protocol/chainlib/consumer_ws_subscription_manager.go b/protocol/chainlib/consumer_ws_subscription_manager.go index 102bd8240a..3ae6014a75 100644 --- a/protocol/chainlib/consumer_ws_subscription_manager.go +++ b/protocol/chainlib/consumer_ws_subscription_manager.go @@ -697,7 +697,7 @@ func (cwsm *ConsumerWSSubscriptionManager) craftUnsubscribeMessage(hashedParams, // Craft the unsubscribe chain message ctx := context.Background() - protocolMessage, err := cwsm.relaySender.ParseRelay(ctx, "", unsubscribeRequestData, cwsm.connectionType, dappID, consumerIp, metricsData, nil) + protocolMessage, err := cwsm.relaySender.ParseRelay(ctx, "", unsubscribeRequestData, cwsm.connectionType, dappID, consumerIp, nil) if err != nil { return nil, utils.LavaFormatError("could not craft unsubscribe chain message", err, utils.LogAttr("hashedParams", utils.ToHexString(hashedParams)), diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index e78be5e7c0..88032698d5 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -6,6 +6,7 @@ import ( "time" slices "github.com/lavanet/lava/v3/utils/lavaslices" + pairingtypes "github.com/lavanet/lava/v3/x/pairing/types" "github.com/lavanet/lava/v3/protocol/chainlib" "github.com/lavanet/lava/v3/protocol/chainlib/extensionslib" @@ -35,6 +36,15 @@ type ResultsCheckerInf interface { type ConsumerRelaySender interface { getProcessingTimeout(chainMessage chainlib.ChainMessage) (processingTimeout time.Duration, relayTimeout time.Duration) GetChainIdAndApiInterface() (string, string) + ParseRelay( + ctx context.Context, + url string, + req string, + connectionType string, + dappID string, + consumerIp string, + metadata []pairingtypes.Metadata, + ) (protocolMessage chainlib.ProtocolMessage, err error) } type 
tickerMetricSetterInf interface { @@ -109,30 +119,29 @@ func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, if shouldRetry { // Retry archive logic hashes := crsm.GetProtocolMessage().GetRequestedBlocksHashes() - if len(hashes) > 0 && numberOfNodeErrors > 1 { + if len(hashes) > 0 && numberOfNodeErrors > 0 { // Launch archive only on the second retry attempt. - if numberOfRetriesLaunched == 2 { + if numberOfRetriesLaunched == 1 { // Iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. // If we are familiar with all, we don't want to allow archive. for _, hash := range hashes { if !crsm.relayRetriesManager.CheckHashInCache(hash) { // If we didn't find the hash in the cache we can try archive relay. - privateData := crsm.protocolMessage.RelayPrivateData() - // Create a new array of extensions. validate it doesn't already have archive in it. - // If it does just break. if it doesn't add it - extensions := append([]string{}, privateData.Extensions...) - if slices.Contains(extensions, extensionslib.ArchiveExtension) { + relayRequestData := crsm.protocolMessage.RelayPrivateData() + // Validate we're not already archive + if slices.Contains(relayRequestData.Extensions, extensionslib.ArchiveExtension) { break // Do nothing its already archive. } - extensions = append(extensions, extensionslib.ArchiveExtension) // We need to set archive. // Create a new relay private data containing the extension. - relayRequestData := lavaprotocol.NewRelayData(crsm.ctx, privateData.ConnectionType, privateData.ApiUrl, privateData.Data, privateData.SeenBlock, privateData.RequestBlock, privateData.ApiInterface, privateData.Metadata, privateData.Addon, extensions) userData := crsm.protocolMessage.GetUserData() - // Creating an archive protocol message, and set it to current portocol message - crsm.protocolMessage = chainlib.NewProtocolMessage(crsm.protocolMessage, crsm.protocolMessage.GetDirectiveHeaders(), relayRequestData, userData.DappId, userData.ConsumerIp) - // TODO: need to set cu for the extension. - + metaDataForArchive := []pairingtypes.Metadata{{Name: common.EXTENSION_OVERRIDE_HEADER_NAME, Value: extensionslib.ArchiveExtension}} + newProtocolMessage, err := crsm.relaySender.ParseRelay(crsm.ctx, relayRequestData.ApiUrl, string(relayRequestData.Data), relayRequestData.ConnectionType, userData.DappId, userData.ConsumerIp, metaDataForArchive) + if err != nil { + utils.LavaFormatError("Failed converting to archive message in shouldRetry", err, utils.LogAttr("relayRequestData", relayRequestData), utils.LogAttr("metadata", metaDataForArchive)) + } + // Creating an archive protocol message, and set it to current protocol message + crsm.protocolMessage = newProtocolMessage // for future batches. crsm.appliedArchiveExtension = true break @@ -152,6 +161,8 @@ func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, crsm.relayRetriesManager.AddHashToCache(hash) } crsm.appliedArchiveExtension = false // so we don't get here again + // We do not want to send additional relays after archive attempt. return false. 
+ return false } } } diff --git a/protocol/rpcconsumer/consumer_relay_state_machine_test.go b/protocol/rpcconsumer/consumer_relay_state_machine_test.go index e27cfda967..91271c71b0 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine_test.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine_test.go @@ -10,6 +10,7 @@ import ( "github.com/lavanet/lava/v3/protocol/chainlib" "github.com/lavanet/lava/v3/protocol/chainlib/extensionslib" lavasession "github.com/lavanet/lava/v3/protocol/lavasession" + pairingtypes "github.com/lavanet/lava/v3/x/pairing/types" spectypes "github.com/lavanet/lava/v3/x/spec/types" "github.com/stretchr/testify/require" ) @@ -30,6 +31,18 @@ func (crsm *ConsumerRelaySenderMock) GetChainIdAndApiInterface() (string, string return "testUno", "testDos" } +func (crsm *ConsumerRelaySenderMock) ParseRelay( + ctx context.Context, + url string, + req string, + connectionType string, + dappID string, + consumerIp string, + metadata []pairingtypes.Metadata, +) (protocolMessage chainlib.ProtocolMessage, err error) { + return nil, nil +} + func TestConsumerStateMachineHappyFlow(t *testing.T) { t.Run("happy", func(t *testing.T) { ctx := context.Background() diff --git a/protocol/rpcconsumer/rpcconsumer_server.go b/protocol/rpcconsumer/rpcconsumer_server.go index 3422dcb5aa..3fa6e6d90f 100644 --- a/protocol/rpcconsumer/rpcconsumer_server.go +++ b/protocol/rpcconsumer/rpcconsumer_server.go @@ -326,7 +326,7 @@ func (rpccs *RPCConsumerServer) SendRelay( analytics *metrics.RelayMetrics, metadata []pairingtypes.Metadata, ) (relayResult *common.RelayResult, errRet error) { - protocolMessage, err := rpccs.ParseRelay(ctx, url, req, connectionType, dappID, consumerIp, analytics, metadata) + protocolMessage, err := rpccs.ParseRelay(ctx, url, req, connectionType, dappID, consumerIp, metadata) if err != nil { return nil, err } @@ -341,7 +341,6 @@ func (rpccs *RPCConsumerServer) ParseRelay( connectionType string, dappID string, consumerIp string, - analytics *metrics.RelayMetrics, metadata []pairingtypes.Metadata, ) (protocolMessage chainlib.ProtocolMessage, err error) { // gets the relay request data from the ChainListener From 78fd6dc9fcb205327c57807301055d940f5b3f48 Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Mon, 7 Oct 2024 11:32:34 +0200 Subject: [PATCH 10/20] lint --- protocol/rpcconsumer/consumer_relay_state_machine_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine_test.go b/protocol/rpcconsumer/consumer_relay_state_machine_test.go index 91271c71b0..f69710b6e2 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine_test.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine_test.go @@ -40,7 +40,7 @@ func (crsm *ConsumerRelaySenderMock) ParseRelay( consumerIp string, metadata []pairingtypes.Metadata, ) (protocolMessage chainlib.ProtocolMessage, err error) { - return nil, nil + return nil, fmt.Errorf("not implemented") } func TestConsumerStateMachineHappyFlow(t *testing.T) { From 2ec685518269b0f6aad93a3fd3673c467b01eaef Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Mon, 7 Oct 2024 11:35:23 +0200 Subject: [PATCH 11/20] removing sdk tests --- .github/workflows/lava.yml | 41 ++++++++++++++++++++++++++++ .github/workflows/lava_sdk_tests.yml | 38 -------------------------- 2 files changed, 41 insertions(+), 38 deletions(-) delete mode 100644 .github/workflows/lava_sdk_tests.yml diff --git a/.github/workflows/lava.yml b/.github/workflows/lava.yml index 6ee6658571..8d114b5941 100644 --- 
a/.github/workflows/lava.yml +++ b/.github/workflows/lava.yml @@ -305,6 +305,47 @@ jobs: # name: SDK E2E Logs # path: "testutil/e2e/sdkLogs/*" +# This part came from lava_sdk_tests.yml that was removed. just not to lose functionality it moved here. +# name: Lava SDK Tests + +# on: +# pull_request + +# jobs: +# main: +# runs-on: ubuntu-latest +# steps: +# - name: Checkout code +# uses: actions/checkout@v4 + +# - name: Cache dependencies +# uses: actions/cache@v4 +# with: +# path: ~/.yarn +# key: yarn-${{ hashFiles('yarn.lock') }} +# restore-keys: yarn- + +# - uses: actions/setup-go@v5 +# with: +# go-version: "1.20.5" + +# - uses: actions/setup-node@v4 +# with: +# node-version: "21.2.0" + +# - name: Init the SDK +# run: GOPATH=~/go ./scripts/init_sdk.sh -s +# working-directory: ./ecosystem/lava-sdk + +# - name: ESLint +# run: ./node_modules/.bin/eslint '**/*.ts' +# working-directory: ./ecosystem/lava-sdk + +# - name: Test +# run: ./node_modules/.bin/jest ./src --ci +# working-directory: ./ecosystem/lava-sdk + + test-payment-e2e: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/lava_sdk_tests.yml b/.github/workflows/lava_sdk_tests.yml deleted file mode 100644 index a13c83c348..0000000000 --- a/.github/workflows/lava_sdk_tests.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Lava SDK Tests - -on: - pull_request - -jobs: - main: - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Cache dependencies - uses: actions/cache@v4 - with: - path: ~/.yarn - key: yarn-${{ hashFiles('yarn.lock') }} - restore-keys: yarn- - - - uses: actions/setup-go@v5 - with: - go-version: "1.20.5" - - - uses: actions/setup-node@v4 - with: - node-version: "21.2.0" - - - name: Init the SDK - run: GOPATH=~/go ./scripts/init_sdk.sh -s - working-directory: ./ecosystem/lava-sdk - - - name: ESLint - run: ./node_modules/.bin/eslint '**/*.ts' - working-directory: ./ecosystem/lava-sdk - - - name: Test - run: ./node_modules/.bin/jest ./src --ci - working-directory: ./ecosystem/lava-sdk From 7bbda5334e3301fa865507213a9fa275dc405348 Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Tue, 8 Oct 2024 13:00:02 +0200 Subject: [PATCH 12/20] fix test --- protocol/chainlib/chainlib_mock.go | 4 ++-- .../consumer_ws_subscription_manager_test.go | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/protocol/chainlib/chainlib_mock.go b/protocol/chainlib/chainlib_mock.go index bfcb947127..ab4a026e01 100644 --- a/protocol/chainlib/chainlib_mock.go +++ b/protocol/chainlib/chainlib_mock.go @@ -723,9 +723,9 @@ func (m *MockRelaySender) ParseRelay(ctx context.Context, url, req, connectionTy } // ParseRelay indicates an expected call of ParseRelay. -func (mr *MockRelaySenderMockRecorder) ParseRelay(ctx, url, req, connectionType, dappID, consumerIp, analytics, metadata interface{}) *gomock.Call { +func (mr *MockRelaySenderMockRecorder) ParseRelay(ctx, url, req, connectionType, dappID, consumerIp, metadata interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ParseRelay", reflect.TypeOf((*MockRelaySender)(nil).ParseRelay), ctx, url, req, connectionType, dappID, consumerIp, analytics, metadata) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ParseRelay", reflect.TypeOf((*MockRelaySender)(nil).ParseRelay), ctx, url, req, connectionType, dappID, consumerIp, metadata) } // SendParsedRelay mocks base method. 
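The ParseRelay signature change above (dropping the analytics argument) is what lets the relay state machine call it directly: PATCH 09 rebuilds the protocol message for the archive attempt by re-parsing the original request with an extension override header, which appears to sidestep the earlier TODO about setting CU for the extension by hand. A compilable sketch with simplified stand-in types; the real call uses pairingtypes.Metadata with common.EXTENSION_OVERRIDE_HEADER_NAME and extensionslib.ArchiveExtension, whose concrete header value is not shown here.

package sketch

import "context"

type metadata struct{ Name, Value string }

type protocolMessage struct {
	ApiUrl, Data, ConnectionType string
}

type relayParser interface {
	ParseRelay(ctx context.Context, url, req, connectionType, dappID, consumerIP string, md []metadata) (*protocolMessage, error)
}

// upgradeToArchive re-parses the original request data with an extension-override
// header, so the resulting message carries the archive extension along with
// whatever CU cost and routing the parser derives for it.
func upgradeToArchive(ctx context.Context, parser relayParser, orig *protocolMessage, dappID, consumerIP string) (*protocolMessage, error) {
	const overrideHeader = "extension-override" // placeholder for common.EXTENSION_OVERRIDE_HEADER_NAME
	md := []metadata{{Name: overrideHeader, Value: "archive"}}
	return parser.ParseRelay(ctx, orig.ApiUrl, orig.Data, orig.ConnectionType, dappID, consumerIP, md)
}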
diff --git a/protocol/chainlib/consumer_ws_subscription_manager_test.go b/protocol/chainlib/consumer_ws_subscription_manager_test.go index 48573a3512..0dae186406 100644 --- a/protocol/chainlib/consumer_ws_subscription_manager_test.go +++ b/protocol/chainlib/consumer_ws_subscription_manager_test.go @@ -88,7 +88,7 @@ func TestConsumerWSSubscriptionManagerParallelSubscriptionsOnSameDappIdIp(t *tes relaySender. EXPECT(). - ParseRelay(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + ParseRelay(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). Return(protocolMessage1, nil). AnyTimes() @@ -244,7 +244,7 @@ func TestConsumerWSSubscriptionManagerParallelSubscriptions(t *testing.T) { relaySender. EXPECT(). - ParseRelay(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + ParseRelay(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). Return(protocolMessage1, nil). AnyTimes() @@ -484,7 +484,7 @@ func TestConsumerWSSubscriptionManager(t *testing.T) { require.True(t, ok) areEqual := reqData == string(play.unsubscribeMessage1) return areEqual - }), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + }), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). Return(unsubscribeProtocolMessage1, nil). AnyTimes() @@ -495,7 +495,7 @@ func TestConsumerWSSubscriptionManager(t *testing.T) { require.True(t, ok) areEqual := reqData == string(play.subscriptionRequestData1) return areEqual - }), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + }), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). Return(subscribeProtocolMessage1, nil). AnyTimes() @@ -600,7 +600,7 @@ func TestConsumerWSSubscriptionManager(t *testing.T) { require.True(t, ok) areEqual := reqData == string(play.unsubscribeMessage2) return areEqual - }), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + }), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). Return(unsubscribeProtocolMessage2, nil). AnyTimes() @@ -611,7 +611,7 @@ func TestConsumerWSSubscriptionManager(t *testing.T) { require.True(t, ok) areEqual := reqData == string(play.subscriptionRequestData2) return areEqual - }), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + }), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). Return(subscribeProtocolMessage2, nil). 
AnyTimes() From 6182a6626ce0e4d95ee2719b2ce3b57ad4211404 Mon Sep 17 00:00:00 2001 From: omerlavanet Date: Sun, 3 Nov 2024 16:06:47 +0200 Subject: [PATCH 13/20] remove extensions from all flows and save RouterKey in singleConsumerSession --- .../lavasession/consumer_session_manager.go | 28 ++++++++-------- .../consumer_session_manager_test.go | 28 ++++++++-------- protocol/lavasession/consumer_types.go | 10 +++--- .../end_to_end_lavasession_test.go | 4 +-- .../lavasession/single_consumer_session.go | 14 ++++---- protocol/lavasession/used_providers.go | 26 +++++++-------- protocol/lavasession/used_providers_test.go | 27 ++++++++-------- .../consumer_relay_state_machine_test.go | 8 ++--- protocol/rpcconsumer/relay_processor_test.go | 32 +++++++++---------- protocol/rpcconsumer/rpcconsumer_server.go | 17 +++++----- 10 files changed, 99 insertions(+), 95 deletions(-) diff --git a/protocol/lavasession/consumer_session_manager.go b/protocol/lavasession/consumer_session_manager.go index bac3bc324e..90f75ae374 100644 --- a/protocol/lavasession/consumer_session_manager.go +++ b/protocol/lavasession/consumer_session_manager.go @@ -431,8 +431,9 @@ func (csm *ConsumerSessionManager) GetSessions(ctx context.Context, cuNeededForS } return nil, utils.LavaFormatError("failed getting sessions from used Providers", nil, utils.LogAttr("usedProviders", usedProviders), utils.LogAttr("endpoint", csm.rpcEndpoint)) } - defer func() { usedProviders.AddUsed(consumerSessionMap, extensions, errRet) }() - initUnwantedProviders := usedProviders.GetUnwantedProvidersToSend(extensions) + defer func() { usedProviders.AddUsed(consumerSessionMap, errRet) }() + routerKey := NewRouterKeyFromExtensions(extensions) + initUnwantedProviders := usedProviders.GetUnwantedProvidersToSend(routerKey) extensionNames := common.GetExtensionNames(extensions) // if pairing list is empty we reset the state. @@ -528,7 +529,7 @@ func (csm *ConsumerSessionManager) GetSessions(ctx context.Context, cuNeededForS if MaxComputeUnitsExceededError.Is(err) { tempIgnoredProviders.providers[providerAddress] = struct{}{} // We must unlock the consumer session before continuing. - consumerSession.Free(nil, extensions) + consumerSession.Free(nil) continue } else { utils.LavaFormatFatal("Unsupported Error", err) @@ -566,7 +567,7 @@ func (csm *ConsumerSessionManager) GetSessions(ctx context.Context, cuNeededForS // we don't want to update the reputation by it, so we null the rawQosReport rawQosReport = nil } - consumerSession.SetUsageForSession(cuNeededForSession, qosReport, rawQosReport, usedProviders) + consumerSession.SetUsageForSession(cuNeededForSession, qosReport, rawQosReport, usedProviders, routerKey) // We successfully added provider, we should ignore it if we need to fetch new tempIgnoredProviders.providers[providerAddress] = struct{}{} if len(sessions) == wantedSession { @@ -686,6 +687,7 @@ func (csm *ConsumerSessionManager) tryGetConsumerSessionWithProviderFromBlockedP // if we got here we validated the epoch is still the same epoch as we expected and we need to fetch a session from the blocked provider list. defer csm.lock.RUnlock() + routerKey := NewRouterKey(extensions) // csm.currentlyBlockedProviderAddresses is sorted by the provider with the highest cu used this epoch to the lowest // meaning if we fetch the first successful index this is probably the highest success ratio to get a response. 
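PATCH 13 stops threading []*spectypes.Extension through the session lifecycle and instead derives a RouterKey once (NewRouterKeyFromExtensions), bucketing the used, unwanted and errored provider maps by that key; saving the key on the single consumer session is what lets Free, VerifyLock and OnSessionFailure drop their extensions parameters in the hunks that follow. A compilable sketch of one plausible key derivation, assuming the key is just the sorted extension names joined together (the actual implementation in lavasession/router_key.go may differ):

package sketch

import (
	"sort"
	"strings"
)

type routerKey string

// routerKeyFromExtensionNames builds a stable key from an extension set: names
// are copied and sorted so ordering does not matter, while {"archive"} and the
// empty set still map to different buckets.
func routerKeyFromExtensionNames(extensionNames []string) routerKey {
	names := append([]string(nil), extensionNames...)
	sort.Strings(names)
	return routerKey("|" + strings.Join(names, "|") + "|")
}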
for _, providerAddress := range csm.currentlyBlockedProviderAddresses { @@ -696,7 +698,7 @@ func (csm *ConsumerSessionManager) tryGetConsumerSessionWithProviderFromBlockedP consumerSessionsWithProvider := csm.pairing[providerAddress] // Add to ignored (no matter what) ignoredProviders.providers[providerAddress] = struct{}{} - usedProviders.AddUnwantedAddresses(providerAddress, extensions) // add the address to our unwanted providers to avoid infinite recursion + usedProviders.AddUnwantedAddresses(providerAddress, routerKey) // add the address to our unwanted providers to avoid infinite recursion // validate this provider has enough cu to be used if err := consumerSessionsWithProvider.validateComputeUnits(cuNeededForSession, virtualEpoch); err != nil { @@ -893,9 +895,9 @@ func (csm *ConsumerSessionManager) blockProvider(address string, reportProvider } // Report session failure, mark it as blocked from future usages, report if timeout happened. -func (csm *ConsumerSessionManager) OnSessionFailure(consumerSession *SingleConsumerSession, errorReceived error, extensions []*spectypes.Extension) error { +func (csm *ConsumerSessionManager) OnSessionFailure(consumerSession *SingleConsumerSession, errorReceived error) error { // consumerSession must be locked when getting here. - if err := consumerSession.VerifyLock(extensions); err != nil { + if err := consumerSession.VerifyLock(); err != nil { return sdkerrors.Wrapf(err, "OnSessionFailure, consumerSession.lock must be locked before accessing this method, additional info:") } // redemptionSession = true, if we got this provider from the blocked provider list. @@ -964,7 +966,7 @@ func (csm *ConsumerSessionManager) OnSessionFailure(consumerSession *SingleConsu parentConsumerSessionsWithProvider := consumerSession.Parent // must read this pointer before unlocking csm.updateMetricsManager(consumerSession, time.Duration(0), false) // finished with consumerSession here can unlock. - consumerSession.Free(errorReceived, extensions) // we unlock before we change anything in the parent ConsumerSessionsWithProvider + consumerSession.Free(errorReceived) // we unlock before we change anything in the parent ConsumerSessionsWithProvider err := parentConsumerSessionsWithProvider.decreaseUsedComputeUnits(cuToDecrease) // change the cu in parent if err != nil { @@ -1021,7 +1023,7 @@ func (csm *ConsumerSessionManager) OnSessionDone( extensions []*spectypes.Extension, ) error { // release locks, update CU, relaynum etc.. - if err := consumerSession.VerifyLock(extensions); err != nil { + if err := consumerSession.VerifyLock(); err != nil { return sdkerrors.Wrapf(err, "OnSessionDone, consumerSession.lock must be locked before accessing this method") } @@ -1035,7 +1037,7 @@ func (csm *ConsumerSessionManager) OnSessionDone( defer func() { go csm.validateAndReturnBlockedProviderToValidAddressesList(providerAddress) }() } - defer consumerSession.Free(nil, extensions) // we need to be locked here, if we didn't get it locked we try lock anyway + defer consumerSession.Free(nil) // we need to be locked here, if we didn't get it locked we try lock anyway consumerSession.CuSum += consumerSession.LatestRelayCu // add CuSum to current cu usage. 
consumerSession.LatestRelayCu = 0 // reset cu just in case consumerSession.ConsecutiveErrors = []error{} @@ -1102,12 +1104,12 @@ func (csm *ConsumerSessionManager) GetAtomicPairingAddressesLength() uint64 { } // On a successful Subscribe relay -func (csm *ConsumerSessionManager) OnSessionDoneIncreaseCUOnly(consumerSession *SingleConsumerSession, latestServicedBlock int64, extensions []*spectypes.Extension) error { - if err := consumerSession.VerifyLock(extensions); err != nil { +func (csm *ConsumerSessionManager) OnSessionDoneIncreaseCUOnly(consumerSession *SingleConsumerSession, latestServicedBlock int64) error { + if err := consumerSession.VerifyLock(); err != nil { return sdkerrors.Wrapf(err, "OnSessionDoneIncreaseRelayAndCu consumerSession.lock must be locked before accessing this method") } - defer consumerSession.Free(nil, extensions) // we need to be locked here, if we didn't get it locked we try lock anyway + defer consumerSession.Free(nil) // we need to be locked here, if we didn't get it locked we try lock anyway consumerSession.LatestBlock = latestServicedBlock consumerSession.CuSum += consumerSession.LatestRelayCu // add CuSum to current cu usage. consumerSession.LatestRelayCu = 0 // reset cu just in case diff --git a/protocol/lavasession/consumer_session_manager_test.go b/protocol/lavasession/consumer_session_manager_test.go index 598c750407..000ea5b638 100644 --- a/protocol/lavasession/consumer_session_manager_test.go +++ b/protocol/lavasession/consumer_session_manager_test.go @@ -346,7 +346,7 @@ func TestSecondChanceRecoveryFlow(t *testing.T) { _, expectedProviderAddress := css[pairingList[0].PublicLavaAddress] require.True(t, expectedProviderAddress) for _, sessionInfo := range css { - csm.OnSessionFailure(sessionInfo.Session, fmt.Errorf("testError"), nil) + csm.OnSessionFailure(sessionInfo.Session, fmt.Errorf("testError")) } _, ok := csm.secondChanceGivenToAddresses[pairingList[0].PublicLavaAddress] if ok { @@ -399,7 +399,7 @@ func TestSecondChanceRecoveryFlow(t *testing.T) { _, expectedProviderAddress := css[pairingList[0].PublicLavaAddress] require.True(t, expectedProviderAddress) for _, sessionInfo := range css { - csm.OnSessionFailure(sessionInfo.Session, fmt.Errorf("testError"), nil) + csm.OnSessionFailure(sessionInfo.Session, fmt.Errorf("testError")) require.Equal(t, BlockedProviderSessionUnusedStatus, csm.pairing[pairingList[0].PublicLavaAddress].blockedAndUsedWithChanceForRecoveryStatus) } if _, ok := csm.reportedProviders.addedToPurgeAndReport[pairingList[0].PublicLavaAddress]; ok { @@ -430,7 +430,7 @@ func runOnSessionFailureForConsumerSessionMap(t *testing.T, css ConsumerSessions require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, cs.Session.LatestRelayCu, cuForFirstRequest) - err := csm.OnSessionFailure(cs.Session, fmt.Errorf("testError"), nil) + err := csm.OnSessionFailure(cs.Session, fmt.Errorf("testError")) require.NoError(t, err) } } @@ -509,7 +509,7 @@ func TestPairingResetWithFailures(t *testing.T) { require.NoError(t, err) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, nil, nil) + err = csm.OnSessionFailure(cs.Session, nil) require.NoError(t, err) // fail test. 
} } @@ -545,7 +545,7 @@ func TestPairingResetWithMultipleFailures(t *testing.T) { require.NoError(t, err) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, nil, nil) + err = csm.OnSessionFailure(cs.Session, nil) require.NoError(t, err) } @@ -627,7 +627,7 @@ func TestSuccessAndFailureOfSessionWithUpdatePairingsInTheMiddle(t *testing.T) { require.Equal(t, cs.LatestBlock, servicedBlockNumber) sessionListData[j] = SessTestData{cuSum: cuForFirstRequest, relayNum: 1} } else { - err = csm.OnSessionFailure(cs, nil, nil) + err = csm.OnSessionFailure(cs, nil) require.NoError(t, err) require.Equal(t, cs.CuSum, uint64(0)) require.Equal(t, cs.RelayNum, relayNumberAfterFirstFail) @@ -660,7 +660,7 @@ func TestSuccessAndFailureOfSessionWithUpdatePairingsInTheMiddle(t *testing.T) { require.Equal(t, cs.RelayNum, sessionListData[j].relayNum+1) require.Equal(t, cs.LatestBlock, servicedBlockNumber) } else { - err = csm.OnSessionFailure(cs, nil, nil) + err = csm.OnSessionFailure(cs, nil) require.NoError(t, err) require.Equal(t, sessionListData[j].cuSum, cs.CuSum) require.Equal(t, cs.RelayNum, sessionListData[j].relayNum+1) @@ -689,7 +689,7 @@ func failedSession(ctx context.Context, csm *ConsumerSessionManager, t *testing. for _, cs := range css { require.NotNil(t, cs) time.Sleep(time.Duration((rand.Intn(500) + 1)) * time.Millisecond) - err = csm.OnSessionFailure(cs.Session, fmt.Errorf("nothing special"), nil) + err = csm.OnSessionFailure(cs.Session, fmt.Errorf("nothing special")) require.NoError(t, err) ch <- p } @@ -808,7 +808,7 @@ func TestSessionFailureAndGetReportedProviders(t *testing.T) { require.NotNil(t, cs) require.Equal(t, cs.Epoch, csm.currentEpoch) require.Equal(t, cs.Session.LatestRelayCu, cuForFirstRequest) - err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError, nil) + err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError) require.NoError(t, err) require.Equal(t, cs.Session.Parent.UsedComputeUnits, cuSumOnFailure) require.Equal(t, cs.Session.CuSum, cuSumOnFailure) @@ -845,7 +845,7 @@ func TestSessionFailureEpochMisMatch(t *testing.T) { err = csm.UpdateAllProviders(secondEpochHeight, pairingList) // update the providers again. 
require.NoError(t, err) - err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError, nil) + err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError) require.NoError(t, err) } } @@ -945,7 +945,7 @@ func TestPairingWithAddons(t *testing.T) { css, err := csm.GetSessions(ctx, cuForFirstRequest, NewUsedProviders(nil), servicedBlockNumber, addon, nil, common.NO_STATE, 0) // get a session require.NoError(t, err, i) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError, nil) + err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError) require.NoError(t, err) } utils.LavaFormatDebug("length!", utils.Attribute{Key: "length", Value: len(csm.getValidAddresses(addon, nil))}, utils.Attribute{Key: "valid addresses", Value: csm.getValidAddresses(addon, nil)}) @@ -1020,7 +1020,7 @@ func TestPairingWithExtensions(t *testing.T) { css, err := csm.GetSessions(ctx, cuForFirstRequest, NewUsedProviders(nil), servicedBlockNumber, extensionOpt.addon, extensionsList, common.NO_STATE, 0) // get a session require.NoError(t, err, i) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError, nil) + err = csm.OnSessionFailure(cs.Session, ReportAndBlockProviderError) require.NoError(t, err) } utils.LavaFormatDebug("length!", utils.Attribute{Key: "length", Value: len(csm.getValidAddresses(extensionOpt.addon, extensionOpt.extensions))}, utils.Attribute{Key: "valid addresses", Value: csm.getValidAddresses(extensionOpt.addon, extensionOpt.extensions)}) @@ -1072,7 +1072,7 @@ func TestPairingWithStateful(t *testing.T) { require.NoError(t, err) } usedProviders := NewUsedProviders(nil) - usedProviders.RemoveUsed(providerAddresses[0], nil, nil) + usedProviders.RemoveUsed(providerAddresses[0], NewRouterKey(nil), nil) css, err = csm.GetSessions(ctx, cuForFirstRequest, usedProviders, servicedBlockNumber, addon, nil, common.CONSISTENCY_SELECT_ALL_PROVIDERS, 0) // get a session require.NoError(t, err) require.Equal(t, allProviders-1, len(css)) @@ -1090,7 +1090,7 @@ func TestMaximumBlockedSessionsErrorsInPairingListEmpty(t *testing.T) { css, err := csm.GetSessions(ctx, cuForFirstRequest, NewUsedProviders(nil), servicedBlockNumber, "", nil, common.NO_STATE, 0) // get a session require.NoError(t, err) for _, cs := range css { - err = csm.OnSessionFailure(cs.Session, errors.Join(BlockProviderError, SessionOutOfSyncError), nil) + err = csm.OnSessionFailure(cs.Session, errors.Join(BlockProviderError, SessionOutOfSyncError)) require.NoError(t, err) } } diff --git a/protocol/lavasession/consumer_types.go b/protocol/lavasession/consumer_types.go index 04a39dc390..84138d7dd3 100644 --- a/protocol/lavasession/consumer_types.go +++ b/protocol/lavasession/consumer_types.go @@ -13,7 +13,6 @@ import ( "github.com/lavanet/lava/v3/utils/rand" pairingtypes "github.com/lavanet/lava/v3/x/pairing/types" planstypes "github.com/lavanet/lava/v3/x/plans/types" - spectypes "github.com/lavanet/lava/v3/x/spec/types" "google.golang.org/grpc" "google.golang.org/grpc/connectivity" ) @@ -52,11 +51,11 @@ var ( ) type UsedProvidersInf interface { - RemoveUsed(providerAddress string, extensions []*spectypes.Extension, err error) + RemoveUsed(providerAddress string, routerKey RouterKey, err error) TryLockSelection(context.Context) error - AddUsed(ConsumerSessionsMap, []*spectypes.Extension, error) - GetUnwantedProvidersToSend(extensions []*spectypes.Extension) map[string]struct{} - AddUnwantedAddresses(address string, extensions []string) + 
AddUsed(ConsumerSessionsMap, error) + GetUnwantedProvidersToSend(RouterKey) map[string]struct{} + AddUnwantedAddresses(address string, routerKey RouterKey) CurrentlyUsed() int } @@ -440,6 +439,7 @@ func (cswp *ConsumerSessionsWithProvider) GetConsumerSessionInstanceFromEndpoint Parent: cswp, EndpointConnection: endpointConnection, StaticProvider: cswp.StaticProvider, + routerKey: NewRouterKey(nil), } consumerSession.TryUseSession() // we must lock the session so other requests wont get it. diff --git a/protocol/lavasession/end_to_end_lavasession_test.go b/protocol/lavasession/end_to_end_lavasession_test.go index e2e85546e3..f983c652ef 100644 --- a/protocol/lavasession/end_to_end_lavasession_test.go +++ b/protocol/lavasession/end_to_end_lavasession_test.go @@ -57,7 +57,7 @@ func TestHappyFlowE2EEmergency(t *testing.T) { skippedRelays++ - err := csm.OnSessionFailure(cs.Session, nil, nil) + err := csm.OnSessionFailure(cs.Session, nil) require.NoError(t, err) err = psm.OnSessionFailure(sps, cs.Session.RelayNum-skippedRelays) @@ -124,7 +124,7 @@ func TestHappyFlowEmergencyInConsumer(t *testing.T) { require.NoError(t, err) // Consumer Side: - err = csm.OnSessionFailure(cs.Session, nil, nil) + err = csm.OnSessionFailure(cs.Session, nil) require.NoError(t, err) require.Equal(t, cs.Session.CuSum, maxCuForVirtualEpoch) require.Equal(t, cs.Session.LatestRelayCu, latestRelayCuAfterDone) diff --git a/protocol/lavasession/single_consumer_session.go b/protocol/lavasession/single_consumer_session.go index f4e5efb1f8..3867cd3c98 100644 --- a/protocol/lavasession/single_consumer_session.go +++ b/protocol/lavasession/single_consumer_session.go @@ -9,7 +9,6 @@ import ( sdk "github.com/cosmos/cosmos-sdk/types" "github.com/lavanet/lava/v3/utils" pairingtypes "github.com/lavanet/lava/v3/x/pairing/types" - spectypes "github.com/lavanet/lava/v3/x/spec/types" ) type SingleConsumerSession struct { @@ -29,6 +28,7 @@ type SingleConsumerSession struct { usedProviders UsedProvidersInf providerUniqueId string StaticProvider bool + routerKey RouterKey } // returns the expected latency to a threshold. 
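With routerKey now a field on SingleConsumerSession, initialized to the default NewRouterKey(nil) when the session instance is created, the route is captured once at selection time and reused on release; that is what lets the hunks below pass the key into SetUsageForSession and drop the extensions parameter from Free and VerifyLock. A hedged sketch of that lifecycle; the helper name and its arguments are hypothetical, the method calls match the diff:

// hypothetical helper, for illustration only
func useSessionOnceSketch(session *SingleConsumerSession, usedProviders UsedProvidersInf, extensions []*spectypes.Extension, cuNeeded uint64, relayErr error) {
	// selection: the session remembers which route (extension set) it serves;
	// in the real flow it was already locked by TryUseSession at this point
	routerKey := NewRouterKeyFromExtensions(extensions)
	_ = session.SetUsageForSession(cuNeeded, nil, nil, usedProviders, routerKey)
	// release: Free reuses the stored routerKey when calling
	// usedProviders.RemoveUsed(parentAddress, routerKey, relayErr)
	session.Free(relayErr)
}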
@@ -104,7 +104,7 @@ func (cs *SingleConsumerSession) CalculateQoS(latency, expectedLatency time.Dura } } -func (scs *SingleConsumerSession) SetUsageForSession(cuNeededForSession uint64, qoSExcellenceReport *pairingtypes.QualityOfServiceReport, rawQoSExcellenceReport *pairingtypes.QualityOfServiceReport, usedProviders UsedProvidersInf) error { +func (scs *SingleConsumerSession) SetUsageForSession(cuNeededForSession uint64, qoSExcellenceReport *pairingtypes.QualityOfServiceReport, rawQoSExcellenceReport *pairingtypes.QualityOfServiceReport, usedProviders UsedProvidersInf, routerKey RouterKey) error { scs.LatestRelayCu = cuNeededForSession // set latestRelayCu scs.RelayNum += RelayNumberIncrement // increase relayNum if scs.RelayNum > 1 { @@ -113,14 +113,16 @@ func (scs *SingleConsumerSession) SetUsageForSession(cuNeededForSession uint64, scs.QoSInfo.LastExcellenceQoSReportRaw = rawQoSExcellenceReport } scs.usedProviders = usedProviders + scs.routerKey = routerKey return nil } -func (scs *SingleConsumerSession) Free(err error, extensions []*spectypes.Extension) { +func (scs *SingleConsumerSession) Free(err error) { if scs.usedProviders != nil { - scs.usedProviders.RemoveUsed(scs.Parent.PublicLavaAddress, extensions, err) + scs.usedProviders.RemoveUsed(scs.Parent.PublicLavaAddress, scs.routerKey, err) scs.usedProviders = nil } + scs.routerKey = NewRouterKey(nil) scs.EndpointConnection.decreaseSessionUsingConnection() scs.lock.Unlock() } @@ -144,10 +146,10 @@ func (session *SingleConsumerSession) TryUseSession() (blocked bool, ok bool) { } // Verify the consumerSession is locked when getting to this function, if its not locked throw an error -func (consumerSession *SingleConsumerSession) VerifyLock(extensions []*spectypes.Extension) error { +func (consumerSession *SingleConsumerSession) VerifyLock() error { if consumerSession.lock.TryLock() { // verify. // if we managed to lock throw an error for misuse. - defer consumerSession.Free(nil, extensions) + defer consumerSession.Free(nil) // if failed to lock we should block session as it seems like a very rare case. 
consumerSession.BlockListed = true // block this session from future usages utils.LavaFormatError("Verify Lock failed on session Failure, blocking session", nil, utils.LogAttr("consumerSession", consumerSession)) diff --git a/protocol/lavasession/used_providers.go b/protocol/lavasession/used_providers.go index fe4aa9931b..ad9708237e 100644 --- a/protocol/lavasession/used_providers.go +++ b/protocol/lavasession/used_providers.go @@ -6,7 +6,6 @@ import ( "time" "github.com/lavanet/lava/v3/utils" - spectypes "github.com/lavanet/lava/v3/x/spec/types" ) const MaximumNumberOfSelectionLockAttempts = 500 @@ -139,23 +138,21 @@ func (up *UsedProviders) createOrUseUniqueUsedProvidersForKey(key RouterKey) *Un return uniqueUsedProviders } -func (up *UsedProviders) AddUnwantedAddresses(address string, extensions []string) { +func (up *UsedProviders) AddUnwantedAddresses(address string, routerKey RouterKey) { if up == nil { utils.LavaFormatError("UsedProviders.AddUnwantedAddresses is nil, misuse detected", nil) return } - routerKey := NewRouterKey(extensions) up.lock.Lock() defer up.lock.Unlock() uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey) uniqueUsedProviders.unwantedProviders[address] = struct{}{} } -func (up *UsedProviders) RemoveUsed(provider string, extensions []*spectypes.Extension, err error) { +func (up *UsedProviders) RemoveUsed(provider string, routerKey RouterKey, err error) { if up == nil { return } - routerKey := NewRouterKeyFromExtensions(extensions) up.lock.Lock() defer up.lock.Unlock() uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey) @@ -192,18 +189,23 @@ func (up *UsedProviders) ClearUnwanted() { } } -func (up *UsedProviders) AddUsed(sessions ConsumerSessionsMap, extensions []*spectypes.Extension, err error) { +func (up *UsedProviders) AddUsed(sessions ConsumerSessionsMap, err error) { if up == nil { return } - routerKey := NewRouterKeyFromExtensions(extensions) up.lock.Lock() defer up.lock.Unlock() - uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey) // this is nil safe if len(sessions) > 0 && err == nil { up.sessionsLatestBatch = 0 - for provider := range sessions { // the key for ConsumerSessionsMap is the provider public address + for provider, sessionInfo := range sessions { // the key for ConsumerSessionsMap is the provider public address + var routerKey RouterKey + if sessionInfo.Session != nil { + routerKey = sessionInfo.Session.routerKey + } else { + routerKey = NewRouterKey(nil) + } + uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey) uniqueUsedProviders.providers[provider] = struct{}{} up.sessionsLatestBatch++ } @@ -250,22 +252,20 @@ func (up *UsedProviders) tryLockSelection() bool { return false } -func (up *UsedProviders) GetErroredProviders(extensions []*spectypes.Extension) map[string]struct{} { +func (up *UsedProviders) GetErroredProviders(routerKey RouterKey) map[string]struct{} { if up == nil { return map[string]struct{}{} } - routerKey := NewRouterKeyFromExtensions(extensions) up.lock.Lock() defer up.lock.Unlock() uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey) return uniqueUsedProviders.erroredProviders } -func (up *UsedProviders) GetUnwantedProvidersToSend(extensions []*spectypes.Extension) map[string]struct{} { +func (up *UsedProviders) GetUnwantedProvidersToSend(routerKey RouterKey) map[string]struct{} { if up == nil { return map[string]struct{}{} } - routerKey := NewRouterKeyFromExtensions(extensions) up.lock.Lock() defer up.lock.Unlock() 
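These signature changes only make sense if UsedProviders keeps separate bookkeeping per RouterKey, which is what createOrUseUniqueUsedProvidersForKey suggests. A minimal sketch of the assumed layout; the field names are taken from the hunks above, the map-per-key shape is an assumption:

// illustrative shape only
type uniqueUsedProvidersSketch struct {
	providers         map[string]struct{} // providers currently holding a session on this route
	unwantedProviders map[string]struct{} // providers we refuse to reuse on this route
	erroredProviders  map[string]struct{} // providers that returned an error on this route
}

// one bucket per RouterKey, so e.g. an archive relay does not consume the
// provider bookkeeping of the default (no-extension) route
type usedProvidersByRouteSketch map[RouterKey]*uniqueUsedProvidersSketch

AddUsed no longer receives extensions at all: it reads each session's stored routerKey, falling back to the default key when the session is nil, so a single batch can record providers under different routes.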
uniqueUsedProviders := up.createOrUseUniqueUsedProvidersForKey(routerKey) diff --git a/protocol/lavasession/used_providers_test.go b/protocol/lavasession/used_providers_test.go index aa4cb87a2e..7f0adcb5be 100644 --- a/protocol/lavasession/used_providers_test.go +++ b/protocol/lavasession/used_providers_test.go @@ -7,7 +7,6 @@ import ( "time" "github.com/gogo/status" - spectypes "github.com/lavanet/lava/v3/x/spec/types" "github.com/stretchr/testify/require" "google.golang.org/grpc/codes" ) @@ -21,35 +20,35 @@ func TestUsedProviders(t *testing.T) { require.False(t, canUseAgain) require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) - unwanted := usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) + unwanted := usedProviders.GetUnwantedProvidersToSend(NewRouterKey(nil)) require.Len(t, unwanted, 0) consumerSessionsMap := ConsumerSessionsMap{"test": &SessionInfo{}, "test2": &SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) canUseAgain = usedProviders.tryLockSelection() require.True(t, canUseAgain) - unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) + unwanted = usedProviders.GetUnwantedProvidersToSend(NewRouterKey(nil)) require.Len(t, unwanted, 2) require.Equal(t, 2, usedProviders.CurrentlyUsed()) canUseAgain = usedProviders.tryLockSelection() require.False(t, canUseAgain) consumerSessionsMap = ConsumerSessionsMap{"test3": &SessionInfo{}, "test4": &SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) - unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) + usedProviders.AddUsed(consumerSessionsMap, nil) + unwanted = usedProviders.GetUnwantedProvidersToSend(NewRouterKey(nil)) require.Len(t, unwanted, 4) require.Equal(t, 4, usedProviders.CurrentlyUsed()) // one provider gives a retry - usedProviders.RemoveUsed("test", nil, status.Error(codes.Code(SessionOutOfSyncError.ABCICode()), "")) + usedProviders.RemoveUsed("test", NewRouterKey(nil), status.Error(codes.Code(SessionOutOfSyncError.ABCICode()), "")) require.Equal(t, 3, usedProviders.CurrentlyUsed()) - unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) + unwanted = usedProviders.GetUnwantedProvidersToSend(NewRouterKey(nil)) require.Len(t, unwanted, 3) // one provider gives a result - usedProviders.RemoveUsed("test2", nil, nil) - unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) + usedProviders.RemoveUsed("test2", NewRouterKey(nil), nil) + unwanted = usedProviders.GetUnwantedProvidersToSend(NewRouterKey(nil)) require.Len(t, unwanted, 3) require.Equal(t, 2, usedProviders.CurrentlyUsed()) // one provider gives an error - usedProviders.RemoveUsed("test3", nil, fmt.Errorf("bad")) - unwanted = usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) + usedProviders.RemoveUsed("test3", NewRouterKey(nil), fmt.Errorf("bad")) + unwanted = usedProviders.GetUnwantedProvidersToSend(NewRouterKey(nil)) require.Len(t, unwanted, 3) require.Equal(t, 1, usedProviders.CurrentlyUsed()) canUseAgain = usedProviders.tryLockSelection() @@ -69,13 +68,13 @@ func TestUsedProvidersAsync(t *testing.T) { go func() { time.Sleep(time.Millisecond * 10) consumerSessionsMap := ConsumerSessionsMap{"test": &SessionInfo{}, "test2": &SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) }() ctx, 
cancel = context.WithTimeout(context.Background(), time.Millisecond*100) defer cancel() canUseAgain := usedProviders.TryLockSelection(ctx) require.Nil(t, canUseAgain) - unwanted := usedProviders.GetUnwantedProvidersToSend([]*spectypes.Extension{}) + unwanted := usedProviders.GetUnwantedProvidersToSend(NewRouterKey(nil)) require.Len(t, unwanted, 2) require.Equal(t, 2, usedProviders.CurrentlyUsed()) }) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine_test.go b/protocol/rpcconsumer/consumer_relay_state_machine_test.go index 9cbc6d346a..3c74425327 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine_test.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine_test.go @@ -72,22 +72,22 @@ func TestConsumerStateMachineHappyFlow(t *testing.T) { switch taskNumber { case 0: require.False(t, task.IsDone()) - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) relayProcessor.UpdateBatch(nil) sendProtocolError(relayProcessor, "lava@test", time.Millisecond*1, fmt.Errorf("bad")) case 1: require.False(t, task.IsDone()) - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) relayProcessor.UpdateBatch(nil) sendNodeError(relayProcessor, "lava2@test", time.Millisecond*1) case 2: require.False(t, task.IsDone()) - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) relayProcessor.UpdateBatch(nil) sendNodeError(relayProcessor, "lava2@test", time.Millisecond*1) case 3: require.False(t, task.IsDone()) - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) relayProcessor.UpdateBatch(nil) sendSuccessResp(relayProcessor, "lava4@test", time.Millisecond*1) case 4: diff --git a/protocol/rpcconsumer/relay_processor_test.go b/protocol/rpcconsumer/relay_processor_test.go index 59e2957f5b..fef668ea43 100644 --- a/protocol/rpcconsumer/relay_processor_test.go +++ b/protocol/rpcconsumer/relay_processor_test.go @@ -41,7 +41,7 @@ var ( func sendSuccessResp(relayProcessor *RelayProcessor, provider string, delay time.Duration) { time.Sleep(delay) - relayProcessor.GetUsedProviders().RemoveUsed(provider, nil, nil) + relayProcessor.GetUsedProviders().RemoveUsed(provider, lavasession.NewRouterKey(nil), nil) response := &relayResponse{ relayResult: common.RelayResult{ Request: &pairingtypes.RelayRequest{ @@ -59,7 +59,7 @@ func sendSuccessResp(relayProcessor *RelayProcessor, provider string, delay time func sendProtocolError(relayProcessor *RelayProcessor, provider string, delay time.Duration, err error) { time.Sleep(delay) - relayProcessor.GetUsedProviders().RemoveUsed(provider, nil, err) + relayProcessor.GetUsedProviders().RemoveUsed(provider, lavasession.NewRouterKey(nil), err) response := &relayResponse{ relayResult: common.RelayResult{ Request: &pairingtypes.RelayRequest{ @@ -77,7 +77,7 @@ func sendProtocolError(relayProcessor *RelayProcessor, provider string, delay ti func sendNodeError(relayProcessor *RelayProcessor, provider string, delay time.Duration) { time.Sleep(delay) - relayProcessor.GetUsedProviders().RemoveUsed(provider, nil, nil) + relayProcessor.GetUsedProviders().RemoveUsed(provider, lavasession.NewRouterKey(nil), nil) response := &relayResponse{ relayResult: common.RelayResult{ Request: &pairingtypes.RelayRequest{ @@ -123,7 +123,7 @@ func TestRelayProcessorHappyFlow(t *testing.T) { require.Zero(t, 
usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*10) defer cancel() go sendSuccessResp(relayProcessor, "lava@test", time.Millisecond*5) @@ -179,7 +179,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) // check first reply go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) err = relayProcessor.WaitForResults(context.Background()) @@ -222,7 +222,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap = lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) // check first reply, this time we have hash in map, so we don't retry node errors. go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) err = relayProcessor.WaitForResults(context.Background()) @@ -248,7 +248,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap = lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) // check first reply, this time we have hash in map, so we don't retry node errors. go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) err = relayProcessor.WaitForResults(context.Background()) @@ -273,7 +273,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap = lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) // check first reply, this time we have hash in map, so we don't retry node errors. 
hash, err := relayProcessor.getInputMsgInfoHashString() require.NoError(t, err) @@ -325,7 +325,7 @@ func TestRelayProcessorNodeErrorRetryFlow(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) // check first reply go sendNodeError(relayProcessor, "lava@test", time.Millisecond*5) err = relayProcessor.WaitForResults(context.Background()) @@ -365,7 +365,7 @@ func TestRelayProcessorTimeout(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) go func() { time.Sleep(time.Millisecond * 5) ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*10) @@ -374,7 +374,7 @@ func TestRelayProcessorTimeout(t *testing.T) { require.NoError(t, ctx.Err()) require.Nil(t, canUse) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test3": &lavasession.SessionInfo{}, "lava@test4": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) }() go sendSuccessResp(relayProcessor, "lava@test", time.Millisecond*20) ctx, cancel = context.WithTimeout(context.Background(), time.Millisecond*200) @@ -418,7 +418,7 @@ func TestRelayProcessorRetry(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendSuccessResp(relayProcessor, "lava@test2", time.Millisecond*20) @@ -463,7 +463,7 @@ func TestRelayProcessorRetryNodeError(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendNodeError(relayProcessor, "lava@test2", time.Millisecond*20) @@ -508,7 +508,7 @@ func TestRelayProcessorStatefulApi(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava4@test": &lavasession.SessionInfo{}, "lava3@test": &lavasession.SessionInfo{}, "lava@test": &lavasession.SessionInfo{}, "lava2@test": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go 
sendNodeError(relayProcessor, "lava2@test", time.Millisecond*20) go sendNodeError(relayProcessor, "lava3@test", time.Millisecond*25) @@ -563,7 +563,7 @@ func TestRelayProcessorStatefulApiErr(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava4@test": &lavasession.SessionInfo{}, "lava3@test": &lavasession.SessionInfo{}, "lava@test": &lavasession.SessionInfo{}, "lava2@test": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendNodeError(relayProcessor, "lava2@test", time.Millisecond*20) go sendNodeError(relayProcessor, "lava3@test", time.Millisecond*25) @@ -610,7 +610,7 @@ func TestRelayProcessorLatest(t *testing.T) { require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - usedProviders.AddUsed(consumerSessionsMap, []*spectypes.Extension{}, nil) + usedProviders.AddUsed(consumerSessionsMap, nil) go sendProtocolError(relayProcessor, "lava@test", time.Millisecond*5, fmt.Errorf("bad")) go sendSuccessResp(relayProcessor, "lava@test2", time.Millisecond*20) diff --git a/protocol/rpcconsumer/rpcconsumer_server.go b/protocol/rpcconsumer/rpcconsumer_server.go index 3422dcb5aa..3a422c4423 100644 --- a/protocol/rpcconsumer/rpcconsumer_server.go +++ b/protocol/rpcconsumer/rpcconsumer_server.go @@ -600,10 +600,11 @@ func (rpccs *RPCConsumerServer) sendRelayToProvider( } if rpccs.debugRelays { + routerKey := lavasession.NewRouterKeyFromExtensions(extensions) utils.LavaFormatDebug("[Before Send] returned the following sessions", utils.LogAttr("sessions", sessions), - utils.LogAttr("usedProviders.GetUnwantedProvidersToSend", usedProviders.GetUnwantedProvidersToSend(extensions)), - utils.LogAttr("usedProviders.GetErroredProviders", usedProviders.GetErroredProviders(extensions)), + utils.LogAttr("usedProviders.GetUnwantedProvidersToSend", usedProviders.GetUnwantedProvidersToSend(routerKey)), + utils.LogAttr("usedProviders.GetErroredProviders", usedProviders.GetErroredProviders(routerKey)), utils.LogAttr("addons", addon), utils.LogAttr("extensions", extensions), utils.LogAttr("AllowSessionDegradation", relayProcessor.GetAllowSessionDegradation()), @@ -721,7 +722,7 @@ func (rpccs *RPCConsumerServer) sendRelayToProvider( } time.Sleep(backOffDuration) // sleep before releasing this singleConsumerSession // relay failed need to fail the session advancement - errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, origErr, extensions) + errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, origErr) if errReport != nil { utils.LavaFormatError("failed relay onSessionFailure errored", errReport, utils.Attribute{Key: "GUID", Value: goroutineCtx}, utils.Attribute{Key: "original error", Value: origErr.Error()}) } @@ -1003,9 +1004,8 @@ func (rpccs *RPCConsumerServer) relaySubscriptionInner(ctx context.Context, hash ) replyServer, err := endpointClient.RelaySubscribe(ctx, relayResult.Request) - var extensions []*spectypes.Extension // currently no extensions for subscription, so it will be nil. 
if err != nil { - errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, err, extensions) + errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, err) if errReport != nil { return utils.LavaFormatError("subscribe relay failed onSessionFailure errored", errReport, utils.LogAttr("GUID", ctx), @@ -1019,7 +1019,7 @@ func (rpccs *RPCConsumerServer) relaySubscriptionInner(ctx context.Context, hash reply, err := rpccs.getFirstSubscriptionReply(ctx, hashedParams, replyServer) if err != nil { - errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, err, extensions) + errReport := rpccs.consumerSessionManager.OnSessionFailure(singleConsumerSession, err) if errReport != nil { return utils.LavaFormatError("subscribe relay failed onSessionFailure errored", errReport, utils.LogAttr("GUID", ctx), @@ -1038,7 +1038,7 @@ func (rpccs *RPCConsumerServer) relaySubscriptionInner(ctx context.Context, hash relayResult.ReplyServer = replyServer relayResult.Reply = reply latestBlock := relayResult.Reply.LatestBlock - err = rpccs.consumerSessionManager.OnSessionDoneIncreaseCUOnly(singleConsumerSession, latestBlock, extensions) + err = rpccs.consumerSessionManager.OnSessionDoneIncreaseCUOnly(singleConsumerSession, latestBlock) return err } @@ -1345,7 +1345,8 @@ func (rpccs *RPCConsumerServer) appendHeadersToRelayResult(ctx context.Context, directiveHeaders := protocolMessage.GetDirectiveHeaders() _, debugRelays := directiveHeaders[common.LAVA_DEBUG_RELAY] if debugRelays { - erroredProviders := relayProcessor.GetUsedProviders().GetErroredProviders(protocolMessage.GetExtensions()) + routerKey := lavasession.NewRouterKeyFromExtensions(protocolMessage.GetExtensions()) + erroredProviders := relayProcessor.GetUsedProviders().GetErroredProviders(routerKey) if len(erroredProviders) > 0 { erroredProvidersArray := make([]string, len(erroredProviders)) idx := 0 From e325c5c20f76e106608519ed9c3a64a88da481dc Mon Sep 17 00:00:00 2001 From: omerlavanet Date: Sun, 3 Nov 2024 16:10:37 +0200 Subject: [PATCH 14/20] version merge --- protocol/lavasession/router_key.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/lavasession/router_key.go b/protocol/lavasession/router_key.go index b4ac807cc6..671f3e780d 100644 --- a/protocol/lavasession/router_key.go +++ b/protocol/lavasession/router_key.go @@ -5,7 +5,7 @@ import ( "strconv" "strings" - spectypes "github.com/lavanet/lava/v3/x/spec/types" + spectypes "github.com/lavanet/lava/v4/x/spec/types" ) const ( From 71e11b75d809ea652a1ad465f13694da927c7e2b Mon Sep 17 00:00:00 2001 From: omerlavanet Date: Sun, 3 Nov 2024 16:12:14 +0200 Subject: [PATCH 15/20] rename function for better description on functionality --- protocol/lavasession/used_providers.go | 2 +- protocol/rpcconsumer/relay_processor.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/protocol/lavasession/used_providers.go b/protocol/lavasession/used_providers.go index 7b28883546..ec5820f9a3 100644 --- a/protocol/lavasession/used_providers.go +++ b/protocol/lavasession/used_providers.go @@ -105,7 +105,7 @@ func (up *UsedProviders) CurrentlyUsedAddresses() []string { return addresses } -func (up *UsedProviders) UnwantedAddresses() []string { +func (up *UsedProviders) AllUnwantedAddresses() []string { if up == nil { utils.LavaFormatError("UsedProviders.UnwantedAddresses is nil, misuse detected", nil) return []string{} diff --git a/protocol/rpcconsumer/relay_processor.go 
b/protocol/rpcconsumer/relay_processor.go index d0d29595bf..589c054fcc 100644 --- a/protocol/rpcconsumer/relay_processor.go +++ b/protocol/rpcconsumer/relay_processor.go @@ -120,7 +120,7 @@ func (rp *RelayProcessor) String() string { usedProviders := rp.GetUsedProviders() currentlyUsedAddresses := usedProviders.CurrentlyUsedAddresses() - unwantedAddresses := usedProviders.UnwantedAddresses() + unwantedAddresses := usedProviders.AllUnwantedAddresses() return fmt.Sprintf("relayProcessor {%s, unwantedAddresses: %s,currentlyUsedAddresses:%s}", rp.ResultsManager.String(), strings.Join(unwantedAddresses, ";"), strings.Join(currentlyUsedAddresses, ";")) } @@ -370,7 +370,7 @@ func (rp *RelayProcessor) ProcessingResult() (returnedResult *common.RelayResult } // this must be here before the lock because this function locks - allProvidersAddresses := rp.GetUsedProviders().UnwantedAddresses() + allProvidersAddresses := rp.GetUsedProviders().AllUnwantedAddresses() rp.lock.RLock() defer rp.lock.RUnlock() From a3cc9ab5236f1fc52c1176f40725fd5d1e866bf9 Mon Sep 17 00:00:00 2001 From: omerlavanet Date: Sun, 3 Nov 2024 18:52:16 +0200 Subject: [PATCH 16/20] bigger consistency timeout to succeed on actions --- protocol/rpcprovider/rpcprovider_server_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/rpcprovider/rpcprovider_server_test.go b/protocol/rpcprovider/rpcprovider_server_test.go index a18228b9ec..912a368eb2 100644 --- a/protocol/rpcprovider/rpcprovider_server_test.go +++ b/protocol/rpcprovider/rpcprovider_server_test.go @@ -129,7 +129,7 @@ func TestHandleConsistency(t *testing.T) { requestBlock: spectypes.LATEST_BLOCK, specId: "LAV1", err: nil, - timeout: 20 * time.Millisecond, // 150 is one way travel time + timeout: 25 * time.Millisecond, // 150 is one way travel time chainTrackerBlocks: []int64{100, 101}, changeTime: 100 * time.Second, sleep: true, From 8beacb9fb16fdf20bfebe521a4d6d63bbd85412d Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Wed, 6 Nov 2024 17:46:34 +0100 Subject: [PATCH 17/20] adding initialized flag --- .../rpcconsumer/consumer_relay_state_machine.go | 13 ++++++++++--- .../consumer_relay_state_machine_test.go | 6 ++++-- protocol/rpcconsumer/rpcconsumer_server.go | 5 ++++- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index 8f2cf797fd..003adec751 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -20,7 +20,7 @@ import ( type RelayStateMachine interface { GetProtocolMessage() chainlib.ProtocolMessage GetDebugState() bool - GetRelayTaskChannel() chan RelayStateSendInstructions + GetRelayTaskChannel() (chan RelayStateSendInstructions, error) UpdateBatch(err error) GetSelection() Selection GetUsedProviders() *lavasession.UsedProviders @@ -95,6 +95,10 @@ func NewRelayStateMachine( } } +func (crsm *ConsumerRelayStateMachine) Initialized() bool { + return crsm.relayRetriesManager != nil && crsm.resultsChecker != nil +} + func (crsm *ConsumerRelayStateMachine) SetRelayRetriesManager(relayRetriesManager *lavaprotocol.RelayRetriesManager) { crsm.relayRetriesManager = relayRetriesManager } @@ -196,7 +200,10 @@ func (rssi *RelayStateSendInstructions) IsDone() bool { return rssi.done || rssi.err != nil } -func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() chan RelayStateSendInstructions { +func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() (chan 
RelayStateSendInstructions, error) { + if !crsm.Initialized() { + return nil, utils.LavaFormatError("ConsumerRelayStateMachine was not initialized properly", nil) + } relayTaskChannel := make(chan RelayStateSendInstructions) go func() { // A channel to be notified processing was done, true means we have results and can return @@ -325,7 +332,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() chan RelayStateSend } } }() - return relayTaskChannel + return relayTaskChannel, nil } func (crsm *ConsumerRelayStateMachine) UpdateBatch(err error) { diff --git a/protocol/rpcconsumer/consumer_relay_state_machine_test.go b/protocol/rpcconsumer/consumer_relay_state_machine_test.go index 3783d74e5d..bd51cd5673 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine_test.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine_test.go @@ -74,7 +74,8 @@ func TestConsumerStateMachineHappyFlow(t *testing.T) { require.Zero(t, usedProviders.SessionsLatestBatch()) consumerSessionsMap := lavasession.ConsumerSessionsMap{"lava@test": &lavasession.SessionInfo{}, "lava@test2": &lavasession.SessionInfo{}} - relayTaskChannel := relayProcessor.GetRelayTaskChannel() + relayTaskChannel, err := relayProcessor.GetRelayTaskChannel() + require.NoError(t, err) taskNumber := 0 for task := range relayTaskChannel { switch taskNumber { @@ -143,7 +144,8 @@ func TestConsumerStateMachineExhaustRetries(t *testing.T) { require.Zero(t, usedProviders.CurrentlyUsed()) require.Zero(t, usedProviders.SessionsLatestBatch()) - relayTaskChannel := relayProcessor.GetRelayTaskChannel() + relayTaskChannel, err := relayProcessor.GetRelayTaskChannel() + require.NoError(t, err) taskNumber := 0 for task := range relayTaskChannel { switch taskNumber { diff --git a/protocol/rpcconsumer/rpcconsumer_server.go b/protocol/rpcconsumer/rpcconsumer_server.go index 86582a43f4..468c536fe9 100644 --- a/protocol/rpcconsumer/rpcconsumer_server.go +++ b/protocol/rpcconsumer/rpcconsumer_server.go @@ -437,7 +437,10 @@ func (rpccs *RPCConsumerServer) ProcessRelaySend(ctx context.Context, protocolMe NewRelayStateMachine(ctx, usedProviders, rpccs, protocolMessage, analytics, rpccs.debugRelays, rpccs.rpcConsumerLogs), ) - relayTaskChannel := relayProcessor.GetRelayTaskChannel() + relayTaskChannel, err := relayProcessor.GetRelayTaskChannel() + if err != nil { + return relayProcessor, err + } for task := range relayTaskChannel { if task.IsDone() { return relayProcessor, task.err From c33e5e0e4ab6c4c0d02f8a1be39664b5cb3957e6 Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Wed, 6 Nov 2024 17:59:47 +0100 Subject: [PATCH 18/20] append existing extensions to archive --- protocol/rpcconsumer/consumer_relay_state_machine.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index 003adec751..96a420d04a 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -2,6 +2,7 @@ package rpcconsumer import ( context "context" + "strings" "sync/atomic" "time" @@ -139,7 +140,9 @@ func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, // We need to set archive. // Create a new relay private data containing the extension. 
userData := crsm.protocolMessage.GetUserData() - metaDataForArchive := []pairingtypes.Metadata{{Name: common.EXTENSION_OVERRIDE_HEADER_NAME, Value: extensionslib.ArchiveExtension}} + // add all existing extensions including archive split by "," so the override will work + existingExtensionsPlusArchive := strings.Join(append(relayRequestData.Extensions, extensionslib.ArchiveExtension), ",") + metaDataForArchive := []pairingtypes.Metadata{{Name: common.EXTENSION_OVERRIDE_HEADER_NAME, Value: existingExtensionsPlusArchive}} newProtocolMessage, err := crsm.relaySender.ParseRelay(crsm.ctx, relayRequestData.ApiUrl, string(relayRequestData.Data), relayRequestData.ConnectionType, userData.DappId, userData.ConsumerIp, metaDataForArchive) if err != nil { utils.LavaFormatError("Failed converting to archive message in shouldRetry", err, utils.LogAttr("relayRequestData", relayRequestData), utils.LogAttr("metadata", metaDataForArchive)) From aaf3cc0734556dc486de58616106dc768120950f Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 7 Nov 2024 11:36:50 +0100 Subject: [PATCH 19/20] WIP --- .../consumer_relay_state_machine.go | 131 +++++++----------- protocol/rpcconsumer/relay_state.go | 113 +++++++++++++++ 2 files changed, 161 insertions(+), 83 deletions(-) create mode 100644 protocol/rpcconsumer/relay_state.go diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index 96a420d04a..daaa76e888 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -2,15 +2,12 @@ package rpcconsumer import ( context "context" - "strings" "sync/atomic" "time" - slices "github.com/lavanet/lava/v4/utils/lavaslices" pairingtypes "github.com/lavanet/lava/v4/x/pairing/types" "github.com/lavanet/lava/v4/protocol/chainlib" - "github.com/lavanet/lava/v4/protocol/chainlib/extensionslib" common "github.com/lavanet/lava/v4/protocol/common" "github.com/lavanet/lava/v4/protocol/lavaprotocol" lavasession "github.com/lavanet/lava/v4/protocol/lavasession" @@ -53,19 +50,18 @@ type tickerMetricSetterInf interface { } type ConsumerRelayStateMachine struct { - ctx context.Context // same context as user context. - relaySender ConsumerRelaySender - resultsChecker ResultsCheckerInf - protocolMessage chainlib.ProtocolMessage // only one should make changes to protocol message is ConsumerRelayStateMachine. - originalProtocolMessage chainlib.ProtocolMessage - appliedArchiveExtension bool - analytics *metrics.RelayMetrics // first relay metrics - selection Selection - debugRelays bool - tickerMetricSetter tickerMetricSetterInf - batchUpdate chan error - usedProviders *lavasession.UsedProviders - relayRetriesManager *lavaprotocol.RelayRetriesManager + ctx context.Context // same context as user context. 
+ relaySender ConsumerRelaySender + resultsChecker ResultsCheckerInf + analytics *metrics.RelayMetrics // first relay metrics + selection Selection + debugRelays bool + tickerMetricSetter tickerMetricSetterInf + batchUpdate chan error + usedProviders *lavasession.UsedProviders + relayRetriesManager *lavaprotocol.RelayRetriesManager + relayState []*RelayState + protocolMessage chainlib.ProtocolMessage } func NewRelayStateMachine( @@ -83,16 +79,16 @@ func NewRelayStateMachine( } return &ConsumerRelayStateMachine{ - ctx: ctx, - usedProviders: usedProviders, - relaySender: relaySender, - protocolMessage: protocolMessage, - originalProtocolMessage: protocolMessage, - analytics: analytics, - selection: selection, - debugRelays: debugRelays, - tickerMetricSetter: tickerMetricSetter, - batchUpdate: make(chan error, MaximumNumberOfTickerRelayRetries), + ctx: ctx, + usedProviders: usedProviders, + relaySender: relaySender, + protocolMessage: protocolMessage, + analytics: analytics, + selection: selection, + debugRelays: debugRelays, + tickerMetricSetter: tickerMetricSetter, + batchUpdate: make(chan error, MaximumNumberOfTickerRelayRetries), + relayState: make([]*RelayState, 0), } } @@ -116,62 +112,28 @@ func (crsm *ConsumerRelayStateMachine) GetSelection() Selection { return crsm.selection } +func (crsm *ConsumerRelayStateMachine) stateTransition(relayState *RelayState) *RelayState { + var nextState *RelayState + if relayState == nil { // initial state + nextState = NewRelayState(crsm.ctx, crsm.protocolMessage, 0, crsm.relayRetriesManager, crsm.relaySender) + } else { + nextState = NewRelayState(crsm.ctx, relayState.GetProtocolMessage(), relayState.GetStateNumber()+1, crsm.relayRetriesManager, crsm.relaySender) + } + crsm.relayState = append(crsm.relayState, nextState) + return nextState +} + // Should retry implements the logic for when to send another relay. // As well as the decision of changing the protocol message, // into different extensions or addons based on certain conditions -func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfRetriesLaunched int, numberOfNodeErrors uint64) bool { - shouldRetry := crsm.retryCondition(numberOfRetriesLaunched) +func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfNodeErrors uint64) bool { + batchNumber := crsm.usedProviders.BatchNumber() + shouldRetry := crsm.retryCondition(batchNumber) if shouldRetry { + lastState := crsm.relayState[len(crsm.relayState)-1] + nextState := crsm.stateTransition(lastState) // Retry archive logic - hashes := crsm.GetProtocolMessage().GetRequestedBlocksHashes() - if len(hashes) > 0 && numberOfNodeErrors > 0 { - // Launch archive only on the second retry attempt. - if numberOfRetriesLaunched == 1 { - // Iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. - // If we are familiar with all, we don't want to allow archive. - for _, hash := range hashes { - if !crsm.relayRetriesManager.CheckHashInCache(hash) { - // If we didn't find the hash in the cache we can try archive relay. - relayRequestData := crsm.protocolMessage.RelayPrivateData() - // Validate we're not already archive - if slices.Contains(relayRequestData.Extensions, extensionslib.ArchiveExtension) { - break // Do nothing its already archive. - } - // We need to set archive. - // Create a new relay private data containing the extension. 
- userData := crsm.protocolMessage.GetUserData() - // add all existing extensions including archive split by "," so the override will work - existingExtensionsPlusArchive := strings.Join(append(relayRequestData.Extensions, extensionslib.ArchiveExtension), ",") - metaDataForArchive := []pairingtypes.Metadata{{Name: common.EXTENSION_OVERRIDE_HEADER_NAME, Value: existingExtensionsPlusArchive}} - newProtocolMessage, err := crsm.relaySender.ParseRelay(crsm.ctx, relayRequestData.ApiUrl, string(relayRequestData.Data), relayRequestData.ConnectionType, userData.DappId, userData.ConsumerIp, metaDataForArchive) - if err != nil { - utils.LavaFormatError("Failed converting to archive message in shouldRetry", err, utils.LogAttr("relayRequestData", relayRequestData), utils.LogAttr("metadata", metaDataForArchive)) - } - // Creating an archive protocol message, and set it to current protocol message - crsm.protocolMessage = newProtocolMessage - // for future batches. - crsm.appliedArchiveExtension = true - break - } - } - // We had node error, and we have a hash parsed. - } else if crsm.appliedArchiveExtension && numberOfNodeErrors >= 2 { - // Validate the following. - // 1. That we have applied archive - // 2. That we had more than one node error (meaning the 2nd was a successful archive [node error] 100%) - // Now - - // We know we have applied archive and failed. - // 1. We can remove the archive, return to the original protocol message, - // 2. Set all hashes as irrelevant for future queries. - crsm.protocolMessage = crsm.originalProtocolMessage - for _, hash := range hashes { - crsm.relayRetriesManager.AddHashToCache(hash) - } - crsm.appliedArchiveExtension = false // so we don't get here again - // We do not want to send additional relays after archive attempt. return false. - return false - } - } + return nextState.upgradeToArchiveIfNeeded(batchNumber, numberOfNodeErrors) } return shouldRetry } @@ -189,7 +151,11 @@ func (crsm *ConsumerRelayStateMachine) GetDebugState() bool { } func (crsm *ConsumerRelayStateMachine) GetProtocolMessage() chainlib.ProtocolMessage { - return crsm.protocolMessage + stateLength := len(crsm.relayState) + if stateLength == 0 { + return crsm.protocolMessage + } + return crsm.relayState[stateLength-1].GetProtocolMessage() } type RelayStateSendInstructions struct { @@ -246,6 +212,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() (chan RelayStateSen } } + crsm.stateTransition(nil) // Send First Message, with analytics and without waiting for batch update. relayTaskChannel <- RelayStateSendInstructions{ protocolMessage: crsm.GetProtocolMessage(), @@ -273,9 +240,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() (chan RelayStateSen } else { utils.LavaFormatTrace("[StateMachine] batchUpdate - err != nil - batch fail retry attempt", utils.LogAttr("batch", crsm.usedProviders.BatchNumber()), utils.LogAttr("consecutiveBatchErrors", consecutiveBatchErrors)) // Failed sending message, but we still want to attempt sending more. - relayTaskChannel <- RelayStateSendInstructions{ - protocolMessage: crsm.GetProtocolMessage(), - } + relayTaskChannel <- RelayStateSendInstructions{protocolMessage: crsm.GetProtocolMessage()} } continue } @@ -291,7 +256,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() (chan RelayStateSen return } // If should retry == true, send a new batch. 
(success == false) - if crsm.shouldRetry(crsm.usedProviders.BatchNumber(), numberOfNodeErrorsAtomic.Load()) { + if crsm.shouldRetry(numberOfNodeErrorsAtomic.Load()) { utils.LavaFormatTrace("[StateMachine] success := <-gotResults - crsm.ShouldRetry(batchNumber)", utils.LogAttr("batch", crsm.usedProviders.BatchNumber())) relayTaskChannel <- RelayStateSendInstructions{protocolMessage: crsm.GetProtocolMessage()} } else { @@ -300,7 +265,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() (chan RelayStateSen go readResultsFromProcessor() case <-startNewBatchTicker.C: // Only trigger another batch for non BestResult relays or if we didn't pass the retry limit. - if crsm.shouldRetry(crsm.usedProviders.BatchNumber(), numberOfNodeErrorsAtomic.Load()) { + if crsm.shouldRetry(numberOfNodeErrorsAtomic.Load()) { utils.LavaFormatTrace("[StateMachine] ticker triggered", utils.LogAttr("batch", crsm.usedProviders.BatchNumber())) relayTaskChannel <- RelayStateSendInstructions{protocolMessage: crsm.GetProtocolMessage()} // Add ticker launch metrics diff --git a/protocol/rpcconsumer/relay_state.go b/protocol/rpcconsumer/relay_state.go new file mode 100644 index 0000000000..ec564fb3b5 --- /dev/null +++ b/protocol/rpcconsumer/relay_state.go @@ -0,0 +1,113 @@ +package rpcconsumer + +import ( + "context" + "strings" + + "github.com/lavanet/lava/v4/protocol/chainlib" + "github.com/lavanet/lava/v4/protocol/chainlib/extensionslib" + common "github.com/lavanet/lava/v4/protocol/common" + "github.com/lavanet/lava/v4/utils" + slices "github.com/lavanet/lava/v4/utils/lavaslices" + pairingtypes "github.com/lavanet/lava/v4/x/pairing/types" +) + +type RetryHashCacheInf interface { + CheckHashInCache(hash string) bool + AddHashToCache(hash string) +} + +type RelayParserInf interface { + ParseRelay( + ctx context.Context, + url string, + req string, + connectionType string, + dappID string, + consumerIp string, + metadata []pairingtypes.Metadata, + ) (protocolMessage chainlib.ProtocolMessage, err error) +} + +type RelayState struct { + isArchive bool + isUpgraded bool + isHashCached bool + stateNumber int + protocolMessage chainlib.ProtocolMessage + cache RetryHashCacheInf + relayParser RelayParserInf + ctx context.Context +} + +func NewRelayState(ctx context.Context, protocolMessage chainlib.ProtocolMessage, stateNumber int, cache RetryHashCacheInf, relayParser RelayParserInf) *RelayState { + relayRequestData := protocolMessage.RelayPrivateData() + isArchive := false + if slices.Contains(relayRequestData.Extensions, extensionslib.ArchiveExtension) { + isArchive = true + } + return &RelayState{ctx: ctx, protocolMessage: protocolMessage, stateNumber: stateNumber, cache: cache, relayParser: relayParser, isArchive: isArchive} +} + +func (rs *RelayState) GetStateNumber() int { + if rs == nil { + return 0 + } + return rs.stateNumber +} + +func (rs *RelayState) GetProtocolMessage() chainlib.ProtocolMessage { + return rs.protocolMessage +} + +func (rs *RelayState) upgradeToArchiveIfNeeded(numberOfRetriesLaunched int, numberOfNodeErrors uint64) bool { + hashes := rs.protocolMessage.GetRequestedBlocksHashes() + // If we got upgraded and we still got a node error (>= 2) we know upgrade didn't work + if rs.isUpgraded && numberOfNodeErrors >= 2 { + // Validate the following. + // 1. That we have applied archive + // 2. That we had more than one node error (meaning the 2nd was a successful archive [node error] 100%) + // Now - + // We know we have applied archive and failed. + // 1. 
We can remove the archive, return to the original protocol message, + // 2. Set all hashes as irrelevant for future queries. + if !rs.isHashCached { + for _, hash := range hashes { + rs.cache.AddHashToCache(hash) + } + } + // We do not want to send additional relays after archive attempt. return false. + return false + } + if !rs.isArchive && len(hashes) > 0 && numberOfNodeErrors > 0 { + // Launch archive only on the second retry attempt. + if numberOfRetriesLaunched == 1 { + // Iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. + // If we are familiar with all, we don't want to allow archive. + for _, hash := range hashes { + if !rs.cache.CheckHashInCache(hash) { + // If we didn't find the hash in the cache we can try archive relay. + relayRequestData := rs.protocolMessage.RelayPrivateData() + // We need to set archive. + // Create a new relay private data containing the extension. + userData := rs.protocolMessage.GetUserData() + // add all existing extensions including archive split by "," so the override will work + existingExtensionsPlusArchive := strings.Join(append(relayRequestData.Extensions, extensionslib.ArchiveExtension), ",") + metaDataForArchive := []pairingtypes.Metadata{{Name: common.EXTENSION_OVERRIDE_HEADER_NAME, Value: existingExtensionsPlusArchive}} + newProtocolMessage, err := rs.relayParser.ParseRelay(rs.ctx, relayRequestData.ApiUrl, string(relayRequestData.Data), relayRequestData.ConnectionType, userData.DappId, userData.ConsumerIp, metaDataForArchive) + if err != nil { + utils.LavaFormatError("Failed converting to archive message in shouldRetry", err, utils.LogAttr("relayRequestData", relayRequestData), utils.LogAttr("metadata", metaDataForArchive)) + } + // Creating an archive protocol message, and set it to current protocol message + rs.protocolMessage = newProtocolMessage + // for future batches. + rs.isUpgraded = true + rs.isArchive = true + break + } + } + // We had node error, and we have a hash parsed. 
+ } + } + return true +} From 618fa813090ccda1a40eada837f151cbf70c95f4 Mon Sep 17 00:00:00 2001 From: Ran Mishael Date: Thu, 7 Nov 2024 15:09:51 +0100 Subject: [PATCH 20/20] finished state machine --- .../consumer_relay_state_machine.go | 32 +++++++++++++++---- protocol/rpcconsumer/relay_state.go | 25 +++++++++------ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/protocol/rpcconsumer/consumer_relay_state_machine.go b/protocol/rpcconsumer/consumer_relay_state_machine.go index daaa76e888..484fb0bc36 100644 --- a/protocol/rpcconsumer/consumer_relay_state_machine.go +++ b/protocol/rpcconsumer/consumer_relay_state_machine.go @@ -2,6 +2,7 @@ package rpcconsumer import ( context "context" + "sync" "sync/atomic" "time" @@ -62,6 +63,7 @@ type ConsumerRelayStateMachine struct { relayRetriesManager *lavaprotocol.RelayRetriesManager relayState []*RelayState protocolMessage chainlib.ProtocolMessage + relayStateLock sync.RWMutex } func NewRelayStateMachine( @@ -112,14 +114,29 @@ func (crsm *ConsumerRelayStateMachine) GetSelection() Selection { return crsm.selection } +func (crsm *ConsumerRelayStateMachine) appendRelayState(nextState *RelayState) { + crsm.relayStateLock.Lock() + defer crsm.relayStateLock.Unlock() + crsm.relayState = append(crsm.relayState, nextState) +} + +func (crsm *ConsumerRelayStateMachine) getLatestState() *RelayState { + crsm.relayStateLock.RLock() + defer crsm.relayStateLock.RUnlock() + if len(crsm.relayState) == 0 { + return nil + } + return crsm.relayState[len(crsm.relayState)-1] +} + func (crsm *ConsumerRelayStateMachine) stateTransition(relayState *RelayState) *RelayState { var nextState *RelayState if relayState == nil { // initial state - nextState = NewRelayState(crsm.ctx, crsm.protocolMessage, 0, crsm.relayRetriesManager, crsm.relaySender) + nextState = NewRelayState(crsm.ctx, crsm.protocolMessage, 0, crsm.relayRetriesManager, crsm.relaySender, ArchiveStatus{}) } else { - nextState = NewRelayState(crsm.ctx, relayState.GetProtocolMessage(), relayState.GetStateNumber()+1, crsm.relayRetriesManager, crsm.relaySender) + nextState = NewRelayState(crsm.ctx, relayState.GetProtocolMessage(), relayState.GetStateNumber()+1, crsm.relayRetriesManager, crsm.relaySender, relayState.archiveStatus) } - crsm.relayState = append(crsm.relayState, nextState) + crsm.appendRelayState(nextState) return nextState } @@ -130,7 +147,7 @@ func (crsm *ConsumerRelayStateMachine) shouldRetry(numberOfNodeErrors uint64) bo batchNumber := crsm.usedProviders.BatchNumber() shouldRetry := crsm.retryCondition(batchNumber) if shouldRetry { - lastState := crsm.relayState[len(crsm.relayState)-1] + lastState := crsm.getLatestState() nextState := crsm.stateTransition(lastState) // Retry archive logic return nextState.upgradeToArchiveIfNeeded(batchNumber, numberOfNodeErrors) @@ -151,11 +168,11 @@ func (crsm *ConsumerRelayStateMachine) GetDebugState() bool { } func (crsm *ConsumerRelayStateMachine) GetProtocolMessage() chainlib.ProtocolMessage { - stateLength := len(crsm.relayState) - if stateLength == 0 { + latestState := crsm.getLatestState() + if latestState == nil { // failed fetching latest state return crsm.protocolMessage } - return crsm.relayState[stateLength-1].GetProtocolMessage() + return latestState.GetProtocolMessage() } type RelayStateSendInstructions struct { @@ -212,6 +229,7 @@ func (crsm *ConsumerRelayStateMachine) GetRelayTaskChannel() (chan RelayStateSen } } + // initialize relay state crsm.stateTransition(nil) // Send First Message, with analytics and without waiting for batch 
update. relayTaskChannel <- RelayStateSendInstructions{ diff --git a/protocol/rpcconsumer/relay_state.go b/protocol/rpcconsumer/relay_state.go index ec564fb3b5..8bee2eda25 100644 --- a/protocol/rpcconsumer/relay_state.go +++ b/protocol/rpcconsumer/relay_state.go @@ -29,10 +29,14 @@ type RelayParserInf interface { ) (protocolMessage chainlib.ProtocolMessage, err error) } +type ArchiveStatus struct { + isArchive bool + isUpgraded bool + isHashCached bool +} + type RelayState struct { - isArchive bool - isUpgraded bool - isHashCached bool + archiveStatus ArchiveStatus stateNumber int protocolMessage chainlib.ProtocolMessage cache RetryHashCacheInf @@ -40,13 +44,13 @@ type RelayState struct { ctx context.Context } -func NewRelayState(ctx context.Context, protocolMessage chainlib.ProtocolMessage, stateNumber int, cache RetryHashCacheInf, relayParser RelayParserInf) *RelayState { +func NewRelayState(ctx context.Context, protocolMessage chainlib.ProtocolMessage, stateNumber int, cache RetryHashCacheInf, relayParser RelayParserInf, archiveInfo ArchiveStatus) *RelayState { relayRequestData := protocolMessage.RelayPrivateData() isArchive := false if slices.Contains(relayRequestData.Extensions, extensionslib.ArchiveExtension) { isArchive = true } - return &RelayState{ctx: ctx, protocolMessage: protocolMessage, stateNumber: stateNumber, cache: cache, relayParser: relayParser, isArchive: isArchive} + return &RelayState{ctx: ctx, protocolMessage: protocolMessage, stateNumber: stateNumber, cache: cache, relayParser: relayParser, archiveStatus: ArchiveStatus{isArchive: isArchive, isUpgraded: archiveInfo.isUpgraded, isHashCached: archiveInfo.isHashCached}} } func (rs *RelayState) GetStateNumber() int { @@ -63,7 +67,7 @@ func (rs *RelayState) GetProtocolMessage() chainlib.ProtocolMessage { func (rs *RelayState) upgradeToArchiveIfNeeded(numberOfRetriesLaunched int, numberOfNodeErrors uint64) bool { hashes := rs.protocolMessage.GetRequestedBlocksHashes() // If we got upgraded and we still got a node error (>= 2) we know upgrade didn't work - if rs.isUpgraded && numberOfNodeErrors >= 2 { + if rs.archiveStatus.isUpgraded && numberOfNodeErrors >= 2 { // Validate the following. // 1. That we have applied archive // 2. That we had more than one node error (meaning the 2nd was a successful archive [node error] 100%) @@ -71,15 +75,16 @@ func (rs *RelayState) upgradeToArchiveIfNeeded(numberOfRetriesLaunched int, numb // We know we have applied archive and failed. // 1. We can remove the archive, return to the original protocol message, // 2. Set all hashes as irrelevant for future queries. - if !rs.isHashCached { + if !rs.archiveStatus.isHashCached { for _, hash := range hashes { rs.cache.AddHashToCache(hash) } + rs.archiveStatus.isHashCached = true } // We do not want to send additional relays after archive attempt. return false. return false } - if !rs.isArchive && len(hashes) > 0 && numberOfNodeErrors > 0 { + if !rs.archiveStatus.isArchive && len(hashes) > 0 && numberOfNodeErrors > 0 { // Launch archive only on the second retry attempt. if numberOfRetriesLaunched == 1 { // Iterate over all hashes found in relay, if we don't have them in the cache we can try retry on archive. @@ -101,8 +106,8 @@ func (rs *RelayState) upgradeToArchiveIfNeeded(numberOfRetriesLaunched int, numb // Creating an archive protocol message, and set it to current protocol message rs.protocolMessage = newProtocolMessage // for future batches. 
- rs.isUpgraded = true - rs.isArchive = true + rs.archiveStatus.isUpgraded = true + rs.archiveStatus.isArchive = true break } }
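
The archive-retry decision introduced by relay_state.go can be read in isolation. The sketch below is a minimal stand-alone condensation of upgradeToArchiveIfNeeded under simplified assumptions: hashCache, archiveStatus, shouldRetryWithArchive and the main driver are illustrative stand-ins and are not part of this patch; the real code operates on chainlib.ProtocolMessage, RelayPrivateData, RelayRetriesManager and ParseRelay, and re-parses the relay with the archive extension override header instead of returning a flag.

package main

import "fmt"

// hashCache stands in for the RetryHashCacheInf backed by RelayRetriesManager.
type hashCache map[string]struct{}

func (c hashCache) CheckHashInCache(hash string) bool { _, ok := c[hash]; return ok }
func (c hashCache) AddHashToCache(hash string)        { c[hash] = struct{}{} }

// archiveStatus mirrors the ArchiveStatus flags carried from one retry state to the next.
type archiveStatus struct {
	isArchive, isUpgraded, isHashCached bool
}

// shouldRetryWithArchive condenses the decision made by upgradeToArchiveIfNeeded:
// whether to send another batch, and whether that batch should carry the archive extension.
func shouldRetryWithArchive(st *archiveStatus, hashes []string, retriesLaunched int, nodeErrors uint64, cache hashCache) (retry bool, useArchive bool) {
	// Archive was already applied and the node errors persisted: remember the hashes so
	// future relays skip the upgrade, and stop retrying.
	if st.isUpgraded && nodeErrors >= 2 {
		if !st.isHashCached {
			for _, h := range hashes {
				cache.AddHashToCache(h)
			}
			st.isHashCached = true
		}
		return false, false
	}
	// Upgrade only on the second attempt, only when the relay referenced block hashes,
	// only after a node error, and only if at least one hash is not already known to fail.
	if !st.isArchive && len(hashes) > 0 && nodeErrors > 0 && retriesLaunched == 1 {
		for _, h := range hashes {
			if !cache.CheckHashInCache(h) {
				st.isArchive, st.isUpgraded = true, true
				return true, true
			}
		}
	}
	return true, false
}

func main() {
	cache := hashCache{}
	st := &archiveStatus{}
	hashes := []string{"0xabc"}

	retry, useArchive := shouldRetryWithArchive(st, hashes, 1, 1, cache)
	fmt.Println(retry, useArchive) // true true: the second batch goes out with the archive extension

	retry, _ = shouldRetryWithArchive(st, hashes, 2, 2, cache)
	fmt.Println(retry, cache.CheckHashInCache("0xabc")) // false true: stop retrying and cache the hash
}

Carrying archiveStatus forward through stateTransition is what lets the second commit distinguish "archive not yet tried" from "archive tried and failed": each new RelayState inherits isUpgraded and isHashCached from its predecessor, so the upgrade happens at most once per relay and the hash cache is written at most once.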