From cb3ace0401e53400ed662619f6b45b1a8bf045be Mon Sep 17 00:00:00 2001 From: Zixuan James Li Date: Thu, 8 Aug 2024 18:31:42 -0400 Subject: [PATCH 1/4] store test: Make a group for retry tests Signed-off-by: Zixuan James Li --- test/model/store_test.dart | 88 +++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/test/model/store_test.dart b/test/model/store_test.dart index 615dd27dc6..8f503be275 100644 --- a/test/model/store_test.dart +++ b/test/model/store_test.dart @@ -455,53 +455,55 @@ void main() { check(store.debugMessageListViews).isEmpty(); })); - void checkRetry(void Function() prepareError) { - awaitFakeAsync((async) async { - await prepareStore(lastEventId: 1); - updateMachine.debugPauseLoop(); - updateMachine.poll(); - check(async.pendingTimers).length.equals(0); - - // Make the request, inducing an error in it. - prepareError(); - updateMachine.debugAdvanceLoop(); - async.elapse(Duration.zero); - checkLastRequest(lastEventId: 1); - check(store).isLoading.isTrue(); - - // Polling doesn't resume immediately; there's a timer. - check(async.pendingTimers).length.equals(1); - updateMachine.debugAdvanceLoop(); - async.flushMicrotasks(); - check(connection.lastRequest).isNull(); - check(async.pendingTimers).length.equals(1); - - // Polling continues after a timer. - connection.prepare(json: GetEventsResult(events: [ - HeartbeatEvent(id: 2), - ], queueId: null).toJson()); - async.flushTimers(); - checkLastRequest(lastEventId: 1); - check(updateMachine.lastEventId).equals(2); - check(store).isLoading.isFalse(); - }); - } + group('retries on errors', () { + void checkRetry(void Function() prepareError) { + awaitFakeAsync((async) async { + await prepareStore(lastEventId: 1); + updateMachine.debugPauseLoop(); + updateMachine.poll(); + check(async.pendingTimers).length.equals(0); + + // Make the request, inducing an error in it. + prepareError(); + updateMachine.debugAdvanceLoop(); + async.elapse(Duration.zero); + checkLastRequest(lastEventId: 1); + check(store).isLoading.isTrue(); + + // Polling doesn't resume immediately; there's a timer. + check(async.pendingTimers).length.equals(1); + updateMachine.debugAdvanceLoop(); + async.flushMicrotasks(); + check(connection.lastRequest).isNull(); + check(async.pendingTimers).length.equals(1); + + // Polling continues after a timer. + connection.prepare(json: GetEventsResult(events: [ + HeartbeatEvent(id: 2), + ], queueId: null).toJson()); + async.flushTimers(); + checkLastRequest(lastEventId: 1); + check(updateMachine.lastEventId).equals(2); + check(store).isLoading.isFalse(); + }); + } - test('retries on Server5xxException', () { - checkRetry(() => connection.prepare(httpStatus: 500, body: 'splat')); - }); + test('Server5xxException', () { + checkRetry(() => connection.prepare(httpStatus: 500, body: 'splat')); + }); - test('retries on NetworkException', () { - checkRetry(() => connection.prepare(exception: Exception("failed"))); - }); + test('NetworkException', () { + checkRetry(() => connection.prepare(exception: Exception("failed"))); + }); - test('retries on ZulipApiException', () { - checkRetry(() => connection.prepare(httpStatus: 400, json: { - 'result': 'error', 'code': 'BAD_REQUEST', 'msg': 'Bad request'})); - }); + test('ZulipApiException', () { + checkRetry(() => connection.prepare(httpStatus: 400, json: { + 'result': 'error', 'code': 'BAD_REQUEST', 'msg': 'Bad request'})); + }); - test('retries on MalformedServerResponseException', () { - checkRetry(() => connection.prepare(httpStatus: 200, body: 'nonsense')); + test('MalformedServerResponseException', () { + checkRetry(() => connection.prepare(httpStatus: 200, body: 'nonsense')); + }); }); }); From ac0dd7d8a817899d9972138d640d5e984c3f4023 Mon Sep 17 00:00:00 2001 From: Zixuan James Li Date: Tue, 6 Aug 2024 19:13:21 -0400 Subject: [PATCH 2/4] store: Report polling errors with details We extend the checkRetry helper to support simulating multiple consecutive errors. We delay reporting transient errors until we have retried a certain number of times, because they might recover on their own. Signed-off-by: Zixuan James Li --- assets/l10n/app_en.arb | 12 ++++++ lib/model/store.dart | 34 ++++++++++++++++- test/model/store_test.dart | 76 ++++++++++++++++++++++++++++++++------ 3 files changed, 109 insertions(+), 13 deletions(-) diff --git a/assets/l10n/app_en.arb b/assets/l10n/app_en.arb index f89e86aa2d..0031066390 100644 --- a/assets/l10n/app_en.arb +++ b/assets/l10n/app_en.arb @@ -160,6 +160,18 @@ "message": {"type": "String", "example": "Invalid format"} } }, + "errorConnectingToServerShort": "Error connecting to Zulip. Retrying…", + "@errorConnectingToServerShort": { + "description": "Short error message for a generic unknown error connecting to the server." + }, + "errorConnectingToServerDetails": "Error connecting to Zulip at {serverUrl}. Will retry:\n\n{error}", + "@errorConnectingToServerDetails": { + "description": "Dialog error message for a generic unknown error connecting to the server with details.", + "placeholders": { + "serverUrl": {"type": "String", "example": "http://example.com/"}, + "error": {"type": "String", "example": "Invalid format"} + } + }, "errorSharingFailed": "Sharing failed", "@errorSharingFailed": { "description": "Error message when sharing a message failed." diff --git a/lib/model/store.dart b/lib/model/store.dart index 309f12f287..09510a149a 100644 --- a/lib/model/store.dart +++ b/lib/model/store.dart @@ -21,6 +21,7 @@ import '../notifications/receive.dart'; import 'autocomplete.dart'; import 'database.dart'; import 'emoji.dart'; +import 'localizations.dart'; import 'message.dart'; import 'message_list.dart'; import 'recent_dm_conversations.dart'; @@ -793,8 +794,25 @@ class UpdateMachine { }()); } + /// This controls when we start to report transient errors to the user when + /// polling. + /// + /// At the 6th failure, the expected time elapsed since the first failure + /// will be 1.55 seocnds. + static const transientFailureCountNotifyThreshold = 5; + void poll() async { BackoffMachine? backoffMachine; + int accumulatedTransientFailureCount = 0; + + /// This only reports transient errors after reaching + /// a pre-defined threshold of retries. + void maybeReportTransientError(String? message, {String? details}) { + accumulatedTransientFailureCount++; + if (accumulatedTransientFailureCount > transientFailureCountNotifyThreshold) { + reportErrorToUserBriefly(message, details: details); + } + } while (true) { if (_debugLoopSignal != null) { @@ -811,6 +829,8 @@ class UpdateMachine { queueId: queueId, lastEventId: lastEventId); } catch (e) { store.isLoading = true; + final localizations = GlobalLocalizations.zulipLocalizations; + final serverUrl = store.connection.realmUrl.origin; switch (e) { case ZulipApiException(code: 'BAD_EVENT_QUEUE_ID'): assert(debugLog('Lost event queue for $store. Replacing…')); @@ -822,7 +842,10 @@ class UpdateMachine { case Server5xxException() || NetworkException(): assert(debugLog('Transient error polling event queue for $store: $e\n' 'Backing off, then will retry…')); - // TODO tell user if transient polling errors persist + maybeReportTransientError( + localizations.errorConnectingToServerShort, + details: localizations.errorConnectingToServerDetails( + serverUrl, e.toString())); await (backoffMachine ??= BackoffMachine()).wait(); assert(debugLog('… Backoff wait complete, retrying poll.')); continue; @@ -830,7 +853,11 @@ class UpdateMachine { default: assert(debugLog('Error polling event queue for $store: $e\n' 'Backing off and retrying even though may be hopeless…')); - // TODO tell user on non-transient error in polling + // TODO(#186): Handle unrecoverable failures + reportErrorToUserBriefly( + localizations.errorConnectingToServerShort, + details: localizations.errorConnectingToServerDetails( + serverUrl, e.toString())); await (backoffMachine ??= BackoffMachine()).wait(); assert(debugLog('… Backoff wait complete, retrying poll.')); continue; @@ -854,6 +881,9 @@ class UpdateMachine { // and failures, the successes themselves should space out the requests. backoffMachine = null; store.isLoading = false; + // Dismiss existing errors, if any. + reportErrorToUserBriefly(null); + accumulatedTransientFailureCount = 0; final events = result.events; for (final event in events) { diff --git a/test/model/store_test.dart b/test/model/store_test.dart index 8f503be275..125e88f99d 100644 --- a/test/model/store_test.dart +++ b/test/model/store_test.dart @@ -10,6 +10,7 @@ import 'package:zulip/api/route/events.dart'; import 'package:zulip/api/route/messages.dart'; import 'package:zulip/model/message_list.dart'; import 'package:zulip/model/narrow.dart'; +import 'package:zulip/log.dart'; import 'package:zulip/model/store.dart'; import 'package:zulip/notifications/receive.dart'; @@ -393,6 +394,22 @@ void main() { check(store.userSettings!.twentyFourHourTime).isTrue(); })); + String? lastReportedError; + String? takeLastReportedError() { + final result = lastReportedError; + lastReportedError = null; + return result; + } + + /// This is an alternative to [ZulipApp]'s implementation of + /// [reportErrorToUserBriefly] for testing. + Future logAndReportErrorToUserBriefly(String? message, { + String? details, + }) async { + if (message == null) return; + lastReportedError = '$message\n$details'; + } + test('handles expired queue', () => awaitFakeAsync((async) async { await prepareStore(); updateMachine.debugPauseLoop(); @@ -456,19 +473,52 @@ void main() { })); group('retries on errors', () { - void checkRetry(void Function() prepareError) { + /// Check if [UpdateMachine.poll] retries as expected when there are + /// errors. + /// + /// This also verifies that the first user-facing error message appears + /// after the `numFailedRequests`'th failed request. + void checkRetry(void Function() prepareError, { + required int numFailedRequests, + }) { + assert(numFailedRequests > 0); + reportErrorToUserBriefly = logAndReportErrorToUserBriefly; + addTearDown(() => reportErrorToUserBriefly = defaultReportErrorToUserBriefly); + + final expectedErrorMessage = + 'Error connecting to Zulip. Retrying…\n' + 'Error connecting to Zulip at ${eg.realmUrl.origin}. Will retry'; + awaitFakeAsync((async) async { await prepareStore(lastEventId: 1); updateMachine.debugPauseLoop(); updateMachine.poll(); check(async.pendingTimers).length.equals(0); - // Make the request, inducing an error in it. - prepareError(); - updateMachine.debugAdvanceLoop(); - async.elapse(Duration.zero); - checkLastRequest(lastEventId: 1); - check(store).isLoading.isTrue(); + for (int i = 0; i < numFailedRequests; i++) { + // Make the request, inducing an error in it. + prepareError(); + updateMachine.debugAdvanceLoop(); + async.elapse(Duration.zero); + checkLastRequest(lastEventId: 1); + check(store).isLoading.isTrue(); + + if (i != numFailedRequests - 1) { + // Skip polling backoff unless this is the final iteration. + // This allows the next `updateMachine.debugAdvanceLoop` call to + // trigger the next request without wait. + async.flushTimers(); + } + + if (i < numFailedRequests - 1) { + // The error message should not appear until the `updateMachine` + // has retried the given number of times. + check(takeLastReportedError()).isNull(); + continue; + } + assert(i == numFailedRequests - 1); + check(takeLastReportedError()).isNotNull().contains(expectedErrorMessage); + } // Polling doesn't resume immediately; there's a timer. check(async.pendingTimers).length.equals(1); @@ -489,20 +539,24 @@ void main() { } test('Server5xxException', () { - checkRetry(() => connection.prepare(httpStatus: 500, body: 'splat')); + checkRetry(() => connection.prepare(httpStatus: 500, body: 'splat'), + numFailedRequests: UpdateMachine.transientFailureCountNotifyThreshold + 1); }); test('NetworkException', () { - checkRetry(() => connection.prepare(exception: Exception("failed"))); + checkRetry(() => connection.prepare(exception: Exception("failed")), + numFailedRequests: UpdateMachine.transientFailureCountNotifyThreshold + 1); }); test('ZulipApiException', () { checkRetry(() => connection.prepare(httpStatus: 400, json: { - 'result': 'error', 'code': 'BAD_REQUEST', 'msg': 'Bad request'})); + 'result': 'error', 'code': 'BAD_REQUEST', 'msg': 'Bad request'}), + numFailedRequests: 1); }); test('MalformedServerResponseException', () { - checkRetry(() => connection.prepare(httpStatus: 200, body: 'nonsense')); + checkRetry(() => connection.prepare(httpStatus: 200, body: 'nonsense'), + numFailedRequests: 1); }); }); }); From 36c4a0e9f51bb8fd98c2970afd760e3faa6624fd Mon Sep 17 00:00:00 2001 From: Zixuan James Li Date: Fri, 27 Sep 2024 17:46:50 -0400 Subject: [PATCH 3/4] store: Ignore boring NetworkExceptions Usually when a client goes back from sleep or lose network connection, there will be a bunch of errors when polling event queue. This known error will always be a `NetworkException`, so we make a separate case for it. Previously, we may report these errors, and it ended up being spammy. This filters out the more interesting one to report instead. Signed-off-by: Zixuan James Li --- lib/model/store.dart | 23 ++++++++++++++++++++++- test/model/store_test.dart | 23 +++++++++++++++++++++-- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/lib/model/store.dart b/lib/model/store.dart index 09510a149a..a0257d459f 100644 --- a/lib/model/store.dart +++ b/lib/model/store.dart @@ -839,7 +839,7 @@ class UpdateMachine { debugLog('… Event queue replaced.'); return; - case Server5xxException() || NetworkException(): + case Server5xxException(): assert(debugLog('Transient error polling event queue for $store: $e\n' 'Backing off, then will retry…')); maybeReportTransientError( @@ -850,6 +850,27 @@ class UpdateMachine { assert(debugLog('… Backoff wait complete, retrying poll.')); continue; + case NetworkException(): + assert(debugLog('Transient error polling event queue for $store: $e\n' + 'Backing off, then will retry…')); + if (e.cause is! SocketException) { + // Heuristic check to only report interesting errors to the user. + // This currently ignores [SocketException], which typically + // occurs when the client goes back from sleep. + // TODO: Investigate if there are other cases of [SocketException] + // that might turn out to be interesting. + // + // See also: + // * [NetworkException.cause], which is the underlying exception. + maybeReportTransientError( + localizations.errorConnectingToServerShort, + details: localizations.errorConnectingToServerDetails( + serverUrl, e.toString())); + } + await (backoffMachine ??= BackoffMachine()).wait(); + assert(debugLog('… Backoff wait complete, retrying poll.')); + continue; + default: assert(debugLog('Error polling event queue for $store: $e\n' 'Backing off and retrying even though may be hopeless…')); diff --git a/test/model/store_test.dart b/test/model/store_test.dart index 125e88f99d..eec0b5fcc3 100644 --- a/test/model/store_test.dart +++ b/test/model/store_test.dart @@ -1,4 +1,5 @@ import 'dart:async'; +import 'dart:io'; import 'package:checks/checks.dart'; import 'package:flutter/foundation.dart'; @@ -476,10 +477,14 @@ void main() { /// Check if [UpdateMachine.poll] retries as expected when there are /// errors. /// - /// This also verifies that the first user-facing error message appears - /// after the `numFailedRequests`'th failed request. + /// By default, also verify that the first user-facing error message + /// appears after the `numFailedRequests`'th failed request. + /// + /// If `expectNotifyError` is false, instead verify that there is no + /// user-facing error message regardless of the retries. void checkRetry(void Function() prepareError, { required int numFailedRequests, + bool expectNotifyError = true, }) { assert(numFailedRequests > 0); reportErrorToUserBriefly = logAndReportErrorToUserBriefly; @@ -510,12 +515,19 @@ void main() { async.flushTimers(); } + if (!expectNotifyError) { + // No matter how many retries there have been, no request should + // result in an error message. + check(takeLastReportedError()).isNull(); + continue; + } if (i < numFailedRequests - 1) { // The error message should not appear until the `updateMachine` // has retried the given number of times. check(takeLastReportedError()).isNull(); continue; } + assert(expectNotifyError); assert(i == numFailedRequests - 1); check(takeLastReportedError()).isNotNull().contains(expectedErrorMessage); } @@ -548,6 +560,13 @@ void main() { numFailedRequests: UpdateMachine.transientFailureCountNotifyThreshold + 1); }); + test('NetworkException to be ignored', () { + checkRetry(() => connection.prepare( + exception: const SocketException("failed")), + numFailedRequests: UpdateMachine.transientFailureCountNotifyThreshold + 1, + expectNotifyError: false); + }); + test('ZulipApiException', () { checkRetry(() => connection.prepare(httpStatus: 400, json: { 'result': 'error', 'code': 'BAD_REQUEST', 'msg': 'Bad request'}), From 90370a70ff191449deb3f2487eb981a34476be79 Mon Sep 17 00:00:00 2001 From: Zixuan James Li Date: Thu, 8 Aug 2024 18:20:05 -0400 Subject: [PATCH 4/4] store: Show error message when replacing event queue Fixes: #555 Signed-off-by: Zixuan James Li --- assets/l10n/app_en.arb | 5 +++++ lib/model/store.dart | 1 + test/model/store_test.dart | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/assets/l10n/app_en.arb b/assets/l10n/app_en.arb index 0031066390..85c1cfa687 100644 --- a/assets/l10n/app_en.arb +++ b/assets/l10n/app_en.arb @@ -172,6 +172,11 @@ "error": {"type": "String", "example": "Invalid format"} } }, + "errorReconnectingToServer": "Reconnecting to {serverUrl}…", + "@errorReconnectingToServer": { + "serverUrl": {"type": "String", "example": "http://example.com/"}, + "description": "Message when reconnecting to the server." + }, "errorSharingFailed": "Sharing failed", "@errorSharingFailed": { "description": "Error message when sharing a message failed." diff --git a/lib/model/store.dart b/lib/model/store.dart index a0257d459f..638ed3763e 100644 --- a/lib/model/store.dart +++ b/lib/model/store.dart @@ -834,6 +834,7 @@ class UpdateMachine { switch (e) { case ZulipApiException(code: 'BAD_EVENT_QUEUE_ID'): assert(debugLog('Lost event queue for $store. Replacing…')); + reportErrorToUserBriefly(localizations.errorReconnectingToServer(serverUrl)); await store._globalStore._reloadPerAccount(store.accountId); dispose(); debugLog('… Event queue replaced.'); diff --git a/test/model/store_test.dart b/test/model/store_test.dart index eec0b5fcc3..8367479a2b 100644 --- a/test/model/store_test.dart +++ b/test/model/store_test.dart @@ -412,6 +412,9 @@ void main() { } test('handles expired queue', () => awaitFakeAsync((async) async { + reportErrorToUserBriefly = logAndReportErrorToUserBriefly; + addTearDown(() => reportErrorToUserBriefly = defaultReportErrorToUserBriefly); + await prepareStore(); updateMachine.debugPauseLoop(); updateMachine.poll(); @@ -425,7 +428,10 @@ void main() { }); updateMachine.debugAdvanceLoop(); async.flushMicrotasks(); + check(lastReportedError).isNull(); await Future.delayed(Duration.zero); + check(takeLastReportedError()).isNotNull() + .contains('Reconnecting to ${eg.realmUrl.origin}…'); check(store).isLoading.isTrue(); // The global store has a new store.