@@ -996,125 +996,36 @@ class UpdateMachine {
     }());
   }
 
-  // This is static so that it persists through new UpdateMachine instances
-  // as we attempt to fix things by reloading data from scratch. In principle
-  // it could also be per-account (or per-realm or per-server); but currently
-  // we skip that complication, as well as attempting to reset backoff on
-  // later success. After all, these unexpected errors should be uncommon;
-  // ideally they'd never happen.
-  static BackoffMachine get _unexpectedErrorBackoffMachine {
-    return __unexpectedErrorBackoffMachine
-      ??= BackoffMachine(maxBound: const Duration(seconds: 60));
+  Future<void> _debugLoopWait() async {
+    await _debugLoopSignal!.future;
+    if (_disposed) return;
+    assert(() {
+      _debugLoopSignal = Completer();
+      return true;
+    }());
   }
-  static BackoffMachine? __unexpectedErrorBackoffMachine;
-
-  /// This controls when we start to report transient errors to the user when
-  /// polling.
-  ///
-  /// At the 6th failure, the expected time elapsed since the first failure
-  /// will be 1.55 seconds.
-  static const transientFailureCountNotifyThreshold = 5;
 
   void poll() async {
     assert(!_disposed);
     try {
-      BackoffMachine? backoffMachine;
-      int accumulatedTransientFailureCount = 0;
-
-      /// This only reports transient errors after reaching
-      /// a pre-defined threshold of retries.
-      void maybeReportToUserTransientError(Object error) {
-        accumulatedTransientFailureCount++;
-        if (accumulatedTransientFailureCount > transientFailureCountNotifyThreshold) {
-          _reportToUserErrorConnectingToServer(error);
-        }
-      }
-
       while (true) {
         if (_debugLoopSignal != null) {
-          await _debugLoopSignal!.future;
+          await _debugLoopWait();
           if (_disposed) return;
-          assert(() {
-            _debugLoopSignal = Completer();
-            return true;
-          }());
         }
 
         final GetEventsResult result;
         try {
           result = await getEvents(store.connection,
             queueId: queueId, lastEventId: lastEventId);
           if (_disposed) return;
-        } catch (e) {
+        } catch (e, stackTrace) {
           if (_disposed) return;
-          store.isLoading = true;
-
-          if (e is! ApiRequestException) {
-            // Some unexpected error, outside even making the HTTP request.
-            // Definitely a bug in our code.
-            rethrow;
-          }
-
-          bool shouldReportToUser;
-          switch (e) {
-            case NetworkException(cause: SocketException()):
-              // A [SocketException] is common when the app returns from sleep.
-              shouldReportToUser = false;
-
-            case NetworkException():
-            case Server5xxException():
-              shouldReportToUser = true;
-
-            case ServerException(httpStatus: 429):
-            case ZulipApiException(httpStatus: 429):
-            case ZulipApiException(code: 'RATE_LIMIT_HIT'):
-              // TODO(#946) handle rate-limit errors more generally, in ApiConnection
-              shouldReportToUser = true;
-
-            case ZulipApiException(code: 'BAD_EVENT_QUEUE_ID'):
-              rethrow;
-
-            case ZulipApiException():
-            case MalformedServerResponseException():
-              // Either a 4xx we didn't expect, or a malformed response;
-              // in either case, a mismatch of the client's expectations to the
-              // server's behavior, and therefore a bug in one or the other.
-              // TODO(#1054) handle auth failures specifically
-              rethrow;
-          }
-
-          assert(debugLog('Transient error polling event queue for $store: $e\n'
-            'Backing off, then will retry…'));
-          if (shouldReportToUser) {
-            maybeReportToUserTransientError(e);
-          }
-          await (backoffMachine ??= BackoffMachine()).wait();
+          await _handlePollRequestError(e, stackTrace); // may rethrow
           if (_disposed) return;
-          assert(debugLog('… Backoff wait complete, retrying poll.'));
           continue;
         }
-
-        // After one successful request, we reset backoff to its initial state.
-        // That way if the user is off the network and comes back on, the app
-        // doesn't wind up in a state where it's slow to recover the next time
-        // one request fails.
-        //
-        // This does mean that if the server is having trouble and handling some
-        // but not all of its requests, we'll end up doing a lot more retries than
-        // if we stayed at the max backoff interval; partway toward what would
-        // happen if we weren't backing off at all.
-        //
-        // But at least for [getEvents] requests, as here, it should be OK,
-        // because this is a long-poll. That means a typical successful request
-        // takes a long time to come back; in fact longer than our max backoff
-        // duration (which is 10 seconds). So if we're getting a mix of successes
-        // and failures, the successes themselves should space out the requests.
-        backoffMachine = null;
-
-        store.isLoading = false;
-        // Dismiss existing errors, if any.
-        reportErrorToUserBriefly(null);
-        accumulatedTransientFailureCount = 0;
+        _clearPollErrors();
 
         final events = result.events;
         for (final event in events) {
@@ -1133,61 +1044,186 @@ class UpdateMachine {
       }
     } catch (e) {
       if (_disposed) return;
+      await _handlePollError(e);
+      assert(_disposed);
+      return;
+    }
+  }
 
-      // An error occurred, other than the transient request errors we retry on.
-      // This means either a lost/expired event queue on the server (which is
-      // normal after the app is offline for a period like 10 minutes),
-      // or an unexpected exception representing a bug in our code or the server.
-      // Either way, the show must go on. So reload server data from scratch.
-
-      // First, log what happened.
-      store.isLoading = true;
-      bool isUnexpected;
-      switch (e) {
-        case ZulipApiException(code: 'BAD_EVENT_QUEUE_ID'):
-          assert(debugLog('Lost event queue for $store. Replacing…'));
-          // The old event queue is gone, so we need a new one. This is normal.
-          isUnexpected = false;
-
-        case _EventHandlingException(:final cause, :final event):
-          assert(debugLog('BUG: Error handling an event: $cause\n' // TODO(log)
-            '  event: $event\n'
-            'Replacing event queue…'));
-          reportErrorToUserBriefly(
-            GlobalLocalizations.zulipLocalizations.errorHandlingEventTitle,
-            details: GlobalLocalizations.zulipLocalizations.errorHandlingEventDetails(
-              store.realmUrl.toString(), cause.toString(), event.toString()));
-          // We can't just continue with the next event, because our state
-          // may be garbled due to failing to apply this one (and worse,
-          // any invariants that were left in a broken state from where
-          // the exception was thrown). So reload from scratch.
-          // Hopefully (probably?) the bug only affects our implementation of
-          // the *change* in state represented by the event, and when we get the
-          // new state in a fresh InitialSnapshot we'll handle that just fine.
-          isUnexpected = true;
-
-        default:
-          assert(debugLog('BUG: Unexpected error in event polling: $e\n' // TODO(log)
-            'Replacing event queue…'));
-          _reportToUserErrorConnectingToServer(e);
-          // Similar story to the _EventHandlingException case;
-          // separate only so that that other case can print more context.
-          // The bug here could be in the server if it's an ApiRequestException,
-          // but our remedy is the same either way.
-          isUnexpected = true;
-      }
+  // This is static so that it persists through new UpdateMachine instances
+  // as we attempt to fix things by reloading data from scratch. In principle
+  // it could also be per-account (or per-realm or per-server); but currently
+  // we skip that complication, as well as attempting to reset backoff on
+  // later success. After all, these unexpected errors should be uncommon;
+  // ideally they'd never happen.
+  static BackoffMachine get _unexpectedErrorBackoffMachine {
+    return __unexpectedErrorBackoffMachine
+      ??= BackoffMachine(maxBound: const Duration(seconds: 60));
+  }
+  static BackoffMachine? __unexpectedErrorBackoffMachine;
 
-      if (isUnexpected) {
-        // We don't know the cause of the failure; it might well keep happening.
-        // Avoid creating a retry storm.
-        await _unexpectedErrorBackoffMachine.wait();
-        if (_disposed) return;
-      }
+  BackoffMachine? _pollBackoffMachine;
 
-      // This disposes the store, which disposes this update machine.
-      await store._globalStore._reloadPerAccount(store.accountId);
-      assert(debugLog('… Event queue replaced.'));
-      return;
+  /// This controls when we start to report transient errors to the user when
+  /// polling.
+  ///
+  /// At the 6th failure, the expected time elapsed since the first failure
+  /// will be 1.55 seconds.
+  static const transientFailureCountNotifyThreshold = 5;
+
+  int _accumulatedTransientFailureCount = 0;
+
+  void _clearPollErrors() {
+    // After one successful request, we reset backoff to its initial state.
+    // That way if the user is off the network and comes back on, the app
+    // doesn't wind up in a state where it's slow to recover the next time
+    // one request fails.
+    //
+    // This does mean that if the server is having trouble and handling some
+    // but not all of its requests, we'll end up doing a lot more retries than
+    // if we stayed at the max backoff interval; partway toward what would
+    // happen if we weren't backing off at all.
+    //
+    // But at least for [getEvents] requests, as here, it should be OK,
+    // because this is a long-poll. That means a typical successful request
+    // takes a long time to come back; in fact longer than our max backoff
+    // duration (which is 10 seconds). So if we're getting a mix of successes
+    // and failures, the successes themselves should space out the requests.
+    _pollBackoffMachine = null;
+
+    store.isLoading = false;
+    _accumulatedTransientFailureCount = 0;
+    reportErrorToUserBriefly(null);
+  }
+
+  /// Sort out an error from the network request in [poll]:
+  /// either wait for a backoff duration (and possibly report the error),
+  /// or rethrow.
+  ///
+  /// If the request should be retried, this method uses [_pollBackoffMachine]
+  /// to wait an appropriate backoff duration for that retry,
+  /// after reporting the error if appropriate to the user and/or developer.
+  ///
+  /// Otherwise this method rethrows the error, with no other side effects.
+  ///
+  /// See also:
+  ///  * [_handlePollError], which handles errors from the rest of [poll]
+  ///    and errors this method rethrows.
+  Future<void> _handlePollRequestError(Object error, StackTrace stackTrace) async {
+    store.isLoading = true;
+
+    if (error is! ApiRequestException) {
+      // Some unexpected error, outside even making the HTTP request.
+      // Definitely a bug in our code.
+      Error.throwWithStackTrace(error, stackTrace);
+    }
+
+    bool shouldReportToUser;
+    switch (error) {
+      case NetworkException(cause: SocketException()):
+        // A [SocketException] is common when the app returns from sleep.
+        shouldReportToUser = false;
+
+      case NetworkException():
+      case Server5xxException():
+        shouldReportToUser = true;
+
+      case ServerException(httpStatus: 429):
+      case ZulipApiException(httpStatus: 429):
+      case ZulipApiException(code: 'RATE_LIMIT_HIT'):
+        // TODO(#946) handle rate-limit errors more generally, in ApiConnection
+        shouldReportToUser = true;
+
+      case ZulipApiException(code: 'BAD_EVENT_QUEUE_ID'):
+        Error.throwWithStackTrace(error, stackTrace);
+
+      case ZulipApiException():
+      case MalformedServerResponseException():
+        // Either a 4xx we didn't expect, or a malformed response;
+        // in either case, a mismatch of the client's expectations to the
+        // server's behavior, and therefore a bug in one or the other.
+        // TODO(#1054) handle auth failures specifically
+        Error.throwWithStackTrace(error, stackTrace);
+    }
+
+    assert(debugLog('Transient error polling event queue for $store: $error\n'
+      'Backing off, then will retry…'));
+    if (shouldReportToUser) {
+      _maybeReportToUserTransientError(error);
+    }
+    await (_pollBackoffMachine ??= BackoffMachine()).wait();
+    if (_disposed) return;
+    assert(debugLog('… Backoff wait complete, retrying poll.'));
+  }
+
+  /// Deal with an error in [poll]: reload server data to replace the store,
+  /// after reporting the error as appropriate to the user and/or developer.
+  ///
+  /// See also:
+  ///  * [_handlePollRequestError], which handles certain errors
+  ///    and causes them not to reach this method.
+  Future<void> _handlePollError(Object error) async {
+    // An error occurred, other than the transient request errors we retry on.
+    // This means either a lost/expired event queue on the server (which is
+    // normal after the app is offline for a period like 10 minutes),
+    // or an unexpected exception representing a bug in our code or the server.
+    // Either way, the show must go on. So reload server data from scratch.
+
+    store.isLoading = true;
+
+    bool isUnexpected;
+    switch (error) {
+      case ZulipApiException(code: 'BAD_EVENT_QUEUE_ID'):
+        assert(debugLog('Lost event queue for $store. Replacing…'));
+        // The old event queue is gone, so we need a new one. This is normal.
+        isUnexpected = false;
+
+      case _EventHandlingException(:final cause, :final event):
+        assert(debugLog('BUG: Error handling an event: $cause\n' // TODO(log)
+          '  event: $event\n'
+          'Replacing event queue…'));
+        reportErrorToUserBriefly(
+          GlobalLocalizations.zulipLocalizations.errorHandlingEventTitle,
+          details: GlobalLocalizations.zulipLocalizations.errorHandlingEventDetails(
+            store.realmUrl.toString(), cause.toString(), event.toString()));
+        // We can't just continue with the next event, because our state
+        // may be garbled due to failing to apply this one (and worse,
+        // any invariants that were left in a broken state from where
+        // the exception was thrown). So reload from scratch.
+        // Hopefully (probably?) the bug only affects our implementation of
+        // the *change* in state represented by the event, and when we get the
+        // new state in a fresh InitialSnapshot we'll handle that just fine.
+        isUnexpected = true;
+
+      default:
+        assert(debugLog('BUG: Unexpected error in event polling: $error\n' // TODO(log)
+          'Replacing event queue…'));
+        _reportToUserErrorConnectingToServer(error);
+        // Similar story to the _EventHandlingException case;
+        // separate only so that that other case can print more context.
+        // The bug here could be in the server if it's an ApiRequestException,
+        // but our remedy is the same either way.
+        isUnexpected = true;
+    }
+
+    if (isUnexpected) {
+      // We don't know the cause of the failure; it might well keep happening.
+      // Avoid creating a retry storm.
+      await _unexpectedErrorBackoffMachine.wait();
+      if (_disposed) return;
+    }
+
+    await store._globalStore._reloadPerAccount(store.accountId);
+    assert(_disposed);
+    assert(debugLog('… Event queue replaced.'));
+  }
+
+  /// This only reports transient errors after reaching
+  /// a pre-defined threshold of retries.
+  void _maybeReportToUserTransientError(Object error) {
+    _accumulatedTransientFailureCount++;
+    if (_accumulatedTransientFailureCount > transientFailureCountNotifyThreshold) {
+      _reportToUserErrorConnectingToServer(error);
     }
   }
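For readers skimming the diff: after this change, transient request failures in poll() go through _handlePollRequestError, which waits on a backoff machine and retries, while every other error escapes to _handlePollError, which reloads the store from scratch. The sketch below illustrates that retry-with-backoff shape in isolation. It is a simplified stand-in, not the app's code: FakeBackoffMachine, TransientException, fetchEvents, and pollSketch are hypothetical names invented for the example, standing in for the real BackoffMachine, ApiRequestException, getEvents, and poll.

```dart
// Minimal sketch of the retry-with-backoff structure poll() follows.
// All names here are illustrative stand-ins, not the app's real classes.
import 'dart:async';
import 'dart:math';

/// Simplified stand-in for a backoff machine: each wait sleeps a random
/// duration whose upper bound doubles, capped at [maxBound].
class FakeBackoffMachine {
  FakeBackoffMachine({this.maxBound = const Duration(seconds: 10)});

  final Duration maxBound;
  Duration _bound = const Duration(milliseconds: 100);
  final _random = Random();

  Future<void> wait() async {
    final ms = _random.nextInt(_bound.inMilliseconds + 1);
    await Future<void>.delayed(Duration(milliseconds: ms));
    _bound *= 2;
    if (_bound > maxBound) _bound = maxBound;
  }
}

class TransientException implements Exception {}

/// Sketch of the loop's shape: retry transient failures with backoff,
/// reset the backoff after a success, and let any other error propagate
/// so the caller can rebuild state from scratch.
Future<void> pollSketch(Future<String> Function() fetchEvents) async {
  FakeBackoffMachine? backoff;
  while (true) {
    String events;
    try {
      events = await fetchEvents();
    } on TransientException {
      await (backoff ??= FakeBackoffMachine()).wait();
      continue; // retry the request
    }
    backoff = null; // one success resets the backoff state
    print('handled: $events');
    if (events == 'stop') return;
  }
}

Future<void> main() async {
  var calls = 0;
  await pollSketch(() async {
    calls++;
    if (calls < 3) throw TransientException(); // first two requests fail
    return calls < 5 ? 'batch $calls' : 'stop';
  });
}
```

Resetting the machine to null after a success mirrors the comment in _clearPollErrors: one successful long-poll is taken as evidence the connection recovered, so the next failure starts backoff from the shortest interval again.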