Skip to content

Commit 6129a3d

Browse files
srv_resolver: add 15s timeout to DNS lookups
Co-authored-by: Jonathan de Jong <[email protected]>
1 parent 631eed9 commit 6129a3d

File tree

3 files changed

+45
-3
lines changed

3 files changed

+45
-3
lines changed

changelog.d/19026.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Shorten DNS resolver timeout/retry sequence from 60s to 15s to ensure DNS failures are visible before federation HTTP request timeouts.

synapse/http/federation/srv_resolver.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import attr
2828

29+
from twisted.internet import defer
2930
from twisted.internet.error import ConnectError
3031
from twisted.names import client, dns
3132
from twisted.names.error import DNSNameError, DNSNotImplementedError, DomainError
@@ -36,6 +37,20 @@
3637

3738
SERVER_CACHE: Dict[bytes, List["Server"]] = {}
3839

40+
DNS_LOOKUP_TIMEOUTS = (
41+
1, # Quick retry for packet loss/scenarios
42+
3, # Still reasonable for slow responders
43+
3, # ...
44+
3, # Already catching 99.9% of successful queries at 10s
45+
# Final attempt for extreme edge cases.
46+
#
47+
# TODO: In the future (after 2026-01-01), we could consider removing this extra
48+
# time if we don't see complaints. For comparison, the Windows
49+
# DNS resolver gives up after 10s using `(1, 1, 2, 4, 2)`, see
50+
# https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/dns-client-resolution-timeouts
51+
5,
52+
)
53+
3954

4055
@attr.s(auto_attribs=True, slots=True, frozen=True)
4156
class Server:
@@ -145,7 +160,25 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:
145160

146161
try:
147162
answers, _, _ = await make_deferred_yieldable(
148-
self._dns_client.lookupService(service_name)
163+
self._dns_client.lookupService(
164+
service_name,
165+
# This is a sequence of ints that represent the "number of seconds
166+
# after which to reissue the query. When the last timeout expires,
167+
# the query is considered failed." The default value in Twisted is
168+
# `timeout=(1, 3, 11, 45)` (60s total) which is an "arbitrary"
169+
# exponential backoff sequence and is too long (see below).
170+
#
171+
# We want the total timeout to be below the overarching HTTP request
172+
# timeout (60s for federation requests) that spurred on this lookup.
173+
# This way, we can see the underlying DNS failure and move on
174+
# instead of the user ending up with a generic HTTP request timeout.
175+
#
176+
# Since these DNS queries are done over UDP (unreliable transport),
177+
# by it's nature, it's bound to occasionally fail (dropped packets,
178+
# etc). We want a list that starts small and re-issues DNS queries
179+
# multiple times until we get a response or timeout.
180+
timeout=DNS_LOOKUP_TIMEOUTS,
181+
)
149182
)
150183
except DNSNameError:
151184
# TODO: cache this. We can get the SOA out of the exception, and use
@@ -165,6 +198,10 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:
165198
return list(cache_entry)
166199
else:
167200
raise e
201+
except defer.TimeoutError as e:
202+
raise defer.TimeoutError(
203+
f"Timed out while trying to resolve DNS for SRV record for {service_name!r} (timeout={sum(DNS_LOOKUP_TIMEOUTS)}s)"
204+
) from e
168205

169206
if (
170207
len(answers) == 1

tests/http/federation/test_srv_resolver.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@ def do_lookup() -> Generator["Deferred[object]", object, List[Server]]:
6868
test_d = do_lookup()
6969
self.assertNoResult(test_d)
7070

71-
dns_client_mock.lookupService.assert_called_once_with(service_name)
71+
dns_client_mock.lookupService.assert_called_once_with(
72+
service_name, timeout=(1, 3, 3, 3, 5)
73+
)
7274

7375
result_deferred.callback(([answer_srv], None, None))
7476

@@ -98,7 +100,9 @@ def test_from_cache_expired_and_dns_fail(
98100
servers: List[Server]
99101
servers = yield defer.ensureDeferred(resolver.resolve_service(service_name)) # type: ignore[assignment]
100102

101-
dns_client_mock.lookupService.assert_called_once_with(service_name)
103+
dns_client_mock.lookupService.assert_called_once_with(
104+
service_name, timeout=(1, 3, 3, 3, 5)
105+
)
102106

103107
self.assertEqual(len(servers), 1)
104108
self.assertEqual(servers, cache[service_name])

0 commit comments

Comments
 (0)