
import attr

+from twisted.internet import defer
from twisted.internet.error import ConnectError
from twisted.names import client, dns
from twisted.names.error import DNSNameError, DNSNotImplementedError, DomainError

SERVER_CACHE: Dict[bytes, List["Server"]] = {}

+DNS_LOOKUP_TIMEOUTS = (
+    1,  # Quick retry for packet-loss scenarios
+    3,  # Still reasonable for slow responders
+    3,  # ...
+    3,  # Already catching 99.9% of successful queries at 10s
+    # Final attempt for extreme edge cases.
+    #
+    # TODO: In the future (after 2026-01-01), we could consider removing this extra
+    # time if we don't see complaints. For comparison, the Windows
+    # DNS resolver gives up after 10s using `(1, 1, 2, 4, 2)`, see
+    # https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/dns-client-resolution-timeouts
+    5,
+)
+

@attr.s(auto_attribs=True, slots=True, frozen=True)
class Server:
@@ -145,7 +160,25 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:

        try:
            answers, _, _ = await make_deferred_yieldable(
-                self._dns_client.lookupService(service_name)
+                self._dns_client.lookupService(
+                    service_name,
+                    # This is a sequence of ints that represent the "number of seconds
+                    # after which to reissue the query. When the last timeout expires,
+                    # the query is considered failed." The default value in Twisted is
+                    # `timeout=(1, 3, 11, 45)` (60s total), which is an "arbitrary"
+                    # exponential backoff sequence and is too long (see below).
+                    #
+                    # We want the total timeout to be below the overarching HTTP request
+                    # timeout (60s for federation requests) that triggered this lookup.
+                    # This way, we can see the underlying DNS failure and move on
+                    # instead of the user ending up with a generic HTTP request timeout.
+                    #
+                    # Since these DNS queries are done over UDP (an unreliable
+                    # transport), they are by their nature bound to occasionally fail
+                    # (dropped packets, etc.). We want a sequence that starts small and
+                    # re-issues DNS queries multiple times until we get a response or
+                    # time out.
+                    timeout=DNS_LOOKUP_TIMEOUTS,
+                )
            )
        except DNSNameError:
            # TODO: cache this. We can get the SOA out of the exception, and use
@@ -165,6 +198,10 @@ async def resolve_service(self, service_name: bytes) -> List[Server]:
                return list(cache_entry)
            else:
                raise e
+        except defer.TimeoutError as e:
+            raise defer.TimeoutError(
+                f"Timed out while trying to resolve the DNS SRV record for {service_name!r} (timeout={sum(DNS_LOOKUP_TIMEOUTS)}s)"
+            ) from e

        if (
            len(answers) == 1
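
As a rough illustration of the schedule above (a standalone sketch, not part of the change itself), the following prints the cumulative deadline after each reissued query, assuming the semantics quoted in the diff's comment: each tuple entry is the number of seconds to wait before reissuing the query, and the lookup is considered failed once the last entry expires.

# Standalone sketch: mirrors the DNS_LOOKUP_TIMEOUTS values added above.
DNS_LOOKUP_TIMEOUTS = (1, 3, 3, 3, 5)

elapsed = 0
for attempt, wait in enumerate(DNS_LOOKUP_TIMEOUTS, start=1):
    elapsed += wait
    print(f"attempt {attempt}: reissue (or give up) after {elapsed}s total")

# Worst case before the lookup is considered failed: the sum of all entries,
# which stays well under the 60s federation HTTP request timeout.
assert sum(DNS_LOOKUP_TIMEOUTS) == 15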