@@ -18,47 +18,37 @@ pub async fn setup(config: Config) -> Result<CrdbPool, Error> {
1818 }
1919
2020 let pool = sqlx:: postgres:: PgPoolOptions :: new ( )
21- // The default connection timeout is too high
22- . acquire_timeout ( Duration :: from_secs ( 60 ) )
21+ // Reduced from 60s to allow retries within API timeout (50s).
22+ // With 10s acquire + 8s query = 18s per attempt, allowing 2-3 retries.
23+ . acquire_timeout ( Duration :: from_secs ( 10 ) )
2324 // Increase lifetime to mitigate: https://github.com/launchbadge/sqlx/issues/2854
2425 //
2526 // See max lifetime https://www.cockroachlabs.com/docs/stable/connection-pooling#set-the-maximum-lifetime-of-connections
26- . max_lifetime ( Duration :: from_secs ( 15 * 60 ) )
27+ //
28+ // Reduce this to < 10 minutes since GCP has a 10 minute idle TCP timeout that causes
29+ // problems. Unsure if idle_timeout is actually working correctly, so we're being cautious
30+ // here.
31+ . max_lifetime ( Duration :: from_secs ( 8 * 60 ) )
2732 . max_lifetime_jitter ( Duration :: from_secs ( 90 ) )
28- // Remove connections after a while in order to reduce load
29- // on CRDB after bursts
30- . idle_timeout ( Some ( Duration :: from_secs ( 10 * 60 ) ) )
33+ // Remove connections after a while in order to reduce load on CRDB after bursts.
34+ //
35+ // IMPORTANT: Must be less than 10 minutes due to GCP's connection tracking timeout.
36+ // See https://cloud.google.com/compute/docs/troubleshooting/general-tips
37+ . idle_timeout ( Some ( Duration :: from_secs ( 5 * 60 ) ) )
3138 // Open connections immediately on startup
3239 . min_connections ( crdb. min_connections )
3340 // Raise the cap, since this is effectively the amount of
3441 // simultaneous requests we can handle. See
3542 // https://www.cockroachlabs.com/docs/stable/connection-pooling.html
3643 . max_connections ( crdb. max_connections )
37- // NOTE: This is disabled until we can ensure that TCP connections stop getting dropped
38- // on AWS.
39- // // Speeds up requests at the expense of potential
40- // // failures. See `before_acquire`.
41- // .test_before_acquire(false)
42- // // Ping once per minute to validate the connection is still alive
43- // .before_acquire(|conn, meta| {
44- // Box::pin(async move {
45- // if meta.idle_for.as_secs() < 60 {
46- // Ok(true)
47- // } else {
48- // match sqlx::Connection::ping(conn).await {
49- // Ok(_) => Ok(true),
50- // Err(err) => {
51- // // See https://docs.aws.amazon.com/vpc/latest/userguide/nat-gateway-troubleshooting.html#nat-gateway-troubleshooting-timeout
52- // tracing::warn!(
53- // ?err,
54- // "crdb ping failed, potential idle tcp connection drop"
55- // );
56- // Ok(false)
57- // }
58- // }
59- // }
60- // })
61- // })
44+ // Ping connections before use to validate they're still alive.
45+ // This catches stale connections that may have been dropped by load balancers
46+ // or firewalls (e.g., GCP's 10-minute idle timeout, AWS NAT gateway timeout).
47+ . test_before_acquire ( true )
48+ // NOTE: Server-side statement_timeout is not reliable for cross-cloud connections
49+ // because if the network is dead, CockroachDB can't send the timeout error back.
50+ // Instead, we use client-side timeout (tokio::time::timeout) in the SQL macros.
51+ // See QUERY_TIMEOUT_SECS in sql_query_macros.rs.
6252 . connect_with ( opts)
6353 . await
6454 . map_err ( Error :: BuildSqlx ) ?;
0 commit comments