Skip to content

Commit 486ac72

Browse files
committed
chore: tweak sqlx connections for flaky cross-cloud queries
1 parent 95738cb commit 486ac72

File tree

8 files changed

+368
-63
lines changed

8 files changed

+368
-63
lines changed

packages/common/chirp-workflow/core/src/db/crdb_nats/debug.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ impl DatabaseDebug for DatabaseCrdbNats {
4949
WHERE
5050
workflow_id = ANY($1)
5151
",
52-
workflow_ids,
52+
workflow_ids.clone(),
5353
)
5454
.await?;
5555

@@ -493,7 +493,7 @@ impl DatabaseDebug for DatabaseCrdbNats {
493493
FROM db_workflow.tagged_signals
494494
WHERE signal_id = ANY($1)
495495
",
496-
signal_ids,
496+
signal_ids.clone(),
497497
)
498498
.await?;
499499

packages/common/pools/src/db/crdb.rs

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -18,47 +18,37 @@ pub async fn setup(config: Config) -> Result<CrdbPool, Error> {
1818
}
1919

2020
let pool = sqlx::postgres::PgPoolOptions::new()
21-
// The default connection timeout is too high
22-
.acquire_timeout(Duration::from_secs(60))
21+
// Reduced from 60s to allow retries within API timeout (50s).
22+
// With 10s acquire + 8s query = 18s per attempt, allowing 2-3 retries.
23+
.acquire_timeout(Duration::from_secs(10))
2324
// Increase lifetime to mitigate: https://github.com/launchbadge/sqlx/issues/2854
2425
//
2526
// See max lifetime https://www.cockroachlabs.com/docs/stable/connection-pooling#set-the-maximum-lifetime-of-connections
26-
.max_lifetime(Duration::from_secs(15 * 60))
27+
//
28+
// Reduce this to < 10 minutes since GCP has a 10 minute idle TCP timeout that causes
29+
// problems. Unsure if idle_timeout is actually working correctly, so we're being cautious
30+
// here.
31+
.max_lifetime(Duration::from_secs(8 * 60))
2732
.max_lifetime_jitter(Duration::from_secs(90))
28-
// Remove connections after a while in order to reduce load
29-
// on CRDB after bursts
30-
.idle_timeout(Some(Duration::from_secs(10 * 60)))
33+
// Remove connections after a while in order to reduce load on CRDB after bursts.
34+
//
35+
// IMPORTANT: Must be less than 10 minutes due to GCP's connection tracking timeout.
36+
// See https://cloud.google.com/compute/docs/troubleshooting/general-tips
37+
.idle_timeout(Some(Duration::from_secs(5 * 60)))
3138
// Open connections immediately on startup
3239
.min_connections(crdb.min_connections)
3340
// Raise the cap, since this is effectively the amount of
3441
// simultaneous requests we can handle. See
3542
// https://www.cockroachlabs.com/docs/stable/connection-pooling.html
3643
.max_connections(crdb.max_connections)
37-
// NOTE: This is disabled until we can ensure that TCP connections stop getting dropped
38-
// on AWS.
39-
// // Speeds up requests at the expense of potential
40-
// // failures. See `before_acquire`.
41-
// .test_before_acquire(false)
42-
// // Ping once per minute to validate the connection is still alive
43-
// .before_acquire(|conn, meta| {
44-
// Box::pin(async move {
45-
// if meta.idle_for.as_secs() < 60 {
46-
// Ok(true)
47-
// } else {
48-
// match sqlx::Connection::ping(conn).await {
49-
// Ok(_) => Ok(true),
50-
// Err(err) => {
51-
// // See https://docs.aws.amazon.com/vpc/latest/userguide/nat-gateway-troubleshooting.html#nat-gateway-troubleshooting-timeout
52-
// tracing::warn!(
53-
// ?err,
54-
// "crdb ping failed, potential idle tcp connection drop"
55-
// );
56-
// Ok(false)
57-
// }
58-
// }
59-
// }
60-
// })
61-
// })
44+
// Ping connections before use to validate they're still alive.
45+
// This catches stale connections that may have been dropped by load balancers
46+
// or firewalls (e.g., GCP's 10-minute idle timeout, AWS NAT gateway timeout).
47+
.test_before_acquire(true)
48+
// NOTE: Server-side statement_timeout is not reliable for cross-cloud connections
49+
// because if the network is dead, CockroachDB can't send the timeout error back.
50+
// Instead, we use client-side timeout (tokio::time::timeout) in the SQL macros.
51+
// See QUERY_TIMEOUT_SECS in sql_query_macros.rs.
6252
.connect_with(opts)
6353
.await
6454
.map_err(Error::BuildSqlx)?;

packages/common/pools/src/metrics.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,26 @@ lazy_static::lazy_static! {
2222
&["action", "context_name", "location"],
2323
*REGISTRY,
2424
).unwrap();
25+
pub static ref SQL_QUERY_FAIL_TOTAL: IntCounterVec = register_int_counter_vec_with_registry!(
26+
"sql_query_fail_total",
27+
"Total number of failed queries.",
28+
&["action", "context_name", "location", "error_type"],
29+
*REGISTRY,
30+
).unwrap();
2531
pub static ref SQL_QUERY_DURATION: HistogramVec = register_histogram_vec_with_registry!(
2632
"sql_query_duration",
2733
"Total duration of sql query.",
2834
&["action", "context_name", "location"],
2935
BUCKETS.to_vec(),
3036
*REGISTRY,
3137
).unwrap();
38+
pub static ref SQL_QUERY_FAIL_DURATION: HistogramVec = register_histogram_vec_with_registry!(
39+
"sql_query_fail_duration",
40+
"Duration of failed sql queries (includes timeouts).",
41+
&["action", "context_name", "location", "error_type"],
42+
BUCKETS.to_vec(),
43+
*REGISTRY,
44+
).unwrap();
3245
pub static ref SQL_ACQUIRE_DURATION: HistogramVec = register_histogram_vec_with_registry!(
3346
"sql_acquire_duration",
3447
"Total duration to acquire an sql connection.",

0 commit comments

Comments
 (0)