diff --git a/crates/orchestrator/src/discovery/monitor.rs b/crates/orchestrator/src/discovery/monitor.rs index d5fee629..a5f31b29 100644 --- a/crates/orchestrator/src/discovery/monitor.rs +++ b/crates/orchestrator/src/discovery/monitor.rs @@ -309,23 +309,40 @@ impl DiscoveryMonitor { if !discovery_node.is_active && existing_node.status == NodeStatus::Healthy { // Node is active False but we have it in store and it is healthy // This means that the node likely got kicked by e.g. the validator - // We simply remove it from the store now and will rediscover it later? - info!( - "Node {} is no longer active on chain, marking as ejected", - node_address - ); - if !discovery_node.is_provider_whitelisted { - if let Err(e) = self - .update_node_status(&node_address, NodeStatus::Ejected) + // Add a grace period check to avoid immediately marking nodes that just became healthy + let should_mark_inactive = + if let Some(last_status_change) = existing_node.last_status_change { + let grace_period = chrono::Duration::minutes(5); // 5 minute grace period + let now = chrono::Utc::now(); + now.signed_duration_since(last_status_change) > grace_period + } else { + // If no last_status_change, assume it's been healthy for a while + true + }; + + if should_mark_inactive { + info!( + "Node {} is no longer active on chain, marking as ejected", + node_address + ); + if !discovery_node.is_provider_whitelisted { + if let Err(e) = self + .update_node_status(&node_address, NodeStatus::Ejected) + .await + { + error!("Error updating node status: {}", e); + } + } else if let Err(e) = self + .update_node_status(&node_address, NodeStatus::Dead) .await { error!("Error updating node status: {}", e); } - } else if let Err(e) = self - .update_node_status(&node_address, NodeStatus::Dead) - .await - { - error!("Error updating node status: {}", e); + } else { + info!( + "Node {} is no longer active on chain but recently became healthy, waiting before marking inactive", + node_address + ); } } @@ -359,6 +376,16 @@ impl DiscoveryMonitor { ) { if last_change < last_updated { info!("Node {} is dead but has been updated on discovery, marking as discovered", node_address); + + if existing_node.compute_specs != discovery_node.compute_specs { + info!( + "Node {} compute specs changed, marking as discovered", + node_address + ); + let mut node = existing_node.clone(); + node.compute_specs = discovery_node.compute_specs.clone(); + let _ = self.store_context.node_store.add_node(node.clone()).await; + } if let Err(e) = self .update_node_status(&node_address, NodeStatus::Discovered) .await diff --git a/crates/orchestrator/src/store/domains/heartbeat_store.rs b/crates/orchestrator/src/store/domains/heartbeat_store.rs index a54cb769..1011dc86 100644 --- a/crates/orchestrator/src/store/domains/heartbeat_store.rs +++ b/crates/orchestrator/src/store/domains/heartbeat_store.rs @@ -29,7 +29,7 @@ impl HeartbeatStore { con.set_options::<_, _, ()>( &key, payload_string, - redis::SetOptions::default().with_expiration(redis::SetExpiry::EX(180)), + redis::SetOptions::default().with_expiration(redis::SetExpiry::EX(90)), ) .await .map_err(|_| anyhow!("Failed to set options"))?; diff --git a/crates/shared/src/security/auth_signature_middleware.rs b/crates/shared/src/security/auth_signature_middleware.rs index 8732ae20..af63e318 100644 --- a/crates/shared/src/security/auth_signature_middleware.rs +++ b/crates/shared/src/security/auth_signature_middleware.rs @@ -32,6 +32,7 @@ const MAX_NONCE_LENGTH: usize = 64; const MIN_NONCE_LENGTH: usize = 16; const RATE_LIMIT_WINDOW_SECS: u64 = 60; const MAX_REQUESTS_PER_WINDOW: usize = 100; +const REQUEST_EXPIRY_SECS: u64 = 300; type SyncAddressValidator = Arc bool + Send + Sync>; type AsyncAddressValidator = Arc LocalBoxFuture<'static, bool> + Send + Sync>; @@ -445,7 +446,7 @@ where .duration_since(UNIX_EPOCH) .unwrap() .as_secs(); - if current_time - timestamp > 10 { + if current_time - timestamp > REQUEST_EXPIRY_SECS { return Err(ErrorBadRequest(json!({ "error": "Request expired", "code": "REQUEST_EXPIRED", diff --git a/crates/worker/src/operations/heartbeat/service.rs b/crates/worker/src/operations/heartbeat/service.rs index 6e5a291f..9c828cdd 100644 --- a/crates/worker/src/operations/heartbeat/service.rs +++ b/crates/worker/src/operations/heartbeat/service.rs @@ -44,7 +44,7 @@ impl HeartbeatService { state: Arc, ) -> Result, HeartbeatError> { let client = Client::builder() - .timeout(Duration::from_secs(5)) // 5 second timeout + .timeout(Duration::from_secs(20)) .build() .map_err(|_| HeartbeatError::InitFailed)?; // Adjusted to match the expected error type