Skip to content
This repository was archived by the owner on Jan 27, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 40 additions & 13 deletions crates/orchestrator/src/discovery/monitor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -309,23 +309,40 @@ impl DiscoveryMonitor {
if !discovery_node.is_active && existing_node.status == NodeStatus::Healthy {
// Node is active False but we have it in store and it is healthy
// This means that the node likely got kicked by e.g. the validator
// We simply remove it from the store now and will rediscover it later?
info!(
"Node {} is no longer active on chain, marking as ejected",
node_address
);
if !discovery_node.is_provider_whitelisted {
if let Err(e) = self
.update_node_status(&node_address, NodeStatus::Ejected)
// Add a grace period check to avoid immediately marking nodes that just became healthy
let should_mark_inactive =
if let Some(last_status_change) = existing_node.last_status_change {
let grace_period = chrono::Duration::minutes(5); // 5 minute grace period
Comment thread
JannikSt marked this conversation as resolved.
let now = chrono::Utc::now();
now.signed_duration_since(last_status_change) > grace_period
} else {
// If no last_status_change, assume it's been healthy for a while
true
};

if should_mark_inactive {
info!(
"Node {} is no longer active on chain, marking as ejected",
node_address
);
if !discovery_node.is_provider_whitelisted {
if let Err(e) = self
.update_node_status(&node_address, NodeStatus::Ejected)
.await
{
error!("Error updating node status: {}", e);
}
} else if let Err(e) = self
.update_node_status(&node_address, NodeStatus::Dead)
.await
{
error!("Error updating node status: {}", e);
}
} else if let Err(e) = self
.update_node_status(&node_address, NodeStatus::Dead)
.await
{
error!("Error updating node status: {}", e);
} else {
info!(
"Node {} is no longer active on chain but recently became healthy, waiting before marking inactive",
node_address
);
}
}

Expand Down Expand Up @@ -359,6 +376,16 @@ impl DiscoveryMonitor {
) {
if last_change < last_updated {
info!("Node {} is dead but has been updated on discovery, marking as discovered", node_address);

if existing_node.compute_specs != discovery_node.compute_specs {
info!(
"Node {} compute specs changed, marking as discovered",
node_address
);
let mut node = existing_node.clone();
node.compute_specs = discovery_node.compute_specs.clone();
let _ = self.store_context.node_store.add_node(node.clone()).await;
}
if let Err(e) = self
.update_node_status(&node_address, NodeStatus::Discovered)
.await
Expand Down
2 changes: 1 addition & 1 deletion crates/orchestrator/src/store/domains/heartbeat_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ impl HeartbeatStore {
con.set_options::<_, _, ()>(
&key,
payload_string,
redis::SetOptions::default().with_expiration(redis::SetExpiry::EX(180)),
redis::SetOptions::default().with_expiration(redis::SetExpiry::EX(90)),
Comment thread
JannikSt marked this conversation as resolved.
)
.await
.map_err(|_| anyhow!("Failed to set options"))?;
Expand Down
3 changes: 2 additions & 1 deletion crates/shared/src/security/auth_signature_middleware.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const MAX_NONCE_LENGTH: usize = 64;
const MIN_NONCE_LENGTH: usize = 16;
const RATE_LIMIT_WINDOW_SECS: u64 = 60;
const MAX_REQUESTS_PER_WINDOW: usize = 100;
const REQUEST_EXPIRY_SECS: u64 = 300;

type SyncAddressValidator = Arc<dyn Fn(&Address) -> bool + Send + Sync>;
type AsyncAddressValidator = Arc<dyn Fn(&Address) -> LocalBoxFuture<'static, bool> + Send + Sync>;
Expand Down Expand Up @@ -445,7 +446,7 @@ where
.duration_since(UNIX_EPOCH)
.unwrap()
.as_secs();
if current_time - timestamp > 10 {
if current_time - timestamp > REQUEST_EXPIRY_SECS {
return Err(ErrorBadRequest(json!({
"error": "Request expired",
"code": "REQUEST_EXPIRED",
Expand Down
2 changes: 1 addition & 1 deletion crates/worker/src/operations/heartbeat/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ impl HeartbeatService {
state: Arc<SystemState>,
) -> Result<Arc<Self>, HeartbeatError> {
let client = Client::builder()
.timeout(Duration::from_secs(5)) // 5 second timeout
.timeout(Duration::from_secs(20))
Comment thread
JannikSt marked this conversation as resolved.
.build()
.map_err(|_| HeartbeatError::InitFailed)?; // Adjusted to match the expected error type

Expand Down