Skip to content

Commit a9feea4

Browse files
fix: Summit cold start from checkpoint deadlock (#131)
1 parent 381e5be commit a9feea4

10 files changed

Lines changed: 732 additions & 35 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
.idea/
33
testnet/
44
.vscode/*
5+
.nvim.lua

finalizer/src/actor.rs

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,32 @@ impl<
220220
}))
221221
.await;
222222

223+
// Send initial forkchoice to the execution client so it knows the chain
224+
// head and can start P2P sync. Then wait for sync to complete before
225+
// replaying any blocks. Without this, catch-up blocks fail because the
226+
// execution client doesn't have them yet.
227+
{
228+
let forkchoice = self.canonical_state.forkchoice;
229+
if !forkchoice.head_block_hash.is_zero() {
230+
info!(
231+
head = %forkchoice.head_block_hash,
232+
"sending initial forkchoice update to execution client, waiting for sync..."
233+
);
234+
loop {
235+
let status = self.engine_client.commit_hash(forkchoice).await;
236+
if status.is_valid() {
237+
info!("execution client synced to checkpoint head, ready to replay blocks");
238+
break;
239+
} else if status.is_syncing() {
240+
warn!("execution client still syncing, waiting 5s...");
241+
self.context.sleep(std::time::Duration::from_secs(5)).await;
242+
} else {
243+
panic!("finalizer started with invalid forkchoice");
244+
}
245+
}
246+
}
247+
}
248+
223249
loop {
224250
if self.validator_exit
225251
&& is_first_block_of_epoch(
@@ -1070,7 +1096,21 @@ async fn execute_block<
10701096
// check the payload
10711097
#[cfg(feature = "prom")]
10721098
let payload_check_start = Instant::now();
1073-
let payload_status = engine_client.check_payload(block).await;
1099+
let payload_status = loop {
1100+
let status = engine_client.check_payload(block).await;
1101+
1102+
if status.is_syncing() {
1103+
error!(
1104+
height = block.height(),
1105+
"execution client returned SYNCING, sending forkchoice update to trigger sync and retrying..."
1106+
);
1107+
1108+
context.sleep(std::time::Duration::from_secs(5)).await;
1109+
continue;
1110+
}
1111+
break status;
1112+
};
1113+
10741114
let new_height = block.height();
10751115

10761116
#[cfg(feature = "prom")]
@@ -1081,7 +1121,7 @@ async fn execute_block<
10811121

10821122
// Validate block against execution layer state
10831123
// Note: withdrawals are validated in the application layer before voting
1084-
if payload_status.is_valid() && state.forkchoice.head_block_hash == block.eth_parent_hash() {
1124+
if payload_status.is_valid() {
10851125
let eth_hash = block.eth_block_hash();
10861126
let tx_count = block.payload.payload_inner.payload_inner.transactions.len();
10871127
info!(

finalizer/src/tests/fork_handling.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ fn test_orphaned_block_processed_when_parent_arrives() {
152152
archive_mode: false,
153153
mailbox_size: 100,
154154
db_prefix: "test_orphaned".to_string(),
155-
engine_client: MockEngineClient,
155+
engine_client: MockEngineClient::new(),
156156
oracle: MockNetworkOracle,
157157
orchestrator_mailbox,
158158
protocol_consts: ProtocolConsts {
@@ -233,7 +233,7 @@ fn test_multiple_forks_tracked() {
233233
archive_mode: false,
234234
mailbox_size: 100,
235235
db_prefix: "test_forks".to_string(),
236-
engine_client: MockEngineClient,
236+
engine_client: MockEngineClient::new(),
237237
oracle: MockNetworkOracle,
238238
orchestrator_mailbox,
239239
protocol_consts: ProtocolConsts {
@@ -315,7 +315,7 @@ fn test_dead_fork_block_discarded() {
315315
archive_mode: false,
316316
mailbox_size: 100,
317317
db_prefix: "test_dead_fork".to_string(),
318-
engine_client: MockEngineClient,
318+
engine_client: MockEngineClient::new(),
319319
oracle: MockNetworkOracle,
320320
orchestrator_mailbox,
321321
protocol_consts: ProtocolConsts {
@@ -413,7 +413,7 @@ fn test_fork_states_pruned_after_finalization() {
413413
archive_mode: false,
414414
mailbox_size: 100,
415415
db_prefix: "test_prune_forks".to_string(),
416-
engine_client: MockEngineClient,
416+
engine_client: MockEngineClient::new(),
417417
oracle: MockNetworkOracle,
418418
orchestrator_mailbox,
419419
protocol_consts: ProtocolConsts {
@@ -530,7 +530,7 @@ fn test_orphaned_blocks_pruned_after_finalization() {
530530
archive_mode: false,
531531
mailbox_size: 100,
532532
db_prefix: "test_prune_orphans".to_string(),
533-
engine_client: MockEngineClient,
533+
engine_client: MockEngineClient::new(),
534534
oracle: MockNetworkOracle,
535535
orchestrator_mailbox,
536536
protocol_consts: ProtocolConsts {
@@ -639,7 +639,7 @@ fn test_fork_state_reused_when_notarized_then_finalized() {
639639
archive_mode: false,
640640
mailbox_size: 100,
641641
db_prefix: "test_reuse".to_string(),
642-
engine_client: MockEngineClient,
642+
engine_client: MockEngineClient::new(),
643643
oracle: MockNetworkOracle,
644644
orchestrator_mailbox,
645645
protocol_consts: ProtocolConsts {
@@ -744,7 +744,7 @@ fn test_competing_fork_pruned_on_finalization() {
744744
archive_mode: false,
745745
mailbox_size: 100,
746746
db_prefix: "test_compete".to_string(),
747-
engine_client: MockEngineClient,
747+
engine_client: MockEngineClient::new(),
748748
oracle: MockNetworkOracle,
749749
orchestrator_mailbox,
750750
protocol_consts: ProtocolConsts {

finalizer/src/tests/mocks.rs

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
use alloy_primitives::{Address, FixedBytes, U256};
44
use alloy_rpc_types_engine::{
55
ExecutionPayloadEnvelopeV3, ExecutionPayloadEnvelopeV4, ExecutionPayloadV1, ExecutionPayloadV2,
6-
ExecutionPayloadV3, ForkchoiceState, PayloadId, PayloadStatus, PayloadStatusEnum,
6+
ExecutionPayloadV3, ForkchoiceState, ForkchoiceUpdated, PayloadId, PayloadStatus,
7+
PayloadStatusEnum,
78
};
89
use commonware_consensus::simplex::scheme::bls12381_multisig;
910
use commonware_consensus::simplex::types::{Finalization, Finalize, Proposal};
@@ -13,6 +14,8 @@ use commonware_cryptography::{Signer as _, ed25519};
1314
use commonware_math::algebra::Random;
1415
use commonware_parallel::Sequential;
1516
use commonware_utils::ordered::{BiMap, Map};
17+
use std::collections::VecDeque;
18+
use std::sync::{Arc, Mutex};
1619
use summit_types::network_oracle::NetworkOracle;
1720
use summit_types::{Block, Digest, EngineClient, PublicKey};
1821

@@ -76,9 +79,52 @@ pub fn make_finalization(
7679
Finalization::from_finalizes(&schemes[0], &finalizes, &Sequential).unwrap()
7780
}
7881

79-
/// Minimal mock EngineClient that accepts all blocks
82+
/// Minimal mock EngineClient that accepts all blocks.
83+
/// Supports queuing override responses for check_payload and commit_hash
84+
/// to simulate SYNCING behavior during tests.
8085
#[derive(Clone)]
81-
pub struct MockEngineClient;
86+
pub struct MockEngineClient {
87+
check_payload_overrides: Arc<Mutex<VecDeque<PayloadStatus>>>,
88+
commit_hash_overrides: Arc<Mutex<VecDeque<ForkchoiceUpdated>>>,
89+
}
90+
91+
impl MockEngineClient {
92+
pub fn new() -> Self {
93+
Self {
94+
check_payload_overrides: Arc::new(Mutex::new(VecDeque::new())),
95+
commit_hash_overrides: Arc::new(Mutex::new(VecDeque::new())),
96+
}
97+
}
98+
99+
/// Queue SYNCING responses for check_payload. After these are consumed,
100+
/// check_payload falls back to returning VALID.
101+
#[allow(unused)]
102+
pub fn queue_check_payload_syncing(&self, count: usize) {
103+
let mut overrides = self.check_payload_overrides.lock().unwrap();
104+
for _ in 0..count {
105+
overrides.push_back(PayloadStatus::new(PayloadStatusEnum::Syncing, None));
106+
}
107+
}
108+
109+
/// Queue SYNCING responses for commit_hash. After these are consumed,
110+
/// commit_hash falls back to returning VALID.
111+
#[allow(unused)]
112+
pub fn queue_commit_hash_syncing(&self, count: usize) {
113+
let mut overrides = self.commit_hash_overrides.lock().unwrap();
114+
for _ in 0..count {
115+
overrides.push_back(ForkchoiceUpdated {
116+
payload_status: PayloadStatus::new(PayloadStatusEnum::Syncing, None),
117+
payload_id: None,
118+
});
119+
}
120+
}
121+
}
122+
123+
impl Default for MockEngineClient {
124+
fn default() -> Self {
125+
Self::new()
126+
}
127+
}
82128

83129
impl EngineClient for MockEngineClient {
84130
#[allow(unused_variables)]
@@ -129,13 +175,27 @@ impl EngineClient for MockEngineClient {
129175
}
130176

131177
async fn check_payload(&mut self, _block: &Block) -> PayloadStatus {
178+
if let Some(override_status) = self.check_payload_overrides.lock().unwrap().pop_front() {
179+
return override_status;
180+
}
132181
PayloadStatus {
133182
status: PayloadStatusEnum::Valid,
134183
latest_valid_hash: Some([0u8; 32].into()),
135184
}
136185
}
137186

138-
async fn commit_hash(&mut self, _fork_choice_state: ForkchoiceState) {}
187+
async fn commit_hash(&mut self, _fork_choice_state: ForkchoiceState) -> ForkchoiceUpdated {
188+
if let Some(override_response) = self.commit_hash_overrides.lock().unwrap().pop_front() {
189+
return override_response;
190+
}
191+
ForkchoiceUpdated {
192+
payload_status: PayloadStatus {
193+
status: PayloadStatusEnum::Valid,
194+
latest_valid_hash: Some([0u8; 32].into()),
195+
},
196+
payload_id: None,
197+
}
198+
}
139199
}
140200

141201
/// Minimal mock NetworkOracle

finalizer/src/tests/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
mod fork_handling;
22
mod mocks;
33
mod state_queries;
4+
mod syncing;
45
mod validator_lifecycle;

finalizer/src/tests/state_queries.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ fn test_get_latest_epoch() {
163163
archive_mode: false,
164164
mailbox_size: 100,
165165
db_prefix: "test_epoch".to_string(),
166-
engine_client: MockEngineClient,
166+
engine_client: MockEngineClient::new(),
167167
oracle: MockNetworkOracle,
168168
orchestrator_mailbox,
169169
protocol_consts: ProtocolConsts {
@@ -274,7 +274,7 @@ fn test_get_epoch_genesis_hash() {
274274
archive_mode: false,
275275
mailbox_size: 100,
276276
db_prefix: "test_epoch_hash".to_string(),
277-
engine_client: MockEngineClient,
277+
engine_client: MockEngineClient::new(),
278278
oracle: MockNetworkOracle,
279279
orchestrator_mailbox,
280280
protocol_consts: ProtocolConsts {
@@ -371,7 +371,7 @@ fn test_get_aux_data_from_canonical_chain() {
371371
archive_mode: false,
372372
mailbox_size: 100,
373373
db_prefix: "test_aux_data".to_string(),
374-
engine_client: MockEngineClient,
374+
engine_client: MockEngineClient::new(),
375375
oracle: MockNetworkOracle,
376376
orchestrator_mailbox,
377377
protocol_consts: ProtocolConsts {
@@ -442,7 +442,7 @@ fn test_get_aux_data_returns_none_for_invalid_parent() {
442442
archive_mode: false,
443443
mailbox_size: 100,
444444
db_prefix: "test_aux_invalid".to_string(),
445-
engine_client: MockEngineClient,
445+
engine_client: MockEngineClient::new(),
446446
oracle: MockNetworkOracle,
447447
orchestrator_mailbox,
448448
protocol_consts: ProtocolConsts {

0 commit comments

Comments
 (0)