Skip to content

Nexus panic inside sync_switch_configuration background task #8579

@jgallagher

Description

@jgallagher

During this week's dogfood update, @qdzlug pulled back a corefile from Nexus (/staff/core/dogfood-2025-07-10/core.oxz_nexus_65a11c18-7f59-41ac-b9e7-680627f996e7.nexus.13246.1751604858) that has this backtrace:

fffff5ffcf3fcda0 libc.so.1`_lwp_kill+0xa()
fffff5ffcf3fcdd0 libc.so.1`raise+0x22(6)
fffff5ffcf3fce20 libc.so.1`abort+0x58()
fffff5ffcf3fce30 ~panic_abort::__rust_start_panic::abort::h6b7c627c7df281d0+8()
fffff5ffcf3fce40 ~__rustc::__rust_start_panic+8()
fffff5ffcf3fcea0 __rustc::rust_panic+0xd()
fffff5ffcf3fcf60 std::panicking::rust_panic_with_hook::hb8223ccd7c528a7f+0x22f()
fffff5ffcf3fcfa0 std::panicking::begin_panic_handler::{{closure}}::hcec802fa388f0165+0x98()
fffff5ffcf3fcfb0 ~std::sys::backtrace::__rust_end_short_backtrace::hd75f19ce2a7de6c4+8()
fffff5ffcf3fcfe0 ~__rustc::rust_begin_unwind+0x1b()
fffff5ffcf3fd010 ~core::panicking::panic_fmt::hbf9bff9fdba4d13d+0x1e()
fffff5ffcf3fd070 ~core::option::expect_failed::h53529608804c4c93+0x59()
fffff5ffcf3fd0f0 _$LT$core..iter..adapters..map..Map$LT$I$C$F$GT$$u20$as$u20$core..iter..traits..iterator..Iterator$GT$::next::hcea4c484b37928bc +0x1d5()
fffff5ffcf3fd220 <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter::h471350ba108c5dcd+0x29()
fffff5ffcf3ff7e0 <omicron_nexus::app::background::tasks::sync_switch_configuration::SwitchPortSettingsManager as omicron_nexus::app::background::BackgroundTask>::activate::{{closure}}::hc91b9b54be96a915+0x20f29()
fffff5ffcf3ff9b0 omicron_nexus::app::background::driver::TaskExec::activate::{{closure}}::hdc65aaf0ee82f7e4+0x266()
fffff5ffcf3ffa30 omicron_nexus::app::background::driver::TaskExec::run::_$u7b$$u7b$closure$u7d$$u7d$::hb55c6c615dc48585 +0x1ba()
fffff5ffcf3ffad0 tokio::runtime::task::harness::Harness<T,S>::poll::hc86b69e23fcf020f+0x7a()
fffff5ffcf3ffb20 tokio::runtime::scheduler::multi_thread::worker::Context::run_task::h220372af2b69c236+0x125()
fffff5ffcf3ffbb0 tokio::runtime::scheduler::multi_thread::worker::Context::run::ha6137785420c341f+0xc0e()
fffff5ffcf3ffc10 tokio::runtime::context::scoped::Scoped<T>::set::h404d11d03fae2798+0x2a()
fffff5ffcf3ffcd0 tokio::runtime::context::runtime::enter_runtime::hbea92029ca951a9a+0xb1()
fffff5ffcf3ffd10 tokio::runtime::scheduler::multi_thread::worker::run::h85e66819c406cdd0+0xa0()
fffff5ffcf3ffd70 tokio::runtime::task::core::Core<T,S>::poll::hd8de6c001086b642+0x70()
fffff5ffcf3ffdc0 tokio::runtime::task::harness::Harness<T,S>::poll::hd41585bd4a0d94e4+0x8a()
fffff5ffcf3ffe70 tokio::runtime::blocking::pool::Inner::run::hfccadbb87964bbce+0xe4()
fffff5ffcf3ffec0 std::sys::backtrace::__rust_begin_short_backtrace::h0d8835be7d5c7d5b+0x72()
fffff5ffcf3fff60 core::ops::function::FnOnce::call_once{{vtable.shim}}::h2c58797cb5db6f94+0x97()
fffff5ffcf3fffb0 std::sys::pal::unix::thread::Thread::new::thread_start::hb8b719afb1b31961+0x2b()
fffff5ffcf3fffe0 libc.so.1`_thrp_setup+0x77(fffff5ffedd7ca40)
fffff5ffcf3ffff0 libc.so.1`_lwp_start()

I also pulled back the Nexus logfile covering this time as /staff/core/dogfood-2025-07-10/oxz_nexus_65a11c18-7f59-41ac-b9e7-680627f996e7.log.1751605201; it contains this for the panic:

thread 'tokio-runtime-worker-121' panicked at nexus/src/app/background/tasks/sync_switch_configuration.rs:894:26:
bgp config is present but announce set is not populated
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

which points to this .expect():

// TODO: is this correct? Do we place the BgpConfig for both switches in a single Vec to send to the bootstore?
let mut bgp: Vec<SledBgpConfig> = switch_bgp_config.iter().map(|(_location, (_id, config))| {
let announcements = bgp_announce_prefixes
.get(&config.bgp_announce_set_id)
.expect("bgp config is present but announce set is not populated")
.iter()
.map(|prefix| {
Ipv4Net::new(prefix.value, prefix.length)
.expect("Prefix4 and Ipv4Net's value types have diverged")
}).collect();

A couple of questions:

  1. Can we change this .expect() to some kind of non-fatal error?
  2. Is this .expect() firing indicative of some other problem?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions