
[sled-agent] Integrate config-reconciler #8064

Draft · wants to merge 2 commits into base: main
1 change: 0 additions & 1 deletion clients/sled-agent-client/src/lib.rs
@@ -67,7 +67,6 @@ progenitor::generate_api!(
OmicronPhysicalDiskConfig = omicron_common::disk::OmicronPhysicalDiskConfig,
OmicronPhysicalDisksConfig = omicron_common::disk::OmicronPhysicalDisksConfig,
OmicronSledConfig = nexus_sled_agent_shared::inventory::OmicronSledConfig,
OmicronSledConfigResult = nexus_sled_agent_shared::inventory::OmicronSledConfigResult,
OmicronZoneConfig = nexus_sled_agent_shared::inventory::OmicronZoneConfig,
OmicronZoneDataset = nexus_sled_agent_shared::inventory::OmicronZoneDataset,
OmicronZoneImageSource = nexus_sled_agent_shared::inventory::OmicronZoneImageSource,
52 changes: 0 additions & 52 deletions dev-tools/omdb/src/bin/omdb/sled_agent.rs
@@ -34,14 +34,6 @@ enum SledAgentCommands {
#[clap(subcommand)]
Zones(ZoneCommands),

/// print information about zpools
Collaborator: Are you expecting that inventory will supplant this info? Or are you planning on replacing this access to the sled agent later?

Contributor Author: I was expecting that inventory would supplant this. (I think maybe it already has, in practice? I definitely only look at inventory when I'm curious about zpools; I don't think I've ever used these omdb subcommands.)

#[clap(subcommand)]
Zpools(ZpoolCommands),

/// print information about datasets
#[clap(subcommand)]
Datasets(DatasetCommands),

/// print information about the local bootstore node
#[clap(subcommand)]
Bootstore(BootstoreCommands),
@@ -97,12 +89,6 @@ impl SledAgentArgs {
SledAgentCommands::Zones(ZoneCommands::List) => {
cmd_zones_list(&client).await
}
SledAgentCommands::Zpools(ZpoolCommands::List) => {
cmd_zpools_list(&client).await
}
SledAgentCommands::Datasets(DatasetCommands::List) => {
cmd_datasets_list(&client).await
}
SledAgentCommands::Bootstore(BootstoreCommands::Status) => {
cmd_bootstore_status(&client).await
}
@@ -129,44 +115,6 @@ async fn cmd_zones_list(
Ok(())
}

/// Runs `omdb sled-agent zpools list`
async fn cmd_zpools_list(
client: &sled_agent_client::Client,
) -> Result<(), anyhow::Error> {
let response = client.zpools_get().await.context("listing zpools")?;
let zpools = response.into_inner();

println!("zpools:");
if zpools.is_empty() {
println!(" <none>");
}
for zpool in &zpools {
println!(" {:?}", zpool);
}

Ok(())
}

/// Runs `omdb sled-agent datasets list`
async fn cmd_datasets_list(
client: &sled_agent_client::Client,
) -> Result<(), anyhow::Error> {
let response = client.datasets_get().await.context("listing datasets")?;
let response = response.into_inner();

println!("dataset configuration @ generation {}:", response.generation);
let datasets = response.datasets;

if datasets.is_empty() {
println!(" <none>");
}
for dataset in &datasets {
println!(" {:?}", dataset);
}

Ok(())
}

/// Runs `omdb sled-agent bootstore status`
async fn cmd_bootstore_status(
client: &sled_agent_client::Client,
15 changes: 1 addition & 14 deletions nexus-sled-agent-shared/src/inventory.rs
@@ -14,10 +14,7 @@ use omicron_common::{
external::{ByteCount, Generation},
internal::shared::{NetworkInterface, SourceNatConfig},
},
disk::{
DatasetConfig, DatasetManagementStatus, DiskManagementStatus,
DiskVariant, OmicronPhysicalDiskConfig,
},
disk::{DatasetConfig, DiskVariant, OmicronPhysicalDiskConfig},
zpool_name::ZpoolName,
};
use omicron_uuid_kinds::{DatasetUuid, OmicronZoneUuid};
@@ -130,8 +127,6 @@ pub enum SledRole {
}

/// Describes the set of Reconfigurator-managed configuration elements of a sled
// TODO this struct should have a generation number; at the moment, each of
// the fields has a separete one internally.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)]
pub struct OmicronSledConfig {
pub generation: Generation,
@@ -140,14 +135,6 @@ pub struct OmicronSledConfig {
pub zones: IdMap<OmicronZoneConfig>,
}

/// Result of the currently-synchronous `omicron_config_put` endpoint.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[must_use = "this `DatasetManagementResult` may contain errors, which should be handled"]
pub struct OmicronSledConfigResult {
pub disks: Vec<DiskManagementStatus>,
pub datasets: Vec<DatasetManagementStatus>,
}

/// Describes the set of Omicron-managed zones running on a sled
#[derive(
Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq, Hash,
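
With the synchronous result type gone, `OmicronSledConfig` is the single generation-stamped object a sled receives; callers learn the outcome later from inventory rather than from the put's return value. A minimal sketch of constructing an empty config follows. The hunk above elides everything between `generation` and `zones`, so the disk and dataset fields here are assumptions, as is `Default` producing empty collections:

use nexus_sled_agent_shared::inventory::OmicronSledConfig;
use omicron_common::api::external::Generation;

fn empty_sled_config() -> OmicronSledConfig {
    OmicronSledConfig {
        // A single top-level generation now covers the whole config, which
        // is what made the removed per-field-generation TODO obsolete.
        generation: Generation::new(),
        // Assumed fields: elided in the hunk above; empty maps are a guess.
        disks: Default::default(),
        datasets: Default::default(),
        zones: Default::default(),
    }
}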
79 changes: 9 additions & 70 deletions nexus/reconfigurator/execution/src/omicron_sled_config.rs
@@ -10,15 +10,13 @@ use anyhow::anyhow;
use futures::StreamExt;
use futures::stream;
use nexus_db_queries::context::OpContext;
use nexus_sled_agent_shared::inventory::OmicronSledConfigResult;
use nexus_types::deployment::BlueprintSledConfig;
use omicron_uuid_kinds::GenericUuid;
use omicron_uuid_kinds::SledUuid;
use slog::Logger;
use slog::info;
use slog::warn;
use slog_error_chain::InlineErrorChain;
use std::collections::BTreeMap;
use update_engine::merge_anyhow_list;

/// Idempotently ensure that the specified Omicron sled configs are deployed to
/// the corresponding sleds
@@ -63,13 +61,14 @@ pub(crate) async fn deploy_sled_configs(
format!("Failed to put {config:#?} to sled {sled_id}")
});
match result {
Ok(_) => None,
Err(error) => {
warn!(log, "{error:#}");
warn!(
log, "failed to put sled config";
InlineErrorChain::new(error.as_ref()),
);
Some(error)
}
Ok(result) => {
parse_config_result(result.into_inner(), &log).err()
}
}
})
.collect()
Expand All @@ -78,69 +77,6 @@ pub(crate) async fn deploy_sled_configs(
if errors.is_empty() { Ok(()) } else { Err(errors) }
}

fn parse_config_result(
result: OmicronSledConfigResult,
log: &Logger,
) -> anyhow::Result<()> {
let (disk_errs, disk_successes): (Vec<_>, Vec<_>) =
result.disks.into_iter().partition(|status| status.err.is_some());

if !disk_errs.is_empty() {
warn!(
log,
"Failed to deploy disks for sled agent";
"successfully configured disks" => disk_successes.len(),
"failed disk configurations" => disk_errs.len(),
);
for err in &disk_errs {
warn!(log, "{err:?}");
}
return Err(merge_anyhow_list(disk_errs.into_iter().map(|status| {
anyhow!(
"failed to deploy disk {:?}: {:#}",
status.identity,
// `disk_errs` was partitioned by `status.err.is_some()`, so
// this is safe to unwrap.
status.err.unwrap(),
)
})));
}

let (dataset_errs, dataset_successes): (Vec<_>, Vec<_>) =
result.datasets.into_iter().partition(|status| status.err.is_some());

if !dataset_errs.is_empty() {
warn!(
log,
"Failed to deploy datasets for sled agent";
"successfully configured datasets" => dataset_successes.len(),
"failed dataset configurations" => dataset_errs.len(),
);
for err in &dataset_errs {
warn!(log, "{err:?}");
}
return Err(merge_anyhow_list(dataset_errs.into_iter().map(
|status| {
anyhow!(
"failed to deploy dataset {}: {:#}",
status.dataset_name.full_name(),
// `dataset_errs` was partitioned by `status.err.is_some()`,
// so this is safe to unwrap.
status.err.unwrap(),
)
},
)));
}

info!(
log,
"Successfully deployed config to sled agent";
"successfully configured disks" => disk_successes.len(),
"successfully configured datasets" => dataset_successes.len(),
);
Ok(())
}

#[cfg(test)]
mod tests {
use super::*;
@@ -326,6 +262,9 @@ mod tests {

// Observe the latest configuration stored on the simulated sled agent,
// and verify that this output matches the input.
//
// TODO-cleanup Simulated sled-agent should report a unified
// `OmicronSledConfig`.
let observed_disks =
sim_sled_agent.omicron_physical_disks_list().unwrap();
let observed_datasets = sim_sled_agent.datasets_config_list().unwrap();
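
One detail worth calling out from the diff above: since `omicron_config_put` no longer returns a body to parse, `deploy_sled_configs` only has HTTP-level failures to collect, which is why `parse_config_result` and its per-disk and per-dataset status partitioning could be deleted wholesale. A self-contained sketch of that fan-out-and-collect-errors shape, with stand-in names (`put_config`, plain `String` errors) and assuming `tokio` and `futures` as dependencies:

use futures::{StreamExt, stream};

async fn put_config(sled: u32) -> Result<(), String> {
    // Stand-in for `client.omicron_config_put(...)`; fails for odd sleds.
    if sled % 2 == 0 { Ok(()) } else { Err(format!("sled {sled}: put failed")) }
}

#[tokio::main]
async fn main() {
    let errors: Vec<String> = stream::iter(0u32..4)
        .filter_map(|sled| async move {
            // `Ok(_) => None`: nothing left to inspect on success.
            put_config(sled).await.err()
        })
        .collect()
        .await;
    // Mirrors the function's final line:
    // `if errors.is_empty() { Ok(()) } else { Err(errors) }`.
    assert_eq!(errors.len(), 2);
}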
11 changes: 1 addition & 10 deletions nexus/reconfigurator/planning/src/example.rs
@@ -14,7 +14,6 @@ use crate::planner::rng::PlannerRng;
use crate::system::SledBuilder;
use crate::system::SystemDescription;
use nexus_inventory::CollectionBuilderRng;
use nexus_sled_agent_shared::inventory::OmicronZonesConfig;
use nexus_types::deployment::Blueprint;
use nexus_types::deployment::OmicronZoneNic;
use nexus_types::deployment::PlanningInput;
@@ -486,15 +485,7 @@

for (sled_id, sled_cfg) in &blueprint.sleds {
let sled_cfg = sled_cfg.clone().into_in_service_sled_config();
system
.sled_set_omicron_zones(
*sled_id,
OmicronZonesConfig {
generation: sled_cfg.generation,
zones: sled_cfg.zones.into_iter().collect(),
},
)
.unwrap();
system.sled_set_omicron_config(*sled_id, sled_cfg).unwrap();
}

// We just ensured that a handful of datasets should exist in
14 changes: 5 additions & 9 deletions nexus/reconfigurator/planning/src/planner.rs
@@ -996,7 +996,6 @@ pub(crate) mod test {
use clickhouse_admin_types::ClickhouseKeeperClusterMembership;
use clickhouse_admin_types::KeeperId;
use expectorate::assert_contents;
use nexus_sled_agent_shared::inventory::OmicronZonesConfig;
use nexus_types::deployment::BlueprintDatasetDisposition;
use nexus_types::deployment::BlueprintDiffSummary;
use nexus_types::deployment::BlueprintPhysicalDiskDisposition;
@@ -1181,18 +1180,15 @@
// example.collection -- this should be addressed via API improvements.
example
.system
.sled_set_omicron_zones(new_sled_id, {
let sled_cfg = blueprint4
.sled_set_omicron_config(
new_sled_id,
blueprint4
.sleds
.get(&new_sled_id)
.expect("blueprint should contain zones for new sled")
.clone()
.into_in_service_sled_config();
OmicronZonesConfig {
generation: sled_cfg.generation,
zones: sled_cfg.zones.into_iter().collect(),
}
})
.into_in_service_sled_config(),
)
.unwrap();
let collection =
example.system.to_collection_builder().unwrap().build();
20 changes: 15 additions & 5 deletions nexus/reconfigurator/planning/src/system.rs
@@ -17,6 +17,7 @@ use nexus_sled_agent_shared::inventory::Inventory;
use nexus_sled_agent_shared::inventory::InventoryDataset;
use nexus_sled_agent_shared::inventory::InventoryDisk;
use nexus_sled_agent_shared::inventory::InventoryZpool;
use nexus_sled_agent_shared::inventory::OmicronSledConfig;
use nexus_sled_agent_shared::inventory::OmicronZonesConfig;
use nexus_sled_agent_shared::inventory::SledRole;
use nexus_types::deployment::ClickhousePolicy;
@@ -353,7 +354,7 @@ impl SystemDescription {
!self.sleds.is_empty()
}

/// Set Omicron zones for a sled.
/// Set Omicron config for a sled.
///
/// The zones will be reported in the collection generated by
/// [`Self::to_collection_builder`].
Expand All @@ -362,17 +363,26 @@ impl SystemDescription {
///
/// # Notes
///
/// It is okay to call `sled_set_omicron_zones` in ways that wouldn't
/// It is okay to call `sled_set_omicron_config` in ways that wouldn't
/// happen in production, such as to test illegal states.
pub fn sled_set_omicron_zones(
pub fn sled_set_omicron_config(
&mut self,
sled_id: SledUuid,
omicron_zones: OmicronZonesConfig,
sled_config: OmicronSledConfig,
) -> anyhow::Result<&mut Self> {
let sled = self.sleds.get_mut(&sled_id).with_context(|| {
format!("attempted to access sled {} not found in system", sled_id)
})?;
Arc::make_mut(sled).inventory_sled_agent.omicron_zones = omicron_zones;
let sled = Arc::make_mut(sled);
// TODO this inventory needs to be more robust for the reconciler;
// this is nontrivial so coming in a subsequent PR. For now just
[Comment on lines +377 to +378]

Collaborator: super-nit, link to PR, so this comment doesn't get lost?

Contributor Author: PR isn't open yet, so hard to link. 😅 However, I put a stub in RSS on this branch where we need the updated inventory:

// Wait until the config reconciler on the target sled has successfully
// reconciled the config at `generation`.
async fn wait_for_config_reconciliation_on_sled(
    &self,
    _sled_address: SocketAddrV6,
    _generation: Generation,
) -> Result<(), SetupServiceError> {
    unimplemented!("needs updated inventory")
}

which guarantees this PR can't pass the helios/deploy check until the inventory work is done.

// backfill the existing inventory structure.
sled.inventory_sled_agent.omicron_zones = OmicronZonesConfig {
generation: sled_config.generation,
zones: sled_config.zones.into_iter().collect(),
};
sled.inventory_sled_agent.omicron_physical_disks_generation =
sled_config.generation;
Ok(self)
}
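
Until the reconciler-aware inventory lands (per the TODO and review thread above), the unified config is projected back onto the legacy inventory types. A sketch of that backfill as a standalone helper, mirroring the assignment in the diff; the helper name is hypothetical, and the `into_iter().collect()` step assumes `IdMap` iteration yields zone configs, as the diff implies:

use nexus_sled_agent_shared::inventory::{OmicronSledConfig, OmicronZonesConfig};

fn legacy_zones_view(config: OmicronSledConfig) -> OmicronZonesConfig {
    OmicronZonesConfig {
        // The unified config's single generation stands in for the legacy
        // zones generation, exactly as the backfill above does for the
        // physical-disks generation as well.
        generation: config.generation,
        zones: config.zones.into_iter().collect(),
    }
}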
