Skip to content

Commit d24ba55

Browse files
authored
[sled-agent-config-reconciler] Flesh out main reconicilation task (#8162)
The primary change here is to implement `ReconcilerTask::run()`. There are quite a few ancillary changes (new helper methods, etc.). I didn't add any tests of this task explicitly; it's _mostly_ straightline code that calls into subsystems that themselves have tests. Open to feedback whether you think this is okay or it needs some explicit tests of its own.
1 parent 69a8d6b commit d24ba55

File tree

9 files changed

+586
-164
lines changed

9 files changed

+586
-164
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

sled-agent/config-reconciler/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ chrono.workspace = true
1616
debug-ignore.workspace = true
1717
derive_more.workspace = true
1818
dropshot.workspace = true
19+
either.workspace = true
1920
futures.workspace = true
2021
glob.workspace = true
2122
id-map.workspace = true

sled-agent/config-reconciler/src/dataset_serialization_task.rs

+87-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ use illumos_utils::zfs::DestroyDatasetError;
2424
use illumos_utils::zfs::Mountpoint;
2525
use illumos_utils::zfs::WhichDatasets;
2626
use illumos_utils::zfs::Zfs;
27+
use illumos_utils::zpool::PathInPool;
28+
use illumos_utils::zpool::ZpoolOrRamdisk;
2729
use nexus_sled_agent_shared::inventory::InventoryDataset;
2830
use omicron_common::disk::DatasetConfig;
2931
use omicron_common::disk::DatasetKind;
@@ -32,6 +34,8 @@ use omicron_common::disk::SharedDatasetConfig;
3234
use omicron_common::zpool_name::ZpoolName;
3335
use omicron_uuid_kinds::DatasetUuid;
3436
use sled_storage::config::MountConfig;
37+
use sled_storage::dataset::U2_DEBUG_DATASET;
38+
use sled_storage::dataset::ZONE_DATASET;
3539
use sled_storage::manager::NestedDatasetConfig;
3640
use sled_storage::manager::NestedDatasetListOptions;
3741
use sled_storage::manager::NestedDatasetLocation;
@@ -83,6 +87,34 @@ pub enum DatasetEnsureError {
8387
TestError(&'static str),
8488
}
8589

90+
impl DatasetEnsureError {
91+
fn is_retryable(&self) -> bool {
92+
match self {
93+
// These errors might be retryable; there are probably cases where
94+
// they won't be, but we need more context than we have available
95+
// from just the error to know for sure. For now, assume they are
96+
// retryable - that may mean we churn on something doomed, but
97+
// that's better than failing to retry something we should have
98+
// retried.
99+
DatasetEnsureError::ZpoolNotFound(_)
100+
| DatasetEnsureError::EnsureFailed { .. } => true,
101+
102+
// Errors that we know aren't retryable: recovering from these
103+
// require config changes, so there's no need to retry until that
104+
// happens.
105+
DatasetEnsureError::TransientZoneRootNoConfig(_)
106+
| DatasetEnsureError::UuidMismatch { .. } => false,
107+
108+
DatasetEnsureError::TransientZoneRootFailure { err, .. } => {
109+
err.is_retryable()
110+
}
111+
112+
#[cfg(test)]
113+
DatasetEnsureError::TestError(_) => false,
114+
}
115+
}
116+
}
117+
86118
#[derive(Debug, thiserror::Error)]
87119
pub enum NestedDatasetMountError {
88120
#[error("could not mount dataset {}", .name)]
@@ -129,6 +161,60 @@ pub enum NestedDatasetListError {
129161
#[derive(Debug, Clone, Default)]
130162
pub(crate) struct DatasetEnsureResult(IdMap<SingleDatasetEnsureResult>);
131163

164+
impl DatasetEnsureResult {
165+
pub(crate) fn has_retryable_error(&self) -> bool {
166+
self.0.iter().any(|result| match &result.state {
167+
DatasetState::Ensured => false,
168+
DatasetState::FailedToEnsure(err) => err.is_retryable(),
169+
})
170+
}
171+
172+
pub(crate) fn all_mounted_debug_datasets<'a>(
173+
&'a self,
174+
mount_config: &'a MountConfig,
175+
) -> impl Iterator<Item = PathInPool> + 'a {
176+
self.all_mounted_datasets(mount_config, DatasetKind::Debug)
177+
}
178+
179+
pub(crate) fn all_mounted_zone_root_datasets<'a>(
180+
&'a self,
181+
mount_config: &'a MountConfig,
182+
) -> impl Iterator<Item = PathInPool> + 'a {
183+
self.all_mounted_datasets(mount_config, DatasetKind::TransientZoneRoot)
184+
}
185+
186+
fn all_mounted_datasets<'a>(
187+
&'a self,
188+
mount_config: &'a MountConfig,
189+
kind: DatasetKind,
190+
) -> impl Iterator<Item = PathInPool> + 'a {
191+
// We're a helper called by the pub methods on this type, so we only
192+
// have to handle the `kind`s they call us with.
193+
let mountpoint = match &kind {
194+
DatasetKind::Debug => U2_DEBUG_DATASET,
195+
DatasetKind::TransientZoneRoot => ZONE_DATASET,
196+
_ => unreachable!(
197+
"private function called with unexpected kind {kind:?}"
198+
),
199+
};
200+
self.0
201+
.iter()
202+
.filter(|result| match &result.state {
203+
DatasetState::Ensured => true,
204+
DatasetState::FailedToEnsure(_) => false,
205+
})
206+
.filter(move |result| *result.config.name.kind() == kind)
207+
.map(|result| {
208+
let pool = *result.config.name.pool();
209+
PathInPool {
210+
pool: ZpoolOrRamdisk::Zpool(pool),
211+
path: pool
212+
.dataset_mountpoint(&mount_config.root, mountpoint),
213+
}
214+
})
215+
}
216+
}
217+
132218
#[derive(Debug, Clone)]
133219
struct SingleDatasetEnsureResult {
134220
config: DatasetConfig,
@@ -149,7 +235,7 @@ enum DatasetState {
149235
FailedToEnsure(Arc<DatasetEnsureError>),
150236
}
151237

152-
#[derive(Debug)]
238+
#[derive(Debug, Clone)]
153239
pub(crate) struct DatasetTaskHandle(mpsc::Sender<DatasetTaskRequest>);
154240

155241
impl DatasetTaskHandle {

sled-agent/config-reconciler/src/handle.rs

+8-2
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ use crate::dump_setup_task;
5050
use crate::internal_disks::InternalDisksReceiver;
5151
use crate::ledger::LedgerTaskHandle;
5252
use crate::raw_disks;
53+
use crate::raw_disks::RawDisksReceiver;
5354
use crate::raw_disks::RawDisksSender;
5455
use crate::reconciler_task;
5556
use crate::reconciler_task::CurrentlyManagedZpools;
@@ -71,6 +72,7 @@ pub struct ConfigReconcilerSpawnToken {
7172
reconciler_result_tx: watch::Sender<ReconcilerResult>,
7273
currently_managed_zpools_tx: watch::Sender<Arc<CurrentlyManagedZpools>>,
7374
external_disks_tx: watch::Sender<HashSet<Disk>>,
75+
raw_disks_rx: RawDisksReceiver,
7476
ledger_task_log: Logger,
7577
reconciler_task_log: Logger,
7678
}
@@ -110,7 +112,7 @@ impl ConfigReconcilerHandle {
110112
let internal_disks_rx =
111113
InternalDisksReceiver::spawn_internal_disks_task(
112114
Arc::clone(&mount_config),
113-
raw_disks_rx,
115+
raw_disks_rx.clone(),
114116
base_log,
115117
);
116118

@@ -125,7 +127,7 @@ impl ConfigReconcilerHandle {
125127
);
126128

127129
let (reconciler_result_tx, reconciler_result_rx) =
128-
watch::channel(ReconcilerResult::default());
130+
watch::channel(ReconcilerResult::new(Arc::clone(&mount_config)));
129131
let (currently_managed_zpools_tx, currently_managed_zpools_rx) =
130132
watch::channel(Arc::default());
131133
let currently_managed_zpools_rx =
@@ -156,6 +158,7 @@ impl ConfigReconcilerHandle {
156158
reconciler_result_tx,
157159
currently_managed_zpools_tx,
158160
external_disks_tx,
161+
raw_disks_rx,
159162
ledger_task_log: base_log
160163
.new(slog::o!("component" => "SledConfigLedgerTask")),
161164
reconciler_task_log: base_log
@@ -188,6 +191,7 @@ impl ConfigReconcilerHandle {
188191
reconciler_result_tx,
189192
currently_managed_zpools_tx,
190193
external_disks_tx,
194+
raw_disks_rx,
191195
ledger_task_log,
192196
reconciler_task_log,
193197
} = token;
@@ -213,12 +217,14 @@ impl ConfigReconcilerHandle {
213217

214218
reconciler_task::spawn(
215219
Arc::clone(self.internal_disks_rx.mount_config()),
220+
self.dataset_task.clone(),
216221
key_requester,
217222
time_sync_config,
218223
current_config_rx,
219224
reconciler_result_tx,
220225
currently_managed_zpools_tx,
221226
external_disks_tx,
227+
raw_disks_rx,
222228
sled_agent_facilities,
223229
reconciler_task_log,
224230
);

sled-agent/config-reconciler/src/internal_disks.rs

+19-16
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ use tokio::sync::watch::error::RecvError;
4848
use crate::disks_common::MaybeUpdatedDisk;
4949
use crate::disks_common::update_properties_from_raw_disk;
5050
use crate::raw_disks::RawDiskWithId;
51+
use crate::raw_disks::RawDisksReceiver;
5152

5253
/// A thin wrapper around a [`watch::Receiver`] that presents a similar API.
5354
#[derive(Debug, Clone)]
@@ -143,7 +144,7 @@ impl InternalDisksReceiver {
143144

144145
pub(crate) fn spawn_internal_disks_task(
145146
mount_config: Arc<MountConfig>,
146-
raw_disks_rx: watch::Receiver<IdMap<RawDiskWithId>>,
147+
raw_disks_rx: RawDisksReceiver,
147148
base_log: &Logger,
148149
) -> Self {
149150
Self::spawn_with_disk_adopter(
@@ -160,7 +161,7 @@ impl InternalDisksReceiver {
160161

161162
fn spawn_with_disk_adopter<T: DiskAdopter>(
162163
mount_config: Arc<MountConfig>,
163-
raw_disks_rx: watch::Receiver<IdMap<RawDiskWithId>>,
164+
raw_disks_rx: RawDisksReceiver,
164165
base_log: &Logger,
165166
disk_adopter: T,
166167
) -> Self {
@@ -537,7 +538,7 @@ impl IdMappable for InternalDisk {
537538

538539
struct InternalDisksTask {
539540
// Input channel for changes to the raw disks sled-agent sees.
540-
raw_disks_rx: watch::Receiver<IdMap<RawDiskWithId>>,
541+
raw_disks_rx: RawDisksReceiver,
541542

542543
// The set of disks we've successfully started managing.
543544
disks_tx: watch::Sender<IdMap<InternalDisk>>,
@@ -934,11 +935,11 @@ mod tests {
934935
async fn only_m2_disks_are_adopted() {
935936
let logctx = dev::test_setup_log("only_m2_disks_are_adopted");
936937

937-
let (raw_disks_tx, raw_disks_rx) = watch::channel(IdMap::new());
938+
let (raw_disks_tx, raw_disks_rx) = watch::channel(Arc::default());
938939
let disk_adopter = Arc::new(TestDiskAdopter::default());
939940
let mut disks_rx = InternalDisksReceiver::spawn_with_disk_adopter(
940941
Arc::new(any_mount_config()),
941-
raw_disks_rx,
942+
RawDisksReceiver(raw_disks_rx),
942943
&logctx.log,
943944
Arc::clone(&disk_adopter),
944945
);
@@ -948,6 +949,7 @@ mod tests {
948949

949950
// Add four disks: two M.2 and two U.2.
950951
raw_disks_tx.send_modify(|disks| {
952+
let disks = Arc::make_mut(disks);
951953
for disk in [
952954
new_raw_test_disk(DiskVariant::M2, "m2-0"),
953955
new_raw_test_disk(DiskVariant::U2, "u2-0"),
@@ -994,12 +996,13 @@ mod tests {
994996

995997
// Setup: one disk.
996998
let mut raw_disk = new_raw_test_disk(DiskVariant::M2, "test-m2");
997-
let (raw_disks_tx, raw_disks_rx) =
998-
watch::channel([raw_disk.clone().into()].into_iter().collect());
999+
let (raw_disks_tx, raw_disks_rx) = watch::channel(Arc::new(
1000+
[raw_disk.clone().into()].into_iter().collect(),
1001+
));
9991002
let disk_adopter = Arc::new(TestDiskAdopter::default());
10001003
let mut disks_rx = InternalDisksReceiver::spawn_with_disk_adopter(
10011004
Arc::new(any_mount_config()),
1002-
raw_disks_rx,
1005+
RawDisksReceiver(raw_disks_rx),
10031006
&logctx.log,
10041007
Arc::clone(&disk_adopter),
10051008
);
@@ -1021,7 +1024,7 @@ mod tests {
10211024
);
10221025
*raw_disk.firmware_mut() = new_firmware;
10231026
raw_disks_tx.send_modify(|disks| {
1024-
disks.insert(raw_disk.clone().into());
1027+
Arc::make_mut(disks).insert(raw_disk.clone().into());
10251028
});
10261029

10271030
// Wait for the change to be noticed.
@@ -1051,17 +1054,17 @@ mod tests {
10511054
// Setup: two disks.
10521055
let raw_disk1 = new_raw_test_disk(DiskVariant::M2, "m2-1");
10531056
let raw_disk2 = new_raw_test_disk(DiskVariant::M2, "m2-2");
1054-
let (raw_disks_tx, raw_disks_rx) = watch::channel(
1057+
let (raw_disks_tx, raw_disks_rx) = watch::channel(Arc::new(
10551058
[&raw_disk1, &raw_disk2]
10561059
.into_iter()
10571060
.cloned()
10581061
.map(From::from)
10591062
.collect(),
1060-
);
1063+
));
10611064
let disk_adopter = Arc::new(TestDiskAdopter::default());
10621065
let mut disks_rx = InternalDisksReceiver::spawn_with_disk_adopter(
10631066
Arc::new(any_mount_config()),
1064-
raw_disks_rx,
1067+
RawDisksReceiver(raw_disks_rx),
10651068
&logctx.log,
10661069
Arc::clone(&disk_adopter),
10671070
);
@@ -1075,7 +1078,7 @@ mod tests {
10751078

10761079
// Remove test disk 1.
10771080
raw_disks_tx.send_modify(|raw_disks| {
1078-
raw_disks.remove(raw_disk1.identity());
1081+
Arc::make_mut(raw_disks).remove(raw_disk1.identity());
10791082
});
10801083

10811084
// Wait for the removal to be propagated.
@@ -1100,9 +1103,9 @@ mod tests {
11001103

11011104
// Setup: one disk, and configure the disk adopter to fail.
11021105
let raw_disk = new_raw_test_disk(DiskVariant::M2, "test-m2");
1103-
let (_raw_disks_tx, raw_disks_rx) = watch::channel(
1106+
let (_raw_disks_tx, raw_disks_rx) = watch::channel(Arc::new(
11041107
[&raw_disk].into_iter().cloned().map(From::from).collect(),
1105-
);
1108+
));
11061109
let disk_adopter = Arc::new(TestDiskAdopter::default());
11071110
disk_adopter.inner.lock().unwrap().should_fail_requests.insert(
11081111
raw_disk.identity().clone(),
@@ -1113,7 +1116,7 @@ mod tests {
11131116

11141117
let mut disks_rx = InternalDisksReceiver::spawn_with_disk_adopter(
11151118
Arc::new(any_mount_config()),
1116-
raw_disks_rx,
1119+
RawDisksReceiver(raw_disks_rx),
11171120
&logctx.log,
11181121
Arc::clone(&disk_adopter),
11191122
);

0 commit comments

Comments
 (0)