From 2a360e41ce00cd97d6da3821de4f6a515d64e60a Mon Sep 17 00:00:00 2001
From: Dennis Maisenbacher
Date: Mon, 23 Jan 2023 09:19:32 +0100
Subject: [PATCH 01/11] feat(test): Enable PCIe device testing

Allow PCIe devices to be consumed by Mayastor tests. This can be used
locally when developing new features and later on for cargo tests on real
devices within the CI.

Signed-off-by: Dennis Maisenbacher
---
 doc/test.md               | 23 +++++++++++++++++++++++
 io-engine/src/core/env.rs |  2 +-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/doc/test.md b/doc/test.md
index 8000b5fdc..75b8ce882 100644
--- a/doc/test.md
+++ b/doc/test.md
@@ -55,6 +55,29 @@ Then, to run the tests:
 ./node_modules/mocha/bin/mocha test_csi.js
 ```
 
+## Using PCIe NVMe devices in cargo tests while developing
+
+When developing new features, it can be handy to test them against real PCIe devices.
+To do so, the PCIe device first needs to be bound to the vfio driver:
+
+```bash
+sudo PCI_ALLOWED="<PCI-ADDRESS>" ./spdk-rs/spdk/scripts/setup.sh
+```
+
+The bdev name in the cargo test case can then follow the PCIe URI pattern:
+
+```rust
+static BDEVNAME1: &str = "pcie:///<PCI-ADDRESS>";
+```
+
+After testing, the device may be rebound to the NVMe driver:
+
+```bash
+sudo PCI_ALLOWED="<PCI-ADDRESS>" ./spdk-rs/spdk/scripts/setup.sh reset
+```
+
+Please do not submit pull requests with cargo test cases that require PCIe devices to be present.
+
 [spdk]: https://spdk.io/
 [doc-run]: ./run.md
 [mocha]: https://mochajs.org/
diff --git a/io-engine/src/core/env.rs b/io-engine/src/core/env.rs
index f76a08548..b4ba8bcf6 100644
--- a/io-engine/src/core/env.rs
+++ b/io-engine/src/core/env.rs
@@ -283,7 +283,7 @@ impl Default for MayastorCliArgs {
             reactor_mask: "0x1".into(),
             mem_size: 0,
             rpc_address: "/var/tmp/mayastor.sock".to_string(),
-            no_pci: true,
+            no_pci: false,
             log_components: vec![],
             log_format: None,
             mayastor_config: None,

From 4010610d380765475ce1b02349baa7b299b8064d Mon Sep 17 00:00:00 2001
From: Dennis Maisenbacher
Date: Fri, 20 Jan 2023 14:23:35 +0100
Subject: [PATCH 02/11] feat(spdk): Allow SPDK tracing

Init SPDK tracing when `MayastorEnvironment.num_entries` is set to a
positive integer. This can be helpful when developing new features.
Traces can be found in `/dev/shm`, suffixed with the pid or, if it is
non-negative, the `MayastorEnvironment.shm_id`.
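A minimal usage sketch (it assumes `num_entries` can be set on a locally
constructed `MayastorEnvironment`; only the `init()` check below is part of
this patch):

```rust
// Hedged sketch: a non-zero num_entries makes init() call
// init_spdk_tracing(); with shm_id = 1 the trace buffer shows up as
// /dev/shm/<name>_trace.1, otherwise as /dev/shm/<name>_trace.pid<pid>.
let mut env = MayastorEnvironment::new(MayastorCliArgs::default());
env.num_entries = 32768; // number of trace entries to allocate
let _env = env.init();
```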
Further information about traces and how to read the captured traces can be found here: https://spdk.io/doc/nvmf_tgt_tracepoints.html Signed-off-by: Dennis Maisenbacher --- io-engine/src/core/env.rs | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/io-engine/src/core/env.rs b/io-engine/src/core/env.rs index b4ba8bcf6..100697721 100644 --- a/io-engine/src/core/env.rs +++ b/io-engine/src/core/env.rs @@ -328,6 +328,11 @@ pub static SIG_RECEIVED: Lazy = // FFI functions that are needed to initialize the environment extern "C" { pub fn rte_eal_init(argc: i32, argv: *mut *mut libc::c_char) -> i32; + pub fn spdk_trace_init(shm_name: *const c_char, num_entries: u64) -> i32; + pub fn spdk_trace_set_tpoints(group_id: u32, tpoint_mask: u64); + pub fn spdk_trace_create_tpoint_group_mask( + group_name: *const c_char, + ) -> u64; pub fn spdk_trace_cleanup(); pub fn spdk_env_dpdk_post_init(legacy_mem: bool) -> i32; pub fn spdk_env_fini(); @@ -941,6 +946,39 @@ impl MayastorEnvironment { None } + fn init_spdk_tracing(&self) { + const MAX_GROUP_IDS: u32 = 16; + let cshm_name = if self.shm_id >= 0 { + CString::new( + format!("/{}_trace.{}", self.name, self.shm_id).as_str(), + ) + .unwrap() + } else { + CString::new( + format!("/{}_trace.pid{}", self.name, std::process::id()) + .as_str(), + ) + .unwrap() + }; + unsafe { + if spdk_trace_init(cshm_name.as_ptr(), self.num_entries) != 0 { + error!("SPDK tracing init error"); + } + } + let tpoint_group_name = CString::new("all").unwrap(); + let tpoint_group_mask = unsafe { + spdk_trace_create_tpoint_group_mask(tpoint_group_name.as_ptr()) + }; + + for group_id in 0..MAX_GROUP_IDS { + if (tpoint_group_mask & (1 << group_id) as u64) > 0 { + unsafe { + spdk_trace_set_tpoints(group_id, u64::MAX); + } + } + } + } + /// initialize the core, call this before all else pub fn init(mut self) -> Self { // setup the logger as soon as possible @@ -1009,6 +1047,11 @@ impl MayastorEnvironment { // ensure we are within the context of a spdk thread from here Mthread::primary().set_current(); + // To enable SPDK tracing set self.num_entries (eg. to 32768). + if self.num_entries > 0 { + self.init_spdk_tracing(); + } + Reactor::block_on(async { let (sender, receiver) = oneshot::channel::(); From d79cc74f5ad80e5cb5c700fe9e669d4913bbc3e8 Mon Sep 17 00:00:00 2001 From: Dennis Maisenbacher Date: Fri, 20 Jan 2023 13:02:29 +0100 Subject: [PATCH 03/11] feat(nix): Add btrfs-progs package Add btrfs-progs package for btrfs support. This is a preperation commit for zoned storage support as btrfs also supports zoned storage. 
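A usage sketch for the test helper extended below (the target path is
illustrative):

```rust
// Format a scratch file/device with btrfs; with this change the helper
// shells out to `mkfs.btrfs -f -m single -d single /tmp/disk1.img`.
assert!(io_engine_tests::mkfs("/tmp/disk1.img", "btrfs"));
```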
Signed-off-by: Dennis Maisenbacher --- ci.nix | 1 + io-engine-tests/src/lib.rs | 7 ++-- io-engine/tests/mount_fs.rs | 1 + nix/overlay.nix | 1 + nix/pkgs/btrfs-progs/default.nix | 63 ++++++++++++++++++++++++++++ nix/pkgs/images/default.nix | 1 + nix/pkgs/io-engine/cargo-package.nix | 1 + nix/pkgs/io-engine/default.nix | 1 + 8 files changed, 73 insertions(+), 3 deletions(-) create mode 100644 nix/pkgs/btrfs-progs/default.nix diff --git a/ci.nix b/ci.nix index b0b4bbbb6..c04f46e45 100644 --- a/ci.nix +++ b/ci.nix @@ -26,6 +26,7 @@ mkShell { buildInputs = [ autoconf automake + btrfs-progs clang cowsay docker diff --git a/io-engine-tests/src/lib.rs b/io-engine-tests/src/lib.rs index f3163c71d..628b80b26 100644 --- a/io-engine-tests/src/lib.rs +++ b/io-engine-tests/src/lib.rs @@ -143,7 +143,7 @@ pub fn mayastor_test_init_ex(log_format: LogFormat) { }) } - ["dd", "mkfs.xfs", "mkfs.ext4", "cmp", "fsck", "truncate"] + ["dd", "mkfs.xfs", "mkfs.ext4", "mkfs.btrfs", "cmp", "fsck", "truncate"] .iter() .for_each(|binary| { if binary_present(binary).is_err() { @@ -202,8 +202,9 @@ pub fn fscheck(device: &str) { pub fn mkfs(path: &str, fstype: &str) -> bool { let (fs, args) = match fstype { - "xfs" => ("mkfs.xfs", ["-f", path]), - "ext4" => ("mkfs.ext4", ["-F", path]), + "xfs" => ("mkfs.xfs", vec!["-f", path]), + "ext4" => ("mkfs.ext4", vec!["-F", path]), + "btrfs" => ("mkfs.btrfs", vec!["-f", "-m", "single", "-d", "single", path]), _ => { panic!("unsupported fstype"); } diff --git a/io-engine/tests/mount_fs.rs b/io-engine/tests/mount_fs.rs index 014f5b0da..1faa8cfc5 100644 --- a/io-engine/tests/mount_fs.rs +++ b/io-engine/tests/mount_fs.rs @@ -124,6 +124,7 @@ async fn mount_fs_mirror() { mount_test(ms, "xfs").await; mount_test(ms, "ext4").await; + mount_test(ms, "btrfs").await; } #[tokio::test] diff --git a/nix/overlay.nix b/nix/overlay.nix index a23923cdc..99a87df15 100644 --- a/nix/overlay.nix +++ b/nix/overlay.nix @@ -4,6 +4,7 @@ let img_prefix = if product_prefix == "" then config.product_prefix else product_prefix; in self: super: rec { + btrfs-progs = super.callPackage ./pkgs/btrfs-progs { }; fio = super.callPackage ./pkgs/fio { }; sourcer = super.callPackage ./lib/sourcer.nix { }; images = super.callPackage ./pkgs/images { inherit img_tag img_org img_prefix; }; diff --git a/nix/pkgs/btrfs-progs/default.nix b/nix/pkgs/btrfs-progs/default.nix new file mode 100644 index 000000000..9eeab400c --- /dev/null +++ b/nix/pkgs/btrfs-progs/default.nix @@ -0,0 +1,63 @@ +# Source: https://github.com/NixOS/nixpkgs/blob/b483ef3cc866ae6afed65115ba6f0c6b19efce49/pkgs/tools/filesystems/btrfs-progs/default.nix +{ lib +, stdenv +, fetchurl +, pkg-config +, attr +, acl +, zlib +, libuuid +, e2fsprogs +, lzo +, asciidoc +, xmlto +, docbook_xml_dtd_45 +, docbook_xsl +, libxslt +, zstd +, python3 +}: + +stdenv.mkDerivation rec { + pname = "btrfs-progs"; + version = "5.14.1"; + + src = fetchurl { + url = "mirror://kernel/linux/kernel/people/kdave/btrfs-progs/btrfs-progs-v${version}.tar.xz"; + sha256 = "sha256-1UqTRlRcpG3xKOPMt31gwJfZDJO34xSZAjbijPr4xVs="; + }; + + nativeBuildInputs = [ + pkg-config + asciidoc + xmlto + docbook_xml_dtd_45 + docbook_xsl + libxslt + python3 + python3.pkgs.setuptools + ]; + + buildInputs = [ attr acl zlib libuuid e2fsprogs lzo zstd python3 ]; + + # for python cross-compiling + _PYTHON_HOST_PLATFORM = stdenv.hostPlatform.config; + + # gcc bug with -O1 on ARM with gcc 4.8 + # This should be fine on all platforms so apply universally + postPatch = "sed -i s/-O1/-O2/ configure"; + + postInstall = 
''
    install -v -m 444 -D btrfs-completion $out/share/bash-completion/completions/btrfs
  '';

  configureFlags = lib.optional stdenv.hostPlatform.isMusl "--disable-backtrace";

  meta = with lib; {
    description = "Utilities for the btrfs filesystem";
    homepage = "https://btrfs.wiki.kernel.org/";
    license = licenses.gpl2;
    maintainers = with maintainers; [ raskin ];
    platforms = platforms.linux;
  };
}
diff --git a/nix/pkgs/images/default.nix b/nix/pkgs/images/default.nix
index 867b94821..82aa7b62b 100644
--- a/nix/pkgs/images/default.nix
+++ b/nix/pkgs/images/default.nix
@@ -19,6 +19,7 @@
 , utillinux
 , writeScriptBin
 , xfsprogs
+, btrfs-progs
 , runCommand
 , tini
 , sourcer
diff --git a/nix/pkgs/io-engine/cargo-package.nix b/nix/pkgs/io-engine/cargo-package.nix
index bf03e2518..64bc6eb07 100644
--- a/nix/pkgs/io-engine/cargo-package.nix
+++ b/nix/pkgs/io-engine/cargo-package.nix
@@ -19,6 +19,7 @@
 , protobuf
 , sources
 , xfsprogs
+, btrfs-progs
 , utillinux
 , llvmPackages
 , targetPackages
diff --git a/nix/pkgs/io-engine/default.nix b/nix/pkgs/io-engine/default.nix
index 90bd6a585..ad32082c8 100644
--- a/nix/pkgs/io-engine/default.nix
+++ b/nix/pkgs/io-engine/default.nix
@@ -17,6 +17,7 @@
 , protobuf
 , sources
 , xfsprogs
+, btrfs-progs
 , utillinux
 , llvmPackages
 , targetPackages

From 7b8dac82674b9432b556ef3dfacffa43d09642a1 Mon Sep 17 00:00:00 2001
From: Dennis Maisenbacher
Date: Mon, 23 Jan 2023 10:59:31 +0100
Subject: [PATCH 04/11] refactor: Move device creation into helper function

Move children device creation out of `Nexus::new_child` into a helper
function. Create the children devices before the Nexus is created and add
them after the Nexus's initialization.
This is in preparation for zoned storage support, as we need to inspect
the children's device features before creating a Nexus, so that we can
decide whether the Nexus must be zoned or not.

Signed-off-by: Dennis Maisenbacher
---
 io-engine/src/bdev/nexus/nexus_bdev.rs     | 39 ++++++++++++++++++
 .../src/bdev/nexus/nexus_bdev_children.rs  | 11 +++---
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/io-engine/src/bdev/nexus/nexus_bdev.rs b/io-engine/src/bdev/nexus/nexus_bdev.rs
index 0ac9526a0..55073ee0d 100644
--- a/io-engine/src/bdev/nexus/nexus_bdev.rs
+++ b/io-engine/src/bdev/nexus/nexus_bdev.rs
@@ -38,7 +38,9 @@ use super::{
 
 use crate::{
     bdev::{
+        device_create,
         device_destroy,
+        device_lookup,
         nexus::{
             nexus_io_subsystem::NexusPauseState,
             nexus_persistence::PersistentNexusInfo,
@@ -1572,6 +1574,34 @@ pub async fn nexus_create_v2(
     }
 }
 
+async fn destroy_created_devices(devices: &[(String, String)]) {
+    for (uri, _) in devices {
+        if let Err(e) = device_destroy(uri).await {
+            error!("Destroying the device with {uri} was not successful. This device is dangling now. Error: {e:?}");
+        }
+    }
+}
+
+async fn create_children_devices(
+    children: &[String],
+) -> Result<Vec<(String, String)>, Error> {
+    let mut children_devices = Vec::new();
+
+    for uri in children {
+        let device_name = match device_create(uri).await {
+            Ok(d) => d,
+            Err(e) => {
+                destroy_created_devices(&children_devices).await;
+                return Err(e).context(nexus_err::CreateChild { name: uri });
+            }
+        };
+
+        children_devices.push((uri.clone(), device_name.clone()));
+    }
+
+    Ok(children_devices)
+}
+
 async fn nexus_create_internal(
     name: &str,
     size: u64,
@@ -1607,6 +1637,8 @@ async fn nexus_create_internal(
         return Ok(());
     }
 
+    let mut children_devices = create_children_devices(children).await?;
+
     // Create a new Nexus object, and immediately add it to the global list.
    // This is necessary to ensure proper cleanup, as the code responsible for
    // closing a child assumes that the nexus to which it belongs will appear
@@ -1621,8 +1653,9 @@ async fn nexus_create_internal(
         nexus_info_key,
     );
 
-    for uri in children {
-        if let Err(error) = nexus_bdev.data_mut().new_child(uri).await {
+    for (uri, device_name) in &children_devices {
+        if let Err(error) =
+            nexus_bdev.data_mut().new_child(uri, device_name).await
+        {
             error!(
                 "{n:?}: failed to add child '{uri}': {e}",
                 n = nexus_bdev.data(),
@@ -1636,6 +1669,8 @@ async fn nexus_create_internal(
                 uri
             );
 
+            destroy_created_devices(&children_devices).await;
+
             return Err(Error::CreateChild {
                 source: error,
                 name: name.to_owned(),
diff --git a/io-engine/src/bdev/nexus/nexus_bdev_children.rs b/io-engine/src/bdev/nexus/nexus_bdev_children.rs
index 2e5eca333..fd6a3381a 100644
--- a/io-engine/src/bdev/nexus/nexus_bdev_children.rs
+++ b/io-engine/src/bdev/nexus/nexus_bdev_children.rs
@@ -74,19 +74,20 @@ impl<'n> Nexus<'n> {
     /// nexus init phase
     pub async fn new_child(
         mut self: Pin<&mut Self>,
-        uri: &str,
+        device_uri: &str,
+        device_name: &str,
     ) -> Result<(), BdevError> {
         assert_eq!(*self.state.lock(), NexusState::Init);
 
-        info!("{:?}: adding child: '{}'...", self, uri);
+        info!("{:?}: adding child: '{}'...", self, device_uri);
 
         let nexus_name = self.nexus_name().to_owned();
 
-        let device_name = device_create(uri).await?;
+        let dev = device_lookup(device_name);
 
         let c = NexusChild::new(
-            uri.to_string(),
+            device_uri.to_string(),
             nexus_name,
-            device_lookup(&device_name),
+            dev,
         );
 
         info!("{:?}: added to nexus", c);

From 2c3f6e830bd99970455ca4dc2affef154c1f282c Mon Sep 17 00:00:00 2001
From: Dennis Maisenbacher
Date: Mon, 23 Jan 2023 11:52:33 +0100
Subject: [PATCH 05/11] feat(nexus): Add features for Zoned Storage Support

Add setters and getters for zoned-storage-related fields to the Nexus and
BlockDevice. Inspect the children before creating the Nexus to decide
whether the Nexus needs to be zoned or not. The Nexus inherits the
zoned-related fields from its child.
For now a zoned Nexus does not allow replication. Partitioning of a zoned
Nexus is also not supported for now: the user obtains the whole zoned
Nexus if requested.
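The resulting behaviour can be sketched with `nexus_create` as used by the
existing tests (the signature, URIs and size here are illustrative):

```rust
// A zoned child must currently be the nexus's only child: adding a
// second child alongside it fails (ZonedReplicationNotImplemented, or
// MixedZonedChild when zoned and conventional children are mixed).
let children = vec![
    "uring:///dev/nullb0?blk_size=4096".to_string(), // zoned null_blk
    "uring:///tmp/disk2.img?blk_size=4096".to_string(), // conventional
];
assert!(nexus_create("nexus", 64 * 1024 * 1024, None, &children)
    .await
    .is_err());
```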
Signed-off-by: Dennis Maisenbacher --- io-engine/src/bdev/device.rs | 42 ++++++- io-engine/src/bdev/nexus/nexus_bdev.rs | 94 +++++++++++++- io-engine/src/bdev/nexus/nexus_bdev_error.rs | 4 + io-engine/src/bdev/nexus/nexus_io.rs | 7 ++ io-engine/src/bdev/null_ng.rs | 2 + io-engine/src/bdev/nvmx/device.rs | 39 ++++++ io-engine/src/bdev/nvmx/namespace.rs | 39 ++++++ io-engine/src/core/block_device.rs | 126 ++++++++++++++++++- io-engine/src/core/mod.rs | 1 + nix/pkgs/libspdk/default.nix | 1 - spdk-rs | 2 +- 11 files changed, 350 insertions(+), 7 deletions(-) diff --git a/io-engine/src/bdev/device.rs b/io-engine/src/bdev/device.rs index f0fdde754..81085c882 100644 --- a/io-engine/src/bdev/device.rs +++ b/io-engine/src/bdev/device.rs @@ -60,6 +60,7 @@ use crate::{ UntypedBdev, UntypedBdevHandle, UntypedDescriptorGuard, + ZonedBlockDevice, }, lvs::Lvol, }; @@ -96,7 +97,7 @@ impl SpdkBlockDevice { pub fn lookup_by_name(name: &str) -> Option> { debug!("Searching SPDK devices for '{}'...", name); let bdev = UntypedBdev::lookup_by_name(name)?; - debug!("SPDK {} device found: '{}'", bdev.driver(), name); + debug!("SPDK {} device found: '{}'", bdev.driver(), bdev.name()); Some(Box::new(SpdkBlockDevice::new(bdev))) } @@ -146,8 +147,16 @@ impl BlockDevice for SpdkBlockDevice { } /// returns true if the IO type is supported fn io_type_supported(&self, io_type: IoType) -> bool { + match io_type { + //IoType::NvmeIo => true, + _ => self.io_type_supported_by_device(io_type), + } + } + + fn io_type_supported_by_device(&self, io_type: IoType) -> bool { self.0.io_type_supported(io_type) } + /// returns the IO statistics async fn io_stats(&self) -> Result { self.0.stats_async().await @@ -177,6 +186,37 @@ impl BlockDevice for SpdkBlockDevice { } } +#[async_trait(?Send)] +impl ZonedBlockDevice for SpdkBlockDevice { + fn is_zoned(&self) -> bool { + self.0.is_zoned() + } + + fn zone_size(&self) -> u64 { + self.0.zone_size() + } + + fn num_zones(&self) -> u64 { + self.0.num_zones() + } + + fn max_zone_append_size(&self) -> u32 { + self.0.max_zone_append_size() + } + + fn max_open_zones(&self) -> u32 { + self.0.max_open_zones() + } + + fn max_active_zones(&self) -> u32 { + self.0.max_active_zones() + } + + fn optimal_open_zones(&self) -> u32 { + self.0.optimal_open_zones() + } +} + /// Wrapper around native SPDK block device descriptor, which mimics target SPDK /// descriptor as an abstract BlockDeviceDescriptor instance. 
struct SpdkBlockDeviceDescriptor(Arc); diff --git a/io-engine/src/bdev/nexus/nexus_bdev.rs b/io-engine/src/bdev/nexus/nexus_bdev.rs index 55073ee0d..53a0f2416 100644 --- a/io-engine/src/bdev/nexus/nexus_bdev.rs +++ b/io-engine/src/bdev/nexus/nexus_bdev.rs @@ -75,11 +75,13 @@ use spdk_rs::{ libspdk::spdk_bdev_notify_blockcnt_change, BdevIo, BdevOps, + BdevZoneInfo, ChannelTraverseStatus, IoChannel, IoDevice, IoDeviceChannelTraverse, JsonWriteContext, + libspdk::spdk_bdev_is_zoned, }; pub static NVME_MIN_CNTLID: u16 = 1; @@ -372,6 +374,7 @@ impl<'n> Nexus<'n> { nexus_uuid: Option, nvme_params: NexusNvmeParams, nexus_info_key: Option, + bdev_zone_info: BdevZoneInfo, ) -> spdk_rs::Bdev> { let n = Nexus { name: name.to_string(), @@ -405,6 +408,7 @@ impl<'n> Nexus<'n> { .with_block_count(0) .with_required_alignment(9) .with_data(n) + .with_zoned_info(bdev_zone_info) .build(); unsafe { @@ -575,6 +579,11 @@ impl<'n> Nexus<'n> { unsafe { self.bdev().required_alignment() } } + /// Check if the bdev is a zoned block device (ZBD) + pub fn is_zoned(&self) -> bool { + unsafe { spdk_bdev_is_zoned(self.bdev().unsafe_inner_ptr()) } + } + /// TODO pub fn children(&self) -> &Vec> { &self.children @@ -774,6 +783,13 @@ impl<'n> Nexus<'n> { }) } } + + if dev.is_zoned() { + //TODO: Implement partitioning zoned block devices. This requires handling drive resources like max active/open zones. + warn!("The device '{}' is zoned. Partitioning zoned block devices into smaller devices is not implemented. Using the whole device.", dev.device_name()); + start_blk = 0; + end_blk = nb; + } } unsafe { @@ -1430,7 +1446,12 @@ impl<'n> BdevOps for Nexus<'n> { IoType::Flush | IoType::Reset | IoType::Unmap - | IoType::WriteZeros => { + | IoType::WriteZeros + | IoType::ZoneAppend + | IoType::ZoneInfo + | IoType::ZoneManagement + | IoType::NvmeIo + | IoType::ZeroCopy => { let supported = self.io_is_supported(io_type); if !supported { if io_type == IoType::Flush { @@ -1574,6 +1595,50 @@ pub async fn nexus_create_v2( } } +async fn prepare_nexus_zone_info_from_children( + children_devices: &mut Vec<(String, String)>, + nexus_name: &str, +) -> Result { + // if we find non-zoned block devices + let mut found_conventional = false; + let mut nexus_zone_info: Option = None; + + for (_uri, device_name) in &*children_devices { + let dev = device_lookup(&device_name).ok_or(Error::ChildMissing { + child: device_name.clone(), + name: nexus_name.to_string(), + })?; + if dev.is_zoned() { + if let Some(nexus_zone_info) = nexus_zone_info { + if nexus_zone_info != dev.bdev_zone_info() { + error!("Can not use ZBD's with different parameters as nexus children"); + return Err(Error::MixedZonedChild { + child: device_name.to_string(), + }); + } + } else { + nexus_zone_info = Some(dev.bdev_zone_info().clone()); + } + } else { + found_conventional = true; + } + + if nexus_zone_info.is_some() && found_conventional { + error!("{nexus_name} - can not handle conventional and zoned storage at the same time in a nexus"); + return Err(Error::MixedZonedChild { + child: device_name.to_string(), + }); + } + } + + if let Some(nexus_zone_info) = nexus_zone_info { + return Ok(nexus_zone_info); + } + + // For conventional devices return the default BlkZoneInfo where `zoned == false` + Ok(BdevZoneInfo::default()) +} + async fn destroy_created_devices(devices: &[(String, String)]) { for (uri, _) in devices { if let Err(e) = device_destroy(uri).await { @@ -1597,6 +1662,12 @@ async fn create_children_devices( }; children_devices.push((uri.clone(), device_name.clone())); + 
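+        // Descriptive note: a zoned child restricts the nexus to a single
+        // child for now; a second child alongside a zoned one is rejected
+        // right away with ZonedReplicationNotImplemented, since mirroring
+        // writes across zoned devices is not implemented yet.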
+ if device_lookup(&device_name).unwrap().is_zoned() && children.len() > 1 + { + destroy_created_devices(&children_devices).await; + return Err(Error::ZonedReplicationNotImplemented {}); + } } Ok(children_devices) @@ -1639,6 +1710,26 @@ async fn nexus_create_internal( let mut children_devices = create_children_devices(children).await?; + let nexus_zone_info = match prepare_nexus_zone_info_from_children( + &mut children_devices, + name, + ) + .await + { + Err(e) => { + destroy_created_devices(&children_devices).await; + return Err(e); + } + Ok(nexus_zone_info) => nexus_zone_info, + }; + + if nexus_zone_info.zoned { + info!( + "The Nexus will be zoned with the properies {:?}", + nexus_zone_info + ); + } + // Create a new Nexus object, and immediately add it to the global list. // This is necessary to ensure proper cleanup, as the code responsible for // closing a child assumes that the nexus to which it belongs will appear @@ -1651,6 +1742,7 @@ async fn nexus_create_internal( nexus_uuid, nvme_params, nexus_info_key, + nexus_zone_info, ); diff --git a/io-engine/src/bdev/nexus/nexus_bdev_error.rs b/io-engine/src/bdev/nexus/nexus_bdev_error.rs index ca25065c8..23eede96f 100644 --- a/io-engine/src/bdev/nexus/nexus_bdev_error.rs +++ b/io-engine/src/bdev/nexus/nexus_bdev_error.rs @@ -84,6 +84,8 @@ pub enum Error { }, #[snafu(display("Children of nexus {} have mixed block sizes", name))] MixedBlockSizes { name: String }, + #[snafu(display("Child {} is incompatible with its (zoned) siblings", child))] + MixedZonedChild { child: String }, #[snafu(display( "Child {} of nexus {} has incompatible size or block size", child, @@ -222,6 +224,8 @@ pub enum Error { UpdateShareProperties { source: CoreError, name: String }, #[snafu(display("failed to save nexus state {}", name))] SaveStateFailed { source: StoreError, name: String }, + #[snafu(display("Replication for zoned storage is not implemented. 
Consider adding a single zoned storage device to the nexus"))] + ZonedReplicationNotImplemented, } impl From for Error { diff --git a/io-engine/src/bdev/nexus/nexus_io.rs b/io-engine/src/bdev/nexus/nexus_io.rs index 317f18200..bc35434e8 100644 --- a/io-engine/src/bdev/nexus/nexus_io.rs +++ b/io-engine/src/bdev/nexus/nexus_io.rs @@ -192,6 +192,13 @@ impl<'n> NexusBio<'n> { | IoType::Reset | IoType::Unmap | IoType::Flush => self.submit_all(), + IoType::ZoneAppend => { + warn!("{self:?} - ZoneAppend is explicitly disallowed, otherwise reading from different replicas won't work."); + self.fail(); + Err(CoreError::NotSupported { + source: Errno::EOPNOTSUPP, + }) + } IoType::NvmeAdmin => { self.fail(); Err(CoreError::NotSupported { diff --git a/io-engine/src/bdev/null_ng.rs b/io-engine/src/bdev/null_ng.rs index a09970a57..3ca14bf6c 100644 --- a/io-engine/src/bdev/null_ng.rs +++ b/io-engine/src/bdev/null_ng.rs @@ -6,6 +6,7 @@ use spdk_rs::{ BdevModule, BdevModuleBuild, BdevOps, + BdevZoneInfo, IoChannel, IoDevice, IoType, @@ -142,6 +143,7 @@ impl<'a> NullIoDevice<'a> { .with_block_length(1 << 12) .with_block_count(1 << 20) .with_required_alignment(12) + .with_zoned_info(BdevZoneInfo::default()) .build(); bdev.data().register_io_device(Some(name)); diff --git a/io-engine/src/bdev/nvmx/device.rs b/io-engine/src/bdev/nvmx/device.rs index 7ff985e01..ffa459138 100644 --- a/io-engine/src/bdev/nvmx/device.rs +++ b/io-engine/src/bdev/nvmx/device.rs @@ -24,6 +24,7 @@ use crate::{ DeviceIoController, DeviceTimeoutAction, IoType, + ZonedBlockDevice, }, ffihelper::{cb_arg, done_cb}, }; @@ -204,10 +205,17 @@ impl BlockDevice for NvmeBlockDevice { IoType::Unmap => self.ns.supports_deallocate(), IoType::WriteZeros => self.ns.supports_write_zeroes(), IoType::CompareAndWrite => false, + IoType::ZoneAppend | IoType::ZoneInfo | IoType::ZoneManagement => { + true + } _ => false, } } + fn io_type_supported_by_device(&self, io_type: IoType) -> bool { + self.io_type_supported(io_type) + } + async fn io_stats(&self) -> Result { let carc = NVME_CONTROLLERS.lookup_by_name(&self.name).ok_or( CoreError::BdevNotFound { @@ -257,6 +265,37 @@ impl BlockDevice for NvmeBlockDevice { } } +#[async_trait(?Send)] +impl ZonedBlockDevice for NvmeBlockDevice { + fn is_zoned(&self) -> bool { + self.ns.is_zoned() + } + + fn zone_size(&self) -> u64 { + self.ns.zone_size() + } + + fn num_zones(&self) -> u64 { + self.ns.num_zones() + } + + fn max_zone_append_size(&self) -> u32 { + self.ns.max_zone_append_size() + } + + fn max_open_zones(&self) -> u32 { + self.ns.max_open_zones() + } + + fn max_active_zones(&self) -> u32 { + self.ns.max_active_zones() + } + + fn optimal_open_zones(&self) -> u32 { + self.ns.optimal_open_zones() + } +} + struct NvmeDeviceIoController { name: String, } diff --git a/io-engine/src/bdev/nvmx/namespace.rs b/io-engine/src/bdev/nvmx/namespace.rs index 8123d6a4c..8c51886d6 100644 --- a/io-engine/src/bdev/nvmx/namespace.rs +++ b/io-engine/src/bdev/nvmx/namespace.rs @@ -10,6 +10,13 @@ use spdk_rs::libspdk::{ spdk_nvme_ns_get_size, spdk_nvme_ns_get_uuid, spdk_nvme_ns_supports_compare, + spdk_nvme_zns_ns_get_data, + spdk_nvme_zns_ns_get_zone_size, + spdk_nvme_zns_ns_get_num_zones, + spdk_nvme_ns_get_ctrlr, + spdk_nvme_zns_ctrlr_get_max_zone_append_size, + spdk_nvme_zns_ns_get_max_open_zones, + spdk_nvme_zns_ns_get_max_active_zones, SPDK_NVME_NS_DEALLOCATE_SUPPORTED, SPDK_NVME_NS_WRITE_ZEROES_SUPPORTED, }; @@ -78,4 +85,36 @@ impl NvmeNamespace { pub fn as_ptr(&self) -> *mut spdk_nvme_ns { self.0.as_ptr() } + + pub fn 
is_zoned(&self) -> bool { + unsafe { !spdk_nvme_zns_ns_get_data(self.0.as_ptr()).is_null() } + } + + pub fn zone_size(&self) -> u64 { + unsafe { spdk_nvme_zns_ns_get_zone_size(self.0.as_ptr()) } + } + + pub fn num_zones(&self) -> u64 { + unsafe { spdk_nvme_zns_ns_get_num_zones(self.0.as_ptr()) } + } + + pub fn max_zone_append_size(&self) -> u32 { + unsafe { + spdk_nvme_zns_ctrlr_get_max_zone_append_size( + spdk_nvme_ns_get_ctrlr(self.0.as_ptr()), + ) + } + } + + pub fn max_open_zones(&self) -> u32 { + unsafe { spdk_nvme_zns_ns_get_max_open_zones(self.0.as_ptr()) } + } + + pub fn max_active_zones(&self) -> u32 { + unsafe { spdk_nvme_zns_ns_get_max_active_zones(self.0.as_ptr()) } + } + + pub fn optimal_open_zones(&self) -> u32 { + self.max_open_zones() + } } diff --git a/io-engine/src/core/block_device.rs b/io-engine/src/core/block_device.rs index 225e7178f..f06cb1de0 100644 --- a/io-engine/src/core/block_device.rs +++ b/io-engine/src/core/block_device.rs @@ -6,7 +6,7 @@ use super::{ SnapshotParams, }; -use spdk_rs::{DmaBuf, DmaError, IoVec}; +use spdk_rs::{BdevZoneInfo, DmaBuf, DmaError, IoVec}; use async_trait::async_trait; use futures::channel::oneshot; @@ -56,7 +56,7 @@ pub struct BlockDeviceIoStats { /// Core trait that represents a block device. /// TODO: Add text. #[async_trait(?Send)] -pub trait BlockDevice { +pub trait BlockDevice: ZonedBlockDevice { /// Returns total size in bytes of the device. fn size_in_bytes(&self) -> u64; @@ -81,9 +81,12 @@ pub trait BlockDevice { /// Returns aligment of the device. fn alignment(&self) -> u64; - /// Checks whether target I/O type is supported by the device. + /// Checks whether target I/O type is supported by the device or storage stack. fn io_type_supported(&self, io_type: IoType) -> bool; + /// Checks whether target I/O type is supported by the device. + fn io_type_supported_by_device(&self, io_type: IoType) -> bool; + /// Obtains I/O statistics for the device. async fn io_stats(&self) -> Result; @@ -103,6 +106,58 @@ pub trait BlockDevice { ) -> Result<(), CoreError>; } +/// Trait to represent zoned storage related fields for zoned block devices. +#[async_trait(?Send)] +pub trait ZonedBlockDevice { + /// Returns if the device to which this ZoneInfo is linked to is a + /// zoned block device (ZBD) or not. If true, the following fields are + /// also relavant. + fn is_zoned(&self) -> bool; + + /// Returns the number of zones available on the device. + fn zone_size(&self) -> u64; + + /// Returns size of each zone (in blocks). Typically alligned to a power of 2. + /// In SPDK the actuall writable zone capacity has to be queried for each + /// individual zone through a zone report. + /// zone_capacity <= zone_size. + /// zone_capacity * num_zones = device capacity + fn num_zones(&self) -> u64; + + /// Returns maximum data transfer size for a single zone append command (in blocks). + /// Normal (seq) writes must respect the device's general max transfer size. + fn max_zone_append_size(&self) -> u32; + + /// Returns maximum number of open zones for a given device. + /// This essentially limits the amount of parallel open zones that can be written to. + /// Refere to NVMe ZNS specification (Figure 7 Zone State Machine) for more details. + /// https://nvmexpress.org/wp-content/uploads/NVM-Express-Zoned-Namespace-Command-Set-Specification-1.1d-2023.12.28-Ratified.pdf + fn max_open_zones(&self) -> u32; + + /// Returns maximum number of active zones for a given device. + /// max_open_zones is a subset of max_active_zones. 
Closed zones are still active until they + /// get finished (finished zones are in effect immutabel until reset). + /// Refere to NVMe ZNS specification (Figure 7 Zone State Machine) for more details. + /// https://nvmexpress.org/wp-content/uploads/NVM-Express-Zoned-Namespace-Command-Set-Specification-1.1d-2023.12.28-Ratified.pdf + fn max_active_zones(&self) -> u32; + + /// Returns the drives prefered number of open zones. + fn optimal_open_zones(&self) -> u32; + + /// Returns all zoned storage relavant fields in a condensed BdevZoneInfo struct. + fn bdev_zone_info(&self) -> BdevZoneInfo { + BdevZoneInfo { + zoned: self.is_zoned(), + zone_size: self.zone_size(), + num_zones: self.num_zones(), + max_zone_append_size: self.max_zone_append_size(), + max_open_zones: self.max_open_zones(), + max_active_zones: self.max_active_zones(), + optimal_open_zones: self.optimal_open_zones(), + } + } +} + /// Core trait that represents a descriptor for an opened block device. /// TODO: Add text. #[async_trait(?Send)] @@ -406,8 +461,73 @@ pub trait BlockDeviceHandle { cb_arg: IoCompletionCallbackArg, ) -> Result<(), CoreError>; + /// Emulates the zone management send NvmeIo command for devices that do not support this + /// command natively. + /// + /// * `nvme_cmd` - The nvme command to emulate. + /// * `_buffer` - The data buffer for the nvme command. + /// * `_buffer_size` - The data buffer for the nvme command. + /// * `_cb` - The completion callback function for the nvme command. + /// * `_cb_arg` - The completion callback function arguments. + fn emulate_zone_mgmt_send_io_passthru( + &self, + nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd, + _buffer: *mut c_void, + _buffer_size: u64, + _cb: IoCompletionCallback, + _cb_arg: IoCompletionCallbackArg, + ) -> Result<(), CoreError> { + Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EOPNOTSUPP, + opcode: nvme_cmd.opc(), + }) + } + + /// Emulates the zone management receive NvmeIo command for devices that do not support this + /// command natively. + /// + /// * `nvme_cmd` - The nvme command to emulate. + /// * `_buffer` - The data buffer for the nvme command. + /// * `_buffer_size` - The data buffer for the nvme command. + /// * `_cb` - The completion callback function for the nvme command. + /// * `_cb_arg` - The completion callback function arguments. + fn emulate_zone_mgmt_recv_io_passthru( + &self, + nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd, + _buffer: *mut c_void, + _buffer_size: u64, + _cb: IoCompletionCallback, + _cb_arg: IoCompletionCallbackArg, + ) -> Result<(), CoreError> { + Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EOPNOTSUPP, + opcode: nvme_cmd.opc(), + }) + } + // NVMe only. + /// Submits an NVMe IO Passthrough command to the device. + /// + /// * `nvme_cmd` - The nvme command to emulate. + /// * `_buffer` - The data buffer for the nvme command. + /// * `_buffer_size` - The data buffer for the nvme command. + /// * `_cb` - The completion callback function for the nvme command. + /// * `_cb_arg` - The completion callback function arguments. 
+ fn submit_io_passthru( + &self, + nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd, + _buffer: *mut c_void, + _buffer_size: u64, + _cb: IoCompletionCallback, + _cb_arg: IoCompletionCallbackArg, + ) -> Result<(), CoreError> { + Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EOPNOTSUPP, + opcode: nvme_cmd.opc(), + }) + } + /// TODO async fn nvme_admin_custom(&self, opcode: u8) -> Result<(), CoreError>; diff --git a/io-engine/src/core/mod.rs b/io-engine/src/core/mod.rs index a225c5ec5..7c7ed4812 100644 --- a/io-engine/src/core/mod.rs +++ b/io-engine/src/core/mod.rs @@ -22,6 +22,7 @@ pub use block_device::{ OpCompletionCallback, OpCompletionCallbackArg, ReadOptions, + ZonedBlockDevice, }; pub use cpu_cores::{Core, Cores}; pub use descriptor::{DescriptorGuard, UntypedDescriptorGuard}; diff --git a/nix/pkgs/libspdk/default.nix b/nix/pkgs/libspdk/default.nix index 1f6797466..91ab8d208 100644 --- a/nix/pkgs/libspdk/default.nix +++ b/nix/pkgs/libspdk/default.nix @@ -116,7 +116,6 @@ let (if with-fio then [ "--with-fio=${fio-include}" ] else [ ]) ++ [ "--with-uring" - "--without-uring-zns" "--disable-unit-tests" "--disable-tests" ]; diff --git a/spdk-rs b/spdk-rs index a1efae6c1..bc229bbfb 160000 --- a/spdk-rs +++ b/spdk-rs @@ -1 +1 @@ -Subproject commit a1efae6c1d8c6eaf4f6ce54b5c919f664fd466f3 +Subproject commit bc229bbfb631e8d79cc12e0fdce1aa925047efd7 From a997bfb90e646b55c71a25ec2163899b4a377289 Mon Sep 17 00:00:00 2001 From: Dennis Maisenbacher Date: Mon, 23 Jan 2023 13:31:02 +0100 Subject: [PATCH 06/11] feat(nexus): Add NVMe IO passthrough support Passthrough NvmeIo if the child (NVMe device) supports it. This is necessary for the Zoned Storage support, as the Zone Management Send and Receive commands are issued to the Nexus as NvmeIo. Those commands are issued when zones are reported or the state of a zone is actively changed. Signed-off-by: Dennis Maisenbacher --- io-engine/src/bdev/device.rs | 40 ++++++++++++++++++++++++++++ io-engine/src/bdev/nexus/nexus_io.rs | 39 +++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/io-engine/src/bdev/device.rs b/io-engine/src/bdev/device.rs index 81085c882..18e94f44c 100644 --- a/io-engine/src/bdev/device.rs +++ b/io-engine/src/bdev/device.rs @@ -18,6 +18,7 @@ use spdk_rs::{ spdk_bdev_flush, spdk_bdev_free_io, spdk_bdev_io, + spdk_bdev_nvme_io_passthru, spdk_bdev_readv_blocks_with_flags, spdk_bdev_reset, spdk_bdev_unmap_blocks, @@ -63,6 +64,7 @@ use crate::{ ZonedBlockDevice, }, lvs::Lvol, + ffihelper::FfiResult, }; #[cfg(feature = "fault-injection")] @@ -627,6 +629,44 @@ impl BlockDeviceHandle for SpdkBlockDeviceHandle { }) } + fn submit_io_passthru( + &self, + nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd, + buffer: *mut c_void, + buffer_size: u64, + cb: IoCompletionCallback, + cb_arg: IoCompletionCallbackArg, + ) -> Result<(), CoreError> { + + let ctx = alloc_bdev_io_ctx( + IoType::NvmeIo, + IoCtx { + device: self.device, + cb, + cb_arg, + }, + 0, + 0, + )?; + + let (desc, ch) = self.handle.io_tuple(); + + unsafe { + spdk_bdev_nvme_io_passthru( + desc, + ch, + nvme_cmd, + buffer, + buffer_size, + Some(bdev_io_completion), + ctx as *mut c_void, + ) + }.to_result(|e| CoreError::NvmeIoPassthruDispatch { + source: Errno::from_i32(e), + opcode: nvme_cmd.opc(), + }) + } + // NVMe commands are not applicable for non-NVMe devices. 
    async fn create_snapshot(
        &self,

diff --git a/io-engine/src/bdev/nexus/nexus_io.rs b/io-engine/src/bdev/nexus/nexus_io.rs
index bc35434e8..24af1d97d 100644
--- a/io-engine/src/bdev/nexus/nexus_io.rs
+++ b/io-engine/src/bdev/nexus/nexus_io.rs
@@ -12,6 +12,7 @@ use spdk_rs::{
         spdk_bdev_io,
         spdk_bdev_io_complete_nvme_status,
         spdk_io_channel,
+        spdk_nvme_cmd,
         SPDK_NVME_SC_ABORTED_SQ_DELETION,
         SPDK_NVME_SC_CAPACITY_EXCEEDED,
         SPDK_NVME_SC_INVALID_OPCODE,
@@ -189,6 +190,7 @@ impl<'n> NexusBio<'n> {
             // these IOs are submitted to all the underlying children
             IoType::Write
                 | IoType::WriteZeros
+                | IoType::NvmeIo
                 | IoType::Reset
                 | IoType::Unmap
                 | IoType::Flush => self.submit_all(),
@@ -509,6 +511,42 @@ impl<'n> NexusBio<'n> {
         }
     }
 
+    #[inline]
+    fn submit_io_passthru(
+        &self,
+        hdl: &dyn BlockDeviceHandle,
+    ) -> Result<(), CoreError> {
+        let orig_nvme_cmd = self.nvme_cmd();
+        let buffer = self.nvme_buf();
+        let buffer_size = self.nvme_nbytes();
+
+        let mut passthru_nvme_cmd = spdk_nvme_cmd::default();
+        passthru_nvme_cmd.set_opc(orig_nvme_cmd.opc());
+        unsafe {
+            passthru_nvme_cmd.__bindgen_anon_1.cdw10 =
+                orig_nvme_cmd.__bindgen_anon_1.cdw10;
+            passthru_nvme_cmd.__bindgen_anon_2.cdw11 =
+                orig_nvme_cmd.__bindgen_anon_2.cdw11;
+            passthru_nvme_cmd.__bindgen_anon_3.cdw12 =
+                orig_nvme_cmd.__bindgen_anon_3.cdw12;
+        }
+        passthru_nvme_cmd.cdw13 = orig_nvme_cmd.cdw13;
+        passthru_nvme_cmd.cdw14 = orig_nvme_cmd.cdw14;
+        passthru_nvme_cmd.cdw15 = orig_nvme_cmd.cdw15;
+
+        if hdl.get_device().io_type_supported_by_device(self.io_type()) {
+            return hdl.submit_io_passthru(
+                &passthru_nvme_cmd,
+                buffer,
+                buffer_size,
+                Self::child_completion,
+                self.as_ptr().cast(),
+            );
+        } else {
+            return Err(CoreError::NvmeIoPassthruDispatch {
+                source: Errno::EOPNOTSUPP,
+                opcode: orig_nvme_cmd.opc(),
+            });
+        }
+    }
+
     #[inline]
     fn submit_write(
         &self,
@@ -613,6 +651,7 @@ impl<'n> NexusBio<'n> {
             IoType::WriteZeros => self.submit_write_zeroes(h),
             IoType::Reset => self.submit_reset(h),
             IoType::Flush => self.submit_flush(h),
+            IoType::NvmeIo => self.submit_io_passthru(h),
             // we should never reach here, if we do it is a bug.
             _ => unreachable!(),
         }

From 3ad7a615d4817da9cc69406c9066f123b5a66097 Mon Sep 17 00:00:00 2001
From: Dennis Maisenbacher
Date: Mon, 23 Jan 2023 13:44:50 +0100
Subject: [PATCH 07/11] feat(nexus): Emulate zone mgmt NvmeIo for
 SpdkBlockDevices

Zoned block devices that do not support NvmeIo (zoned uring) need the
zone management send and receive NvmeIo that is issued on the Nexus to be
translated. Those commands are issued when zones are reported or when the
state of a zone is actively changed.

Inspect the fields of the incoming NVMe command and fill the buffer
according to the 'NVM Express Zoned Namespace Command Set Specification'
(see https://nvmexpress.org/developers/nvme-command-set-specifications/)
with the help of the SPDK zoned bdev wrapper functions.
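For reference, the report-zones reply that the emulation fills can be
sketched as two `#[repr(C)]` structs (names are illustrative; the offsets
follow the ZNS spec and match the hand-written offsets in this patch):

```rust
// Illustrative layout only; the patch writes these offsets by hand.
#[repr(C)]
struct ReportZonesHeader {
    nr_zones: u64,       // bytes 07:00 - number of zone descriptors
    _rsvd: [u8; 56],     // bytes 63:08 - reserved
}

#[repr(C)]
struct ZoneDescriptor {  // one 64-byte entry per reported zone
    zt: u8,              // byte 0      - zone type (0x2 = sequential)
    zs: u8,              // byte 1      - zone state in bits 7:4
    za: u8,              // byte 2      - zone attributes (0 here)
    zai: u8,             // byte 3      - zone attribute info (0 here)
    _rsvd: [u8; 4],      // bytes 07:04 - reserved
    zcap: u64,           // bytes 15:08 - zone capacity
    zslba: u64,          // bytes 23:16 - zone start LBA
    wp: u64,             // bytes 31:24 - write pointer
    _rsvd2: [u8; 32],    // bytes 63:32 - reserved
}
```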
Signed-off-by: Dennis Maisenbacher --- Cargo.lock | 18 ++ io-engine/Cargo.toml | 1 + io-engine/src/bdev/device.rs | 427 ++++++++++++++++++++++++++- io-engine/src/bdev/nexus/nexus_io.rs | 28 +- io-engine/src/core/mod.rs | 10 + 5 files changed, 471 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ce860f7a1..9d0a0f51e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1168,6 +1168,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_extra" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + [[package]] name = "fsio" version = "0.4.0" @@ -1606,6 +1612,7 @@ dependencies = [ "io-engine-tests", "io-uring", "ioctl-gen", + "jemalloc-sys", "jsonrpc", "lazy_static", "libc", @@ -1837,6 +1844,17 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +[[package]] +name = "jemalloc-sys" +version = "0.5.2+5.3.0-patched" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "134163979b6eed9564c98637b710b40979939ba351f59952708234ea11b5f3f8" +dependencies = [ + "cc", + "fs_extra", + "libc", +] + [[package]] name = "js-sys" version = "0.3.64" diff --git a/io-engine/Cargo.toml b/io-engine/Cargo.toml index e4e1df05a..644170c8a 100644 --- a/io-engine/Cargo.toml +++ b/io-engine/Cargo.toml @@ -98,6 +98,7 @@ async-process = { version = "1.8.1" } rstack = { version = "0.3.3" } tokio-stream = "0.1.14" rustls = "0.21.12" +jemalloc-sys = "0.5.2+5.3.0-patched" devinfo = { path = "../utils/dependencies/devinfo" } jsonrpc = { path = "../jsonrpc"} diff --git a/io-engine/src/bdev/device.rs b/io-engine/src/bdev/device.rs index 18e94f44c..f714611f2 100644 --- a/io-engine/src/bdev/device.rs +++ b/io-engine/src/bdev/device.rs @@ -4,6 +4,7 @@ use std::{ collections::HashMap, convert::TryFrom, + mem, os::raw::c_void, sync::{Arc, Mutex}, }; @@ -17,6 +18,7 @@ use spdk_rs::{ spdk_bdev_comparev_blocks, spdk_bdev_flush, spdk_bdev_free_io, + spdk_bdev_get_zone_info, spdk_bdev_io, spdk_bdev_nvme_io_passthru, spdk_bdev_readv_blocks_with_flags, @@ -24,8 +26,37 @@ use spdk_rs::{ spdk_bdev_unmap_blocks, spdk_bdev_write_zeroes_blocks, spdk_bdev_writev_blocks, + spdk_bdev_zone_info, + spdk_bdev_zone_management, + SPDK_BDEV_ZONE_CLOSE, + SPDK_BDEV_ZONE_FINISH, + SPDK_BDEV_ZONE_OFFLINE, + SPDK_BDEV_ZONE_OPEN, + SPDK_BDEV_ZONE_RESET, + SPDK_BDEV_ZONE_STATE_CLOSED, + SPDK_BDEV_ZONE_STATE_EMPTY, + SPDK_BDEV_ZONE_STATE_EXP_OPEN, + SPDK_BDEV_ZONE_STATE_FULL, + SPDK_BDEV_ZONE_STATE_IMP_OPEN, + SPDK_BDEV_ZONE_STATE_OFFLINE, + SPDK_BDEV_ZONE_STATE_READ_ONLY, SPDK_NVME_IO_FLAGS_UNWRITTEN_READ_FAIL, SPDK_NVME_IO_FLAG_CURRENT_UNWRITTEN_READ_FAIL, + SPDK_NVME_ZONE_STATE_CLOSED, + SPDK_NVME_ZONE_STATE_EMPTY, + SPDK_NVME_ZONE_STATE_EOPEN, + SPDK_NVME_ZONE_STATE_FULL, + SPDK_NVME_ZONE_STATE_IOPEN, + SPDK_NVME_ZONE_STATE_OFFLINE, + SPDK_NVME_ZONE_STATE_RONLY, + SPDK_NVME_ZRA_LIST_ALL, + SPDK_NVME_ZRA_LIST_ZSC, + SPDK_NVME_ZRA_LIST_ZSE, + SPDK_NVME_ZRA_LIST_ZSEO, + SPDK_NVME_ZRA_LIST_ZSF, + SPDK_NVME_ZRA_LIST_ZSIO, + SPDK_NVME_ZRA_LIST_ZSO, + SPDK_NVME_ZRA_LIST_ZSRO, }, nvme_admin_opc, AsIoVecPtr, @@ -54,6 +85,7 @@ use crate::{ IoCompletionCallback, IoCompletionCallbackArg, IoCompletionStatus, + NvmeCmdOpc, NvmeStatus, ReadOptions, SnapshotParams, @@ -75,6 +107,8 @@ use crate::core::fault_injection::{ InjectIoCtx, }; +use jemalloc_sys::{calloc, free}; + /// TODO type 
EventDispatcherMap = HashMap; @@ -85,6 +119,69 @@ static BDEV_EVENT_DISPATCHER: Lazy> = // Memory pool for bdev I/O context. static BDEV_IOCTX_POOL: OnceCell> = OnceCell::new(); +/// TODO +fn bdev_zone_state_to_nvme_zns_zone_state( + bdev_zone_state: u32, +) -> Result { + match bdev_zone_state { + SPDK_BDEV_ZONE_STATE_EMPTY => Ok(SPDK_NVME_ZONE_STATE_EMPTY), + SPDK_BDEV_ZONE_STATE_IMP_OPEN => Ok(SPDK_NVME_ZONE_STATE_IOPEN), + SPDK_BDEV_ZONE_STATE_FULL => Ok(SPDK_NVME_ZONE_STATE_FULL), + SPDK_BDEV_ZONE_STATE_CLOSED => Ok(SPDK_NVME_ZONE_STATE_CLOSED), + SPDK_BDEV_ZONE_STATE_READ_ONLY => Ok(SPDK_NVME_ZONE_STATE_RONLY), + SPDK_BDEV_ZONE_STATE_OFFLINE => Ok(SPDK_NVME_ZONE_STATE_OFFLINE), + SPDK_BDEV_ZONE_STATE_EXP_OPEN => Ok(SPDK_NVME_ZONE_STATE_EOPEN), + _ => { + error!("Can't map SPDK_BDEV_ZONE_STATE {bdev_zone_state} to any SPDK_NVME_ZONE_STATE"); + Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EINVAL, + opcode: NvmeCmdOpc::ZoneMgmtReceive as u16, + }) + } + } +} + +/// TODO +fn zone_send_action_to_bdev_zone_action( + zone_send_action: u8, +) -> Result { + match zone_send_action { + 0x01 => Ok(SPDK_BDEV_ZONE_CLOSE), + 0x02 => Ok(SPDK_BDEV_ZONE_FINISH), + 0x03 => Ok(SPDK_BDEV_ZONE_OPEN), + 0x04 => Ok(SPDK_BDEV_ZONE_RESET), + 0x05 => Ok(SPDK_BDEV_ZONE_OFFLINE), + _ => { + error!( + "Can not map Zone Send Action {} to any spdk_bdev_zone_action", + zone_send_action + ); + Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EINVAL, + opcode: NvmeCmdOpc::ZoneMgmtSend as u16, + }) + } + } +} + +/// TODO +fn is_zra_list_matching_zone_state( + zra_report_opt: u32, + zns_zone_state: u32, +) -> bool { + match (zra_report_opt, zns_zone_state) { + (SPDK_NVME_ZRA_LIST_ALL, _) => true, + (SPDK_NVME_ZRA_LIST_ZSE, SPDK_NVME_ZONE_STATE_EMPTY) => true, + (SPDK_NVME_ZRA_LIST_ZSIO, SPDK_NVME_ZONE_STATE_IOPEN) => true, + (SPDK_NVME_ZRA_LIST_ZSEO, SPDK_NVME_ZONE_STATE_EOPEN) => true, + (SPDK_NVME_ZRA_LIST_ZSC, SPDK_NVME_ZONE_STATE_CLOSED) => true, + (SPDK_NVME_ZRA_LIST_ZSF, SPDK_NVME_ZONE_STATE_FULL) => true, + (SPDK_NVME_ZRA_LIST_ZSRO, SPDK_NVME_ZONE_STATE_RONLY) => true, + (SPDK_NVME_ZRA_LIST_ZSO, SPDK_NVME_ZONE_STATE_OFFLINE) => true, + _ => false, + } +} + /// Wrapper around native SPDK block devices, which mimics target SPDK block /// device as an abstract BlockDevice instance. 
#[derive(Copy, Clone)] @@ -150,7 +247,7 @@ impl BlockDevice for SpdkBlockDevice { /// returns true if the IO type is supported fn io_type_supported(&self, io_type: IoType) -> bool { match io_type { - //IoType::NvmeIo => true, + IoType::NvmeIo => true, _ => self.io_type_supported_by_device(io_type), } } @@ -629,7 +726,7 @@ impl BlockDeviceHandle for SpdkBlockDeviceHandle { }) } - fn submit_io_passthru( + fn emulate_zone_mgmt_send_io_passthru( &self, nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd, buffer: *mut c_void, @@ -637,6 +734,25 @@ impl BlockDeviceHandle for SpdkBlockDeviceHandle { cb: IoCompletionCallback, cb_arg: IoCompletionCallbackArg, ) -> Result<(), CoreError> { + unsafe { buffer.write_bytes(0, buffer_size as usize) }; + + // Read relevant fields for a 'Zone Management Send' command, see 'NVMe Zoned Namespace Command Set Specification, Revision 1.1c' + // Bit 63:00 Dword11:Dword10 > Starting LBA + let mut slba = unsafe { + ((nvme_cmd.__bindgen_anon_2.cdw11 as u64) << 32) + | nvme_cmd.__bindgen_anon_1.cdw10 as u64 + }; + + // Bit 07:00 Dword 13 > Zone Send Action + let zsa = + zone_send_action_to_bdev_zone_action(nvme_cmd.cdw13 as u8)?; + + // Bit 08 Dword 13 > Select All + let select_all = nvme_cmd.cdw13 & (1 << 8) != 0; + + if select_all { + slba = 0; + } let ctx = alloc_bdev_io_ctx( IoType::NvmeIo, @@ -651,6 +767,302 @@ impl BlockDeviceHandle for SpdkBlockDeviceHandle { let (desc, ch) = self.handle.io_tuple(); + let num_zones = self.device.num_zones(); + let zone_size = self.device.zone_size(); + + let mut result; + loop { + result = unsafe { + spdk_bdev_zone_management( + desc, + ch, + slba, + zsa, + Some(bdev_io_completion), + ctx as *mut c_void, + ) + } + .to_result(|e| CoreError::NvmeIoPassthruDispatch { + source: Errno::from_i32(e), + opcode: nvme_cmd.opc(), + }); + let continue_next_zone = + select_all && slba == num_zones * zone_size; + if !continue_next_zone || result.is_err() { + break result; + } + slba += zone_size; + } + } + + fn emulate_zone_mgmt_recv_io_passthru( + &self, + nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd, + buffer: *mut c_void, + buffer_size: u64, + cb: IoCompletionCallback, + cb_arg: IoCompletionCallbackArg, + ) -> Result<(), CoreError> { + let ctx = alloc_bdev_io_ctx( + IoType::NvmeIo, + IoCtx { + device: self.device, + cb, + cb_arg, + }, + 0, + 0, + )?; + + let (desc, ch) = self.handle.io_tuple(); + + let size_of_spdk_bdev_zone_info = + mem::size_of::() as usize; + + // Bit 63:00 Dword11:Dword10 > Starting LBA + let slba = unsafe { + ((nvme_cmd.__bindgen_anon_2.cdw11 as u64) << 32) + | nvme_cmd.__bindgen_anon_1.cdw10 as u64 + }; + + // Bit 07:00 Dword13 > Zone Receive Action + let zra = nvme_cmd.cdw13 as u8; + if zra != 0x0u8 { + error!("Zone Management Receive 'Zone Receive Action' (cdw13) != 00h (Report Zones) not implemented"); + return Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EOPNOTSUPP, + opcode: nvme_cmd.opc(), + }); + } + + // Bit 16 Dword13 > Partial Report + let partial_report = nvme_cmd.cdw13 & (1 << 16) != 0; + if !partial_report { + error!("Zone Management Receive 'Partial Report' (cdw13) == 0 not implemented"); + return Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EOPNOTSUPP, + opcode: nvme_cmd.opc(), + }); + } + + // Bit 15:08 Dword13 > Reporting Options + let zra_report_opt = (nvme_cmd.cdw13 >> 8) as u8; + + let max_num_zones = self.device.num_zones(); + let zone_size = self.device.zone_size(); + let zone_report_offset = slba / zone_size; + let max_num_zones_to_report = max_num_zones - zone_report_offset; + + 
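+        // The reply needs a 64-byte header plus one 64-byte descriptor per
+        // zone, so the host-supplied buffer length (cdw12, in dwords,
+        // 0's based) is validated against the zone count below.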
// Bit 31:00 Dword12 > Number of Dwords + let num_of_dwords = unsafe { nvme_cmd.__bindgen_anon_3.cdw12 } + 1; + if u64::from(((num_of_dwords * 4) - 64) / 64) < max_num_zones_to_report + { + error!("Zone Management Receive 'Number of Dwords' (cdw12) indicates to less space of the number of zones ({}) that will be reported.", max_num_zones_to_report); + return Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EOPNOTSUPP, + opcode: nvme_cmd.opc(), + }); + } + + let bdev_zone_infos; + + let ret = unsafe { + bdev_zone_infos = calloc( + max_num_zones_to_report as usize, + size_of_spdk_bdev_zone_info, + ); + spdk_bdev_get_zone_info( + desc, + ch, + slba, + max_num_zones_to_report, + bdev_zone_infos as *mut spdk_bdev_zone_info, + Some(bdev_io_completion), + ctx as *mut c_void, + ) + } + .to_result(|e| CoreError::NvmeIoPassthruDispatch { + source: Errno::from_i32(e), + opcode: nvme_cmd.opc(), + }); + + // Populate buff with the 'Extended Report Zones Data Structure' of the 'NVMe Zoned Namespace Command Set Specification, Revision 1.1c' + unsafe { buffer.write_bytes(0, buffer_size as usize) }; + + if ret.is_err() { + unsafe { free(bdev_zone_infos) }; + return ret; + } + // Bytes 07:00 > Number of Zones + // Deferred until we know how many zones we actuallay reported + + // Bytes 63:08 > Reserved + let erzds_rsvd_offset: isize = 64; + + // Bytes 127:64 and the following 64 * (max_num_zones - 1) bytes > Zone Descriptor + let zone_desc_size: isize = 64; + + // Zone Descriptor Extention not needed + let zone_desc_ext_size: isize = 0; + + let mut zone = 0u64; + let mut num_zones_reported = 0u64; + + let bdev_zone_info_c_void = + unsafe { calloc(1, size_of_spdk_bdev_zone_info) }; + loop { + if zone >= max_num_zones_to_report { + break; + } + unsafe { + // Fetch and cast the current zone info + std::ptr::copy_nonoverlapping( + bdev_zone_infos.offset( + (zone as usize * size_of_spdk_bdev_zone_info) as isize, + ), + bdev_zone_info_c_void, + size_of_spdk_bdev_zone_info, + ); + let bdev_zone_info: *mut spdk_bdev_zone_info = + std::ptr::slice_from_raw_parts_mut( + bdev_zone_info_c_void, + size_of_spdk_bdev_zone_info, + ) as _; + + if !is_zra_list_matching_zone_state( + zra_report_opt as u32, + (*bdev_zone_info).state, + ) { + zone += 1; + continue; + } + + // Byte 00 of Zone Descriptor > Zone Type (always sequential = 0x2u8) + let mut byte_offset: isize = 0; + let mut zt = 0x2u8; + std::ptr::copy_nonoverlapping( + &mut zt as *mut _ as *mut c_void, + buffer.offset( + erzds_rsvd_offset + + (zone as isize + * (zone_desc_size + zone_desc_ext_size)) + + byte_offset, + ), + 1, + ); + byte_offset += 1; + + // Byte 01, bits 7:4 > Zone State + let mut zs = bdev_zone_state_to_nvme_zns_zone_state( + (*bdev_zone_info).state, + )? 
as u8; + zs = zs << 4; + std::ptr::copy_nonoverlapping( + &mut zs as *mut _ as *mut c_void, + buffer.offset( + erzds_rsvd_offset + + (zone as isize + * (zone_desc_size + zone_desc_ext_size)) + + byte_offset, + ), + 1, + ); + byte_offset += 1; + + //Byte 02 > Zone Attributes (always 0x0u8) + byte_offset += 1; + + //Byte 03 > Zone Attributes Information (always 0x0u8) + byte_offset += 1; + + //Byte 07:04 > Reserved (always 0x0u32) + byte_offset += 4; + + //Byte 15:08 > Zone Capacity + let mut zcap = (*bdev_zone_info).capacity; + std::ptr::copy_nonoverlapping( + &mut zcap as *mut _ as *mut c_void, + buffer.offset( + erzds_rsvd_offset + + (zone as isize + * (zone_desc_size + zone_desc_ext_size)) + + byte_offset, + ), + 8, + ); + byte_offset += 8; + + //Byte 23:16 > Zone Start Logical Block Address + let mut zslba = (*bdev_zone_info).zone_id as u64; + std::ptr::copy_nonoverlapping( + &mut zslba as *mut _ as *mut c_void, + buffer.offset( + erzds_rsvd_offset + + (zone as isize + * (zone_desc_size + zone_desc_ext_size)) + + byte_offset, + ), + 8, + ); + byte_offset += 8; + + //Byte 31:24 > Write Pointer + let mut wp = (*bdev_zone_info).write_pointer as u64; + std::ptr::copy_nonoverlapping( + &mut wp as *mut _ as *mut c_void, + buffer.offset( + erzds_rsvd_offset + + (zone as isize + * (zone_desc_size + zone_desc_ext_size)) + + byte_offset, + ), + 8, + ); + //byte_offset += 8; + + // Byte 32:63 > Reserved + zone += 1; + num_zones_reported += 1; + } + } + + // Bytes 07:00 > Number of Zones + unsafe { + std::ptr::copy_nonoverlapping( + &mut num_zones_reported as *mut _ as *mut c_void, + buffer, + mem::size_of::() as usize, + ); + } + + unsafe { + free(bdev_zone_info_c_void); + free(bdev_zone_infos); + } + ret + } + + fn submit_io_passthru( + &self, + nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd, + buffer: *mut c_void, + buffer_size: u64, + cb: IoCompletionCallback, + cb_arg: IoCompletionCallbackArg, + ) -> Result<(), CoreError> { + let ctx = alloc_bdev_io_ctx( + IoType::NvmeIo, + IoCtx { + device: self.device, + cb, + cb_arg, + }, + 0, + 0, + )?; + + let (desc, ch) = self.handle.io_tuple(); + unsafe { spdk_bdev_nvme_io_passthru( desc, @@ -661,7 +1073,8 @@ impl BlockDeviceHandle for SpdkBlockDeviceHandle { Some(bdev_io_completion), ctx as *mut c_void, ) - }.to_result(|e| CoreError::NvmeIoPassthruDispatch { + } + .to_result(|e| CoreError::NvmeIoPassthruDispatch { source: Errno::from_i32(e), opcode: nvme_cmd.opc(), }) @@ -775,14 +1188,10 @@ pub fn io_type_to_err( offset, len, }, - IoType::Reset => CoreError::ResetDispatch { - source, - }, + IoType::Reset => CoreError::ResetDispatch { source }, _ => { warn!("Unsupported I/O operation: {:?}", op); - CoreError::NotSupported { - source, - } + CoreError::NotSupported { source } } } } diff --git a/io-engine/src/bdev/nexus/nexus_io.rs b/io-engine/src/bdev/nexus/nexus_io.rs index 24af1d97d..c6e8632e3 100644 --- a/io-engine/src/bdev/nexus/nexus_io.rs +++ b/io-engine/src/bdev/nexus/nexus_io.rs @@ -34,6 +34,7 @@ use crate::core::{ IoType, LvolFailure, Mthread, + NvmeCmdOpc, NvmeStatus, ReadOptions, }; @@ -540,10 +541,29 @@ impl<'n> NexusBio<'n> { self.as_ptr().cast(), ); } else { - return Err(CoreError::NvmeIoPassthruDispatch { - source: Errno::EOPNOTSUPP, - opcode: orig_nvme_cmd.opc(), - }); + let opc = orig_nvme_cmd.opc(); + match opc { + // Zone Management Send + opc if opc == NvmeCmdOpc::ZoneMgmtSend as u16 => return hdl.emulate_zone_mgmt_send_io_passthru( + &passthru_nvme_cmd, + buffer, + buffer_size, + Self::child_completion, + self.as_ptr().cast(), + ), + // 
Zone Management Receive + opc if opc == NvmeCmdOpc::ZoneMgmtReceive as u16 => return hdl.emulate_zone_mgmt_recv_io_passthru( + &passthru_nvme_cmd, + buffer, + buffer_size, + Self::child_completion, + self.as_ptr().cast(), + ), + _ => return Err(CoreError::NvmeIoPassthruDispatch { + source: Errno::EOPNOTSUPP, + opcode: opc, + }), + } } } diff --git a/io-engine/src/core/mod.rs b/io-engine/src/core/mod.rs index 7c7ed4812..e141aad16 100644 --- a/io-engine/src/core/mod.rs +++ b/io-engine/src/core/mod.rs @@ -478,6 +478,16 @@ pub enum IoSubmissionFailure { Write, } +/// Supported NVMe command passthrough opcodes +#[derive(Debug, Copy, Clone, Eq, PartialOrd, PartialEq)] +#[repr(u16)] +pub enum NvmeCmdOpc{ + // Zone Management Send opcode: 79h = 121 + ZoneMgmtSend = 121, + // Zone Management Receive opcode: 7Ah = 122 + ZoneMgmtReceive = 122, +} + // Generic I/O completion status for block devices, which supports per-protocol // error domains. #[derive(Copy, Clone, Eq, PartialOrd, PartialEq)] From 5e320a3c8b6c34ca8181bca258b8a1446fd055bf Mon Sep 17 00:00:00 2001 From: Dennis Maisenbacher Date: Mon, 23 Jan 2023 13:58:13 +0100 Subject: [PATCH 08/11] feat(test): Add cargo test for Zoned Storage Support Testing zoned storage support through zoned uring with a backing zoned nullblk device. Signed-off-by: Dennis Maisenbacher --- io-engine-tests/src/lib.rs | 112 ++++++++++- io-engine/src/bdev/nexus/mod.rs | 2 +- io-engine/src/bdev/nexus/nexus_bdev_error.rs | 2 +- io-engine/tests/zns.rs | 190 +++++++++++++++++++ 4 files changed, 303 insertions(+), 3 deletions(-) create mode 100644 io-engine/tests/zns.rs diff --git a/io-engine-tests/src/lib.rs b/io-engine-tests/src/lib.rs index 628b80b26..4d059d06e 100644 --- a/io-engine-tests/src/lib.rs +++ b/io-engine-tests/src/lib.rs @@ -4,7 +4,7 @@ //! panic macros. The caller can decide how to handle the error appropriately. //! Panics and asserts in this file are still ok for usage & programming errors. -use std::{io, io::Write, process::Command, time::Duration}; +use std::{fmt, io, io::Write, process::Command, time::Duration}; use crossbeam::channel::{after, select, unbounded}; use once_cell::sync::OnceCell; @@ -569,4 +569,114 @@ macro_rules! test_diag { }} } +/// The null block device driver emulates block devices and is used for benchmarking and testing. +/// https://docs.kernel.org/block/null_blk.html +pub struct NullBlk(u32); +impl Drop for NullBlk { + fn drop(&mut self) { + delete_nullblk_device(self.0); + } +} +impl fmt::Display for NullBlk { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +/// Create a zoned nullblk device with the given parameters. This emulated device exists entirely +/// in memory. +pub fn create_zoned_nullblk_device( + block_size: u32, + zone_size: u32, + zone_cap: u32, + nr_conv_zones: u32, + nr_seq_zones: u32, + max_active_zones: u32, + max_open_zones: u32, +) -> Result { + //Get the next free nullblk device number + let mut nid = 1; + while std::path::Path::new(&format!( + "/sys/kernel/config/nullb/nullb{}", + nid + )) + .exists() + { + nid += 1; + } + let (exit, stdout, stderr) = run_script::run( + r#" + set -e + modprobe null_blk nr_devices=0 > /dev/null || return $? 
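        # Positional parameters passed in from the Rust wrapper:
        # $1 device index, $2 block size (bytes), $3 zone size (MiB),
        # $4 zone capacity (MiB), $5 conventional zones, $6 sequential zones,
        # $7 max active zones, $8 max open zones.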
+
+/// Create a zoned nullblk device with the given parameters. This emulated device exists entirely
+/// in memory.
+pub fn create_zoned_nullblk_device(
+    block_size: u32,
+    zone_size: u32,
+    zone_cap: u32,
+    nr_conv_zones: u32,
+    nr_seq_zones: u32,
+    max_active_zones: u32,
+    max_open_zones: u32,
+) -> Result<NullBlk, (i32, String)> {
+    // Get the next free nullblk device number
+    let mut nid = 1;
+    while std::path::Path::new(&format!(
+        "/sys/kernel/config/nullb/nullb{}",
+        nid
+    ))
+    .exists()
+    {
+        nid += 1;
+    }
+    let (exit, stdout, stderr) = run_script::run(
+        r#"
+        set -e
+        modprobe null_blk nr_devices=0 > /dev/null || exit $?
+        nid=$1
+        bs=$2
+        zs=$3
+        zc=$4
+        nr_conv=$5
+        nr_seq=$6
+        max_active_zones=$7
+        max_open_zones=$8
+
+        cap=$(( zs * (nr_conv + nr_seq) ))
+
+        dev="/sys/kernel/config/nullb/nullb$nid"
+        mkdir "$dev"
+
+        echo $bs > "$dev"/blocksize
+        echo 0 > "$dev"/completion_nsec
+        echo 0 > "$dev"/irqmode
+        echo 2 > "$dev"/queue_mode
+        echo 1024 > "$dev"/hw_queue_depth
+        echo 1 > "$dev"/memory_backed
+        echo 1 > "$dev"/zoned
+
+        echo $cap > "$dev"/size
+        echo $zs > "$dev"/zone_size
+        echo $zc > "$dev"/zone_capacity
+        echo $nr_conv > "$dev"/zone_nr_conv
+        echo $max_active_zones > "$dev"/zone_max_active
+        echo $max_open_zones > "$dev"/zone_max_open
+
+        echo 1 > "$dev"/power
+
+        echo mq-deadline > /sys/block/nullb$nid/queue/scheduler
+
+        echo "$nid"
+        "#,
+        &vec![
+            nid.to_string(),
+            block_size.to_string(),
+            zone_size.to_string(),
+            zone_cap.to_string(),
+            nr_conv_zones.to_string(),
+            nr_seq_zones.to_string(),
+            max_active_zones.to_string(),
+            max_open_zones.to_string(),
+        ],
+        &run_script::ScriptOptions::new(),
+    )
+    .unwrap();
+    if exit != 0 {
+        return Err((exit, stderr));
+    }
+    Ok(NullBlk(stdout.trim().parse::<u32>().unwrap()))
+}
+
+pub fn delete_nullblk_device(nid: u32) -> i32 {
+    let (exit, _, _) = run_script::run(
+        r#"
+        set -e
+        nid=$1
+        dev="/sys/kernel/config/nullb/nullb$nid"
+
+        echo 0 > "$dev"/power
+        rmdir "$dev"
+        "#,
+        &vec![nid.to_string()],
+        &run_script::ScriptOptions::new(),
+    )
+    .unwrap();
+    exit
+}
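+
+// A minimal usage sketch, mirroring prepare_storage() in tests/zns.rs
+// (the parameter values are the ones used there):
+//
+//     let nullblk =
+//         create_zoned_nullblk_device(4096, 2048, 1077, 0, 16, 14, 14)
+//             .expect("failed to create zoned nullblk device");
+//     let uri = format!("uring:///dev/nullb{}?blk_size=4096", nullblk);
+//     // the device is removed again when `nullblk` is dropped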
+
 pub use io_engine_tests_macros::spdk_test;
diff --git a/io-engine/src/bdev/nexus/mod.rs b/io-engine/src/bdev/nexus/mod.rs
index 4f8624cd0..9e511cfa7 100644
--- a/io-engine/src/bdev/nexus/mod.rs
+++ b/io-engine/src/bdev/nexus/mod.rs
@@ -8,7 +8,7 @@ use futures::{future::Future, FutureExt};
 
 mod nexus_bdev;
 mod nexus_bdev_children;
-mod nexus_bdev_error;
+pub mod nexus_bdev_error;
 mod nexus_bdev_rebuild;
 mod nexus_bdev_snapshot;
 mod nexus_channel;
diff --git a/io-engine/src/bdev/nexus/nexus_bdev_error.rs b/io-engine/src/bdev/nexus/nexus_bdev_error.rs
index 23eede96f..1f3531b6b 100644
--- a/io-engine/src/bdev/nexus/nexus_bdev_error.rs
+++ b/io-engine/src/bdev/nexus/nexus_bdev_error.rs
@@ -15,7 +15,7 @@ use crate::{
 /// Common errors for nexus basic operations and child operations
 /// which are part of nexus object.
 #[derive(Debug, Snafu)]
-#[snafu(visibility(pub(crate)), context(suffix(false)), module(nexus_err))]
+#[snafu(visibility(pub), context(suffix(false)), module(nexus_err))]
 pub enum Error {
     #[snafu(display("Nexus {} does not exist", name))]
     NexusNotFound { name: String },
diff --git a/io-engine/tests/zns.rs b/io-engine/tests/zns.rs
new file mode 100644
index 000000000..d93b1e0f1
--- /dev/null
+++ b/io-engine/tests/zns.rs
@@ -0,0 +1,190 @@
+use lazy_static::lazy_static;
+use once_cell::sync::OnceCell;
+use std::{convert::TryFrom, sync::Mutex};
+
+extern crate libnvme_rs;
+
+use io_engine::{
+    bdev::nexus::{nexus_bdev_error::Error, nexus_create, nexus_lookup_mut},
+    core::{MayastorCliArgs, Protocol, UntypedBdevHandle},
+};
+
+pub mod common;
+use common::compose::MayastorTest;
+use io_engine_tests::NullBlk;
+use run_script::{self};
+
+// TODO: Also test pcie and nvmf
+//static BDEVNAME1: &str = "pcie:///0000:00:03.0";
+//static BDEVNAME1: &str = "nvmf://192.168.0.1:4420/nvmet-always";
+lazy_static! {
+    static ref BDEVNAME1: Mutex<String> = Mutex::new(String::new());
+}
+fn get_bdevname1() -> String {
+    BDEVNAME1.lock().unwrap().clone()
+}
+fn set_bdevname1(name: String) {
+    *BDEVNAME1.lock().unwrap() = name;
+}
+
+static DISKNAME2: &str = "/tmp/disk2.img";
+static BDEVNAME2: &str = "uring:///tmp/disk2.img?blk_size=4096";
+
+static MAYASTOR: OnceCell<MayastorTest<'static>> = OnceCell::new();
+
+fn prepare_storage() -> NullBlk {
+    common::delete_file(&[DISKNAME2.into()]);
+    common::truncate_file(DISKNAME2, 64 * 1024);
+    let ret =
+        common::create_zoned_nullblk_device(4096, 2048, 1077, 0, 16, 14, 14);
+    let nullblk_id = ret.unwrap();
+    set_bdevname1(format!("uring:///dev/nullb{}?blk_size=4096", nullblk_id));
+    nullblk_id
+}
+
+fn get_ms() -> &'static MayastorTest<'static> {
+    MAYASTOR.get_or_init(|| MayastorTest::new(MayastorCliArgs::default()))
+}
+
+async fn create_connected_nvmf_nexus(
+    ms: &'static MayastorTest<'static>,
+) -> (libnvme_rs::NvmeTarget, String) {
+    let uri = ms
+        .spawn(async {
+            create_nexus().await;
+            // Claim the bdev
+            let hdl = UntypedBdevHandle::open(&get_bdevname1(), true, true);
+            let nexus = nexus_lookup_mut("nexus").unwrap();
+            let ret = nexus.share(Protocol::Nvmf, None).await.unwrap();
+            drop(hdl);
+            ret
+        })
+        .await;
+
+    // Create and connect NVMF target.
+    let target = libnvme_rs::NvmeTarget::try_from(uri)
+        .unwrap()
+        .with_rand_hostnqn(true);
+
+    target.connect().unwrap();
+
+    let devices = target.block_devices(2).unwrap();
+
+    assert_eq!(devices.len(), 1);
+    (target, devices[0].to_string())
+}
+
+fn fio_run_zoned_verify(device: &str) -> Result<String, String> {
+    println!("Running fio workload ...");
+    // Sequentially writes two zones, resets them, writes them again and
+    // reads the data back to verify it via crc32 checksums.
+    let (exit, stdout, stderr) = run_script::run(
+        r#"
+        fio --name=zonedwrite --rw=write --ioengine=libaio --direct=1 --zonemode=zbd \
+            --size=2z --io_size=4z --bs=128k --verify=crc32 --filename=$1
+        "#,
+        &vec![device.into()],
+        &run_script::ScriptOptions::new(),
+    ).unwrap();
+
+    if exit == 0 {
+        Ok(stdout)
+    } else {
+        Err(stderr)
+    }
+}
+
+fn blkzone(device: &str, subcommand: &str) -> Result<String, String> {
+    let (exit, stdout, stderr) = run_script::run(
+        r#"
+        blkzone $1 $2
+        "#,
+        &vec![subcommand.into(), device.into()],
+        &run_script::ScriptOptions::new(),
+    )
+    .unwrap();
+
+    if exit == 0 {
+        Ok(stdout)
+    } else {
+        Err(stderr)
+    }
+}
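+
+// `blkzone report` prints every zone with its start LBA, write pointer and
+// condition; `blkzone reset` resets all zones of the device. Exercising both
+// against the exported nexus covers the zone management passthru path.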
+
+#[tokio::test]
+async fn zns_fio() {
+    let ms = get_ms();
+
+    let _nullblk_id = prepare_storage();
+    let (target, nvmf_dev) = create_connected_nvmf_nexus(ms).await;
+
+    let fio_result = fio_run_zoned_verify(&nvmf_dev);
+    match fio_result {
+        Ok(ref ok) => println!("{}", ok),
+        Err(ref err) => println!("{}", err),
+    }
+
+    target.disconnect().unwrap();
+
+    ms.spawn(async move {
+        let mut nexus = nexus_lookup_mut("nexus").unwrap();
+        nexus.as_mut().unshare_nexus().await.unwrap();
+        nexus.destroy().await.unwrap();
+    })
+    .await;
+
+    assert!(fio_result.is_ok());
+}
+
+#[tokio::test]
+async fn zns_blkzone() {
+    let ms = get_ms();
+
+    let _nullblk_id = prepare_storage();
+    let (target, nvmf_dev) = create_connected_nvmf_nexus(ms).await;
+
+    let blkzone_report_result = blkzone(&nvmf_dev, "report");
+    match blkzone_report_result {
+        Ok(ref ok) => println!("{}", ok),
+        Err(ref err) => println!("{}", err),
+    }
+
+    let blkzone_reset_result = blkzone(&nvmf_dev, "reset");
+    match blkzone_reset_result {
+        Ok(ref ok) => println!("{}", ok),
+        Err(ref err) => println!("{}", err),
+    }
+
+    target.disconnect().unwrap();
+
+    ms.spawn(async move {
+        let mut nexus = nexus_lookup_mut("nexus").unwrap();
+        nexus.as_mut().unshare_nexus().await.unwrap();
+        nexus.destroy().await.unwrap();
+    })
+    .await;
+
+    assert!(blkzone_report_result.is_ok());
+    assert!(blkzone_reset_result.is_ok());
+}
+
+#[tokio::test]
+async fn zns_replicated() {
+    let ms = get_ms();
+
+    let _nullblk_id = prepare_storage();
+    let ret = ms.spawn(async { create_replicated_nexus().await }).await;
+
+    assert!(ret.is_err());
+}
+
+async fn create_nexus() {
+    let ch = vec![get_bdevname1()];
+    // TODO: test different sizes and a split nexus
+    nexus_create("nexus", 1024 * 1024 * 1024 * 32, None, &ch)
+        .await
+        .unwrap();
+}
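+
+// Replication across a zoned and a conventional child is not supported yet
+// (see doc/zns.md), so zns_replicated() above expects the nexus creation
+// below to fail.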
+
+async fn create_replicated_nexus() -> Result<(), Error> {
+    let ch = vec![get_bdevname1(), BDEVNAME2.to_string()];
+    nexus_create("nexus", 1024 * 1024 * 1024 * 32, None, &ch).await
+}

From 5995d31dc3ce62bcbdca44610fb0f5c1c97be7e9 Mon Sep 17 00:00:00 2001
From: Dennis Maisenbacher
Date: Mon, 23 Jan 2023 15:29:01 +0100
Subject: [PATCH 09/11] docs: Improve docs for hacking on Mayastor

Signed-off-by: Dennis Maisenbacher
---
 doc/build.md |  8 +++++++-
 doc/run.md   |  4 ++--
 doc/test.md  | 30 ++++++++++++++++++++++++++++--
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/doc/build.md b/doc/build.md
index c68dd5073..90ea78acf 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -91,6 +91,12 @@ $ sudo nixos-rebuild switch --update
 >
 > Don't want to use `nixUnstable`? **That's ok!** Use `nix-shell` and `nix-build` as you normally would.
 
+Check out the submodules:
+
+```bash
+git submodule update --init
+```
+
 **Want to run or hack on Mayastor?** _You need more configuration!_ See
 [running][doc-run], then [testing][doc-test].
 
@@ -127,7 +133,7 @@ cargo build --release
 ```
 
 **Want to run or hack on Mayastor?** _You need more configuration!_ See
-[running][doc-running], then [testing][doc-testing].
+[running][doc-run], then [testing][doc-test].
 
 Whilst `nix develop` allows you to build Mayastor exactly as the image build does, it might not have all the components required for testing. For that you might want to use the explicit shell configuration file, ci.nix:
diff --git a/doc/run.md b/doc/run.md
index bac682cb1..50f3552c0 100644
--- a/doc/run.md
+++ b/doc/run.md
@@ -83,14 +83,14 @@ In order to use the full feature set of Mayastor, some or all of the following c
   ```nix
   # /etc/nixos/configuration.nix
   boot.kernelModules = [
-    "nbd" "xfs" "nvmet" "nvme_fabrics" "nvmet_rdma" "nvme_tcp" "nvme_rdma" "nvme_loop"
+    "nbd" "xfs" "btrfs" "nvmet" "nvme_fabrics" "nvmet_rdma" "nvme_tcp" "nvme_rdma" "nvme_loop"
   ];
   ```
 
   To load these on non-NixOS machines:
 
   ```bash
   modprobe nbd xfs btrfs nvmet nvmet_rdma nvme_fabrics nvme_tcp nvme_rdma nvme_loop
   ```
 
 - For Asymmetric Namespace Access (ANA) support (early preview), the following kernel build configuration needs to be enabled:
diff --git a/doc/test.md b/doc/test.md
index 75b8ce882..dece709f9 100644
--- a/doc/test.md
+++ b/doc/test.md
@@ -14,7 +14,7 @@ Or, for ad-hoc:
 - Ensure several kernel modules are installed:
 
   ```bash
-  modprobe nbd xfs nvmet nvme_fabrics nvmet_rdma nvme_tcp nvme_rdma nvme_loop
+  modprobe nbd xfs btrfs nvmet nvme_fabrics nvmet_rdma nvme_tcp nvme_rdma nvme_loop
   ```
 
 ## Running the test suite
@@ -29,7 +29,33 @@ Mayastor's unit tests, integration tests, and documentation tests via the conven
 Mayastor uses [spdk][spdk] which is quite sensitive to threading.
 This means tests need to run one at a time:
 
 ```bash
-cargo test -- --test-threads 1
+cd io-engine
+cargo test -- --test-threads 1 --nocapture
 ```
 
+## Using your own SPDK version
+
+In order to use your own SPDK version, your SPDK tree must be rebased onto the latest
+`vYY.mm.x-mayastor` branch from the https://github.com/openebs/spdk repo.
+Build SPDK with these instructions inside your nix shell:
+
+```bash
+cd spdk-rs
+git clone https://github.com/openebs/spdk
+cd spdk
+git checkout vYY.mm.x-mayastor
+# Rebase your branch
+git submodule update --init
+cd -
+./build_spdk.sh
+```
+
+Before you run the cargo tests again, make sure spdk-rs is rebuilt:
+
+```bash
+cd ../io-engine
+cargo clean -p spdk-rs
+cargo test -- --test-threads 1 --nocapture
+```
 
 ## Running the end-to-end test suite

From 0e10011a40e27a2baf3ee5e92b2db87494afadf7 Mon Sep 17 00:00:00 2001
From: Dennis Maisenbacher
Date: Fri, 27 Jan 2023 11:41:23 +0100
Subject: [PATCH 10/11] docs: Add doc for introduced zoned storage support

Signed-off-by: Dennis Maisenbacher
---
 doc/zns.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 doc/zns.md

diff --git a/doc/zns.md b/doc/zns.md
new file mode 100644
index 000000000..91dc4cdc6
--- /dev/null
+++ b/doc/zns.md
@@ -0,0 +1,25 @@
+# Zoned Storage Support
+Mayastor supports zoned storage in the form of PCIe ZNS devices and zoned SPDK uring block devices.
+
+## Overview
+Zoned storage is a class of storage that divides its address space into zones, which come with a sequential write constraint. Writes can only be issued at a zone's write pointer, which is advanced by each successful write operation. Once a zone's capacity is reached, the device controller transitions the zone to the 'Full' state, and the zone cannot be rewritten until it is explicitly reset by the user. As of today, zoned storage is available in the form of SMR HDDs and ZNS SSDs; this proposal focuses on ZNS SSDs.
+For more information about zoned storage visit [zonedstorage.io](https://zonedstorage.io/).
+
+Zoned Namespace (ZNS) NVMe SSDs are defined as part of an NVMe Command Set (see 'NVM Express Zoned Namespace Command Set Specification' in the [NVMe Command Set Specifications](https://nvmexpress.org/developers/nvme-command-set-specifications/)) and have been supported since Linux kernel v5.9. SPDK has supported zoned storage since v20.10.
+
+Because ZNS SSDs align their flash media with zones, no on-device garbage collection is needed. Compared to conventional SSDs, this results in better throughput, predictable latency and a higher capacity per dollar (since over-provisioning and DRAM for page mapping are not needed).
+
+The concept of ZNS SSDs and its advantages are discussed in depth in the ['ZNS: Avoiding the Block Interface Tax for Flash-based SSDs'](https://www.usenix.org/conference/atc21/presentation/bjorling) paper.
+
+[RocksDB](https://github.com/facebook/rocksdb) and [TerarkDB](https://github.com/bytedance/terarkdb) are example applications with end-to-end zoned storage integration through [ZenFS](https://github.com/westerndigitalcorporation/zenfs).
+POSIX file systems like f2fs and btrfs also have zone support.
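+
+The following sketch illustrates the zone state handling described above. It is illustrative only; the type and field names are invented for this example and are not Mayastor or SPDK APIs:
+
+```rust
+struct Zone {
+    start: u64,         // first LBA of the zone
+    capacity: u64,      // writable blocks, may be smaller than the zone size
+    write_pointer: u64, // next LBA that may be written
+}
+
+impl Zone {
+    /// Sequential write constraint: a write must start exactly at the write
+    /// pointer and advances it. Once the capacity is consumed the zone is
+    /// 'Full' and every further write is rejected.
+    fn write(&mut self, lba: u64, blocks: u64) -> Result<(), ()> {
+        if lba != self.write_pointer
+            || self.write_pointer + blocks > self.start + self.capacity
+        {
+            return Err(());
+        }
+        self.write_pointer += blocks;
+        Ok(())
+    }
+
+    /// Resetting a zone rewinds its write pointer; only then can the zone be
+    /// written again from the start.
+    fn reset(&mut self) {
+        self.write_pointer = self.start;
+    }
+}
+```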
+
+## Requirements for Mayastor
+Initially, ZNS support in Mayastor targets the non-replicated volume I/O path with volume partitioning disabled.
+Replication and volume partitioning can be addressed later on, as those features require special care with regard to the sequential write constraint and the device's max active zones and max open zones restrictions.
+
+The NexusChild of a non-replicated Nexus should allow ZNS NVMe devices via the PCIe URI scheme as well as zoned SPDK uring devices via the uring URI scheme. This automatically results in a zoned nexus, which is exposed to the user as a raw zoned NVMe-oF target or formatted with btrfs.
+
+## Prerequisites
+- Linux kernel v5.15.68 or higher is needed because of the patch [nvmet: fix mar and mor off-by-one errors](https://lore.kernel.org/lkml/20220906073929.3292899-1-Dennis.Maisenbacher@wdc.com/)
+- SPDK 23.01 is needed because of [ZNS support for NVMe-oF](https://review.spdk.io/gerrit/c/spdk/spdk/+/16044/7)

From 344c3fb181dda66e1ebc6418df4cecebea0e210e Mon Sep 17 00:00:00 2001
From: Dennis Maisenbacher
Date: Mon, 20 Mar 2023 11:22:27 +0100
Subject: [PATCH 11/11] feat(nvme): On command completion handle zoned nvme
 errors

Zone-related command-specific status codes should not take out the
nexus.

Signed-off-by: Dennis Maisenbacher
---
 io-engine/src/bdev/nexus/nexus_io.rs |  8 ++++++--
 io-engine/src/core/mod.rs            | 30 +++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/io-engine/src/bdev/nexus/nexus_io.rs b/io-engine/src/bdev/nexus/nexus_io.rs
index c6e8632e3..aad02ee47 100644
--- a/io-engine/src/bdev/nexus/nexus_io.rs
+++ b/io-engine/src/bdev/nexus/nexus_io.rs
@@ -29,6 +29,7 @@ use crate::core::{
     CoreError,
     Cores,
     IoCompletionStatus,
+    is_zoned_nvme_error,
     IoStatus,
     IoSubmissionFailure,
     IoType,
@@ -271,9 +272,12 @@ impl<'n> NexusBio<'n> {
             self.ctx_mut().successful += 1;
         } else {
             self.ctx_mut().status = IoStatus::Failed;
-            self.ctx_mut().failed += 1;
 
-            self.completion_error(child, status);
+            // Don't take the zoned child out on zone-related NVMe errors.
+            if !is_zoned_nvme_error(status) {
+                self.ctx_mut().failed += 1;
+                self.completion_error(child, status);
+            }
         }
 
         if self.ctx().in_flight > 0 {
diff --git a/io-engine/src/core/mod.rs b/io-engine/src/core/mod.rs
index e141aad16..2bba3ba56 100644
--- a/io-engine/src/core/mod.rs
+++ b/io-engine/src/core/mod.rs
@@ -86,7 +86,17 @@ pub use snapshot::{
     SnapshotXattrs,
 };
 
-use spdk_rs::libspdk::SPDK_NVME_SC_CAPACITY_EXCEEDED;
+use spdk_rs::libspdk::{
+    SPDK_NVME_SC_CAPACITY_EXCEEDED,
+    SPDK_NVME_SC_ZONED_BOUNDARY_ERROR,
+    SPDK_NVME_SC_ZONE_IS_FULL,
+    SPDK_NVME_SC_ZONE_IS_READ_ONLY,
+    SPDK_NVME_SC_ZONE_IS_OFFLINE,
+    SPDK_NVME_SC_ZONE_INVALID_WRITE,
+    SPDK_NVME_SC_TOO_MANY_ACTIVE_ZONES,
+    SPDK_NVME_SC_TOO_MANY_OPEN_ZONES,
+    SPDK_NVME_SC_INVALID_ZONE_STATE_TRANSITION,
+};
 
 mod bdev;
 mod block_device;
@@ -528,6 +538,24 @@ impl From<NvmeStatus> for IoCompletionStatus {
     }
 }
 
+/// Returns true if the given IoCompletionStatus NvmeError matches one of the
+/// Zoned Namespace Command Specific Status Codes.
+pub fn is_zoned_nvme_error(status: IoCompletionStatus) -> bool {
+    match status {
+        IoCompletionStatus::NvmeError(NvmeStatus::CmdSpecific(cssc)) => {
+            match cssc {
+                SPDK_NVME_SC_ZONED_BOUNDARY_ERROR
+                | SPDK_NVME_SC_ZONE_IS_FULL
+                | SPDK_NVME_SC_ZONE_IS_READ_ONLY
+                | SPDK_NVME_SC_ZONE_IS_OFFLINE
+                | SPDK_NVME_SC_ZONE_INVALID_WRITE
+                | SPDK_NVME_SC_TOO_MANY_ACTIVE_ZONES
+                | SPDK_NVME_SC_TOO_MANY_OPEN_ZONES
+                | SPDK_NVME_SC_INVALID_ZONE_STATE_TRANSITION => true,
+                _ => false,
+            }
+        }
+        _ => false,
+    }
+}
+
 // TODO move this elsewhere ASAP
 pub static PAUSING: AtomicUsize = AtomicUsize::new(0);
 pub static PAUSED: AtomicUsize = AtomicUsize::new(0);