diff --git a/Cargo.lock b/Cargo.lock
index ce860f7a1..9d0a0f51e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1168,6 +1168,12 @@ dependencies = [
  "percent-encoding",
 ]

+[[package]]
+name = "fs_extra"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394"
+
 [[package]]
 name = "fsio"
 version = "0.4.0"
@@ -1606,6 +1612,7 @@ dependencies = [
  "io-engine-tests",
  "io-uring",
  "ioctl-gen",
+ "jemalloc-sys",
  "jsonrpc",
  "lazy_static",
  "libc",
@@ -1837,6 +1844,17 @@ version = "1.0.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"

+[[package]]
+name = "jemalloc-sys"
+version = "0.5.2+5.3.0-patched"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "134163979b6eed9564c98637b710b40979939ba351f59952708234ea11b5f3f8"
+dependencies = [
+ "cc",
+ "fs_extra",
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.64"
diff --git a/ci.nix b/ci.nix
index b0b4bbbb6..c04f46e45 100644
--- a/ci.nix
+++ b/ci.nix
@@ -26,6 +26,7 @@ mkShell {
   buildInputs = [
     autoconf
     automake
+    btrfs-progs
     clang
     cowsay
     docker
diff --git a/doc/build.md b/doc/build.md
index c68dd5073..90ea78acf 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -91,6 +91,12 @@ $ sudo nixos-rebuild switch --update
 >
 > Don't want to use `nixUnstable`? **That's ok!** Use `nix-shell` and `nix-build` as you normally would.

+Check out the submodules:
+
+```bash
+git submodule update --init
+```
+
 **Want to run or hack on Mayastor?** _You need more configuration!_ See
 [running][doc-run], then [testing][doc-test].

@@ -127,7 +133,7 @@ cargo build --release
 ```

 **Want to run or hack on Mayastor?** _You need more configuration!_ See
-[running][doc-running], then [testing][doc-testing].
+[running][doc-run], then [testing][doc-test].

 Whilst the nix develop will allow you to build mayastor exactly as the image build, it might not have all the necessary components required for testing. For that you might want to use the explicit shell configuration file: ci.nix:
diff --git a/doc/run.md b/doc/run.md
index bac682cb1..50f3552c0 100644
--- a/doc/run.md
+++ b/doc/run.md
@@ -83,14 +83,14 @@ In order to use the full feature set of Mayastor, some or all of the following c
 ```nix
 # /etc/nixos/configuration.nix
 boot.kernelModules = [
-  "nbd" "xfs" "nvmet" "nvme_fabrics" "nvmet_rdma" "nvme_tcp" "nvme_rdma" "nvme_loop"
+  "nbd" "xfs" "btrfs" "nvmet" "nvme_fabrics" "nvmet_rdma" "nvme_tcp" "nvme_rdma" "nvme_loop"
 ];
 ```

 To load these on non-NixOS machines:

 ```bash
-  modprobe nbd nvmet nvmet_rdma nvme_fabrics nvme_tcp nvme_rdma nvme_loop
+  modprobe nbd xfs btrfs nvmet nvmet_rdma nvme_fabrics nvme_tcp nvme_rdma nvme_loop
 ```

 - For Asymmetric Namespace Access (ANA) support (early preview), the following kernel build configuration enabled:
diff --git a/doc/test.md b/doc/test.md
index 8000b5fdc..dece709f9 100644
--- a/doc/test.md
+++ b/doc/test.md
@@ -14,7 +14,7 @@ Or, for ad-hoc:
 - Ensure several kernel modules are installed:

 ```bash
-modprobe nbd xfs nvmet nvme_fabrics nvmet_rdma nvme_tcp nvme_rdma nvme_loop
+modprobe nbd xfs btrfs nvmet nvme_fabrics nvmet_rdma nvme_tcp nvme_rdma nvme_loop
 ```

 ## Running the test suite

@@ -29,7 +29,33 @@ Mayastor's unit tests, integration tests, and documentation tests via the conven
 Mayastor uses [spdk][spdk] which is quite senistive to threading.
 This means tests need to run one at a time:

 ```bash
-cargo test -- --test-threads 1
+cd io-engine
+cargo test -- --test-threads 1 --nocapture
+```
+
+## Using your own SPDK version
+
+In order to use your own SPDK version, your SPDK tree must be rebased onto the latest `vYY.mm.x-mayastor`
+branch from the https://github.com/openebs/spdk repo.
+Build SPDK with these instructions inside your nix shell:
+
+```bash
+cd spdk-rs
+git clone https://github.com/openebs/spdk
+cd spdk
+git checkout vYY.mm.x-mayastor
+# Rebase your branch
+git submodule update --init
+cd -
+./build_spdk.sh
+```
+
+Before you run the cargo tests again, make sure spdk-rs is rebuilt:
+
+```bash
+cd ../io-engine
+cargo clean -p spdk-rs
+cargo test -- --test-threads 1 --nocapture
 ```

 ## Running the end-to-end test suite
@@ -55,6 +81,29 @@ Then, to run the tests:
 ./node_modules/mocha/bin/mocha test_csi.js
 ```

+## Using PCIe NVMe devices in cargo tests while developing
+
+When developing new features, it can come in handy to test them with real PCIe devices.
+To do so, the PCIe device first needs to be bound to the vfio driver:
+
+```bash
+sudo PCI_ALLOWED="" ./spdk-rs/spdk/scripts/setup.sh
+```
+
+The bdev name in the cargo test case can then follow the PCIe URI pattern:
+
+```rust
+static BDEVNAME1: &str = "pcie:///";
+```
+
+After testing, the device may be rebound to the NVMe driver:
+
+```bash
+sudo PCI_ALLOWED="" ./spdk-rs/spdk/scripts/setup.sh reset
+```
+
+Please do not submit pull requests with cargo test cases that require PCIe devices to be present.
+
 [spdk]: https://spdk.io/
 [doc-run]: ./run.md
 [mocha]: https://mochajs.org/
diff --git a/doc/zns.md b/doc/zns.md
new file mode 100644
index 000000000..91dc4cdc6
--- /dev/null
+++ b/doc/zns.md
@@ -0,0 +1,25 @@
+# Zoned Storage Support
+Mayastor supports zoned storage in the form of PCIe ZNS devices and zoned SPDK uring block devices.
+
+## Overview
+Zoned storage is a class of storage that divides its address space into zones. These zones come with a sequential write constraint: writes can only be issued at the zone's write pointer, which is advanced on each successful write operation. Once the zone's capacity is reached, the device controller transitions the zone to the 'Full' state, and it cannot be rewritten until it is explicitly reset by the user. As of now, zoned storage is available in the form of SMR HDDs and ZNS SSDs; this proposal focuses on ZNS SSDs.
+For more information about zoned storage visit [zonedstorage.io](https://zonedstorage.io/).
+
+Zoned Namespace (ZNS) NVMe SSDs are defined as part of an NVMe Command Set (see 'NVM Express Zoned Namespace Command Set Specification' in the [NVMe Command Set Specifications](https://nvmexpress.org/developers/nvme-command-set-specifications/)) and have been supported since Linux kernel v5.9. SPDK has supported zoned storage since v20.10.
+
+Because ZNS SSDs align their flash media with zones, no on-device garbage collection is needed. This results in better throughput, predictable latency, and more capacity per dollar (since over-provisioning and DRAM for page mapping are not needed) in comparison to conventional SSDs.
+
+The concept of ZNS SSDs and their advantages is discussed in depth in the ['ZNS: Avoiding the Block Interface Tax for Flash-based SSDs'](https://www.usenix.org/conference/atc21/presentation/bjorling) paper.
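+To make the write-pointer rule above concrete, the following minimal sketch models a single zone. It is illustrative only; the `Zone` type and its methods are hypothetical and not part of any Mayastor or SPDK API:
+
+```rust
+/// Illustrative model of one zone; not a Mayastor or SPDK type.
+#[derive(Debug, PartialEq)]
+enum ZoneState {
+    Empty,
+    Open,
+    Full,
+}
+
+struct Zone {
+    start_lba: u64,     // first logical block of the zone
+    capacity: u64,      // writable blocks, <= zone size
+    write_pointer: u64, // next writable logical block
+    state: ZoneState,
+}
+
+impl Zone {
+    fn write(&mut self, lba: u64, blocks: u64) -> Result<(), &'static str> {
+        if self.state == ZoneState::Full {
+            return Err("zone is full and must be reset before rewriting");
+        }
+        if lba != self.write_pointer {
+            return Err("writes are only accepted at the write pointer");
+        }
+        if self.write_pointer + blocks > self.start_lba + self.capacity {
+            return Err("zone boundary error");
+        }
+        self.write_pointer += blocks; // a successful write advances the pointer
+        self.state = if self.write_pointer == self.start_lba + self.capacity {
+            ZoneState::Full // the controller transitions the zone to 'Full'
+        } else {
+            ZoneState::Open
+        };
+        Ok(())
+    }
+
+    fn reset(&mut self) {
+        // An explicit zone reset rewinds the write pointer.
+        self.write_pointer = self.start_lba;
+        self.state = ZoneState::Empty;
+    }
+}
+
+fn main() {
+    let mut zone = Zone {
+        start_lba: 0,
+        capacity: 4,
+        write_pointer: 0,
+        state: ZoneState::Empty,
+    };
+    assert!(zone.write(0, 4).is_ok()); // sequential write up to capacity
+    assert!(zone.write(0, 1).is_err()); // a full zone rejects further writes
+    zone.reset();
+    assert!(zone.write(0, 1).is_ok()); // writable again after reset
+}
+```
+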
+[RocksDB](https://github.com/facebook/rocksdb) and [TerarkDB](https://github.com/bytedance/terarkdb) are example applications of end-to-end integration with zoned storage through [ZenFS](https://github.com/westerndigitalcorporation/zenfs).
+POSIX file systems like f2fs and btrfs also have zone support.
+
+## Requirements for Mayastor
+Initially, ZNS support in Mayastor targets the non-replicated volume I/O path with volume partitioning disabled.
+Replication and volume partitioning can be addressed later on, as those features require special care with regard to the sequential write constraint and the device's max active zones and max open zones restrictions.
+
+The NexusChild of a non-replicated Nexus should allow ZNS NVMe devices via the PCIe URI scheme as well as zoned SPDK uring devices via the uring URI scheme. This automatically results in a zoned nexus, which is exposed to the user as a raw zoned NVMe-oF target or formatted with btrfs.
+
+## Prerequisites
+- Linux kernel v5.15.68 or higher is needed because of the patch [nvmet: fix mar and mor off-by-one errors](https://lore.kernel.org/lkml/20220906073929.3292899-1-Dennis.Maisenbacher@wdc.com/)
+- SPDK 23.01 is needed because of [ZNS support for NVMe-oF](https://review.spdk.io/gerrit/c/spdk/spdk/+/16044/7)
diff --git a/io-engine-tests/src/lib.rs b/io-engine-tests/src/lib.rs
index f3163c71d..4d059d06e 100644
--- a/io-engine-tests/src/lib.rs
+++ b/io-engine-tests/src/lib.rs
@@ -4,7 +4,7 @@
 //! panic macros. The caller can decide how to handle the error appropriately.
 //! Panics and asserts in this file are still ok for usage & programming errors.

-use std::{io, io::Write, process::Command, time::Duration};
+use std::{fmt, io, io::Write, process::Command, time::Duration};

 use crossbeam::channel::{after, select, unbounded};
 use once_cell::sync::OnceCell;
@@ -143,7 +143,7 @@ pub fn mayastor_test_init_ex(log_format: LogFormat) {
     })
 }

-    ["dd", "mkfs.xfs", "mkfs.ext4", "cmp", "fsck", "truncate"]
+    ["dd", "mkfs.xfs", "mkfs.ext4", "mkfs.btrfs", "cmp", "fsck", "truncate"]
         .iter()
         .for_each(|binary| {
             if binary_present(binary).is_err() {
@@ -202,8 +202,9 @@ pub fn fscheck(device: &str) {
 pub fn mkfs(path: &str, fstype: &str) -> bool {
     let (fs, args) = match fstype {
-        "xfs" => ("mkfs.xfs", ["-f", path]),
-        "ext4" => ("mkfs.ext4", ["-F", path]),
+        "xfs" => ("mkfs.xfs", vec!["-f", path]),
+        "ext4" => ("mkfs.ext4", vec!["-F", path]),
+        "btrfs" => ("mkfs.btrfs", vec!["-f", "-m", "single", "-d", "single", path]),
         _ => {
             panic!("unsupported fstype");
         }
@@ -568,4 +569,114 @@ macro_rules! test_diag {
     }}
 }

+/// The null block device driver emulates block devices and is used for benchmarking and testing.
+/// https://docs.kernel.org/block/null_blk.html
+pub struct NullBlk(u32);
+impl Drop for NullBlk {
+    fn drop(&mut self) {
+        delete_nullblk_device(self.0);
+    }
+}
+impl fmt::Display for NullBlk {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+/// Create a zoned nullblk device with the given parameters. This emulated device exists entirely
+/// in memory.
+pub fn create_zoned_nullblk_device(
+    block_size: u32,
+    zone_size: u32,
+    zone_cap: u32,
+    nr_conv_zones: u32,
+    nr_seq_zones: u32,
+    max_active_zones: u32,
+    max_open_zones: u32,
+) -> Result<NullBlk, (i32, String)> {
+    // Get the next free nullblk device number
+    let mut nid = 1;
+    while std::path::Path::new(&format!(
+        "/sys/kernel/config/nullb/nullb{}",
+        nid
+    ))
+    .exists()
+    {
+        nid += 1;
+    }
+    let (exit, stdout, stderr) = run_script::run(
+        r#"
+        set -e
+        modprobe null_blk nr_devices=0 > /dev/null || return $?
+        nid=$1
+        bs=$2
+        zs=$3
+        zc=$4
+        nr_conv=$5
+        nr_seq=$6
+        max_active_zones=$7
+        max_open_zones=$8
+
+        cap=$(( zs * (nr_conv + nr_seq) ))
+
+        dev="/sys/kernel/config/nullb/nullb$nid"
+        mkdir "$dev"
+
+        echo $bs > "$dev"/blocksize
+        echo 0 > "$dev"/completion_nsec
+        echo 0 > "$dev"/irqmode
+        echo 2 > "$dev"/queue_mode
+        echo 1024 > "$dev"/hw_queue_depth
+        echo 1 > "$dev"/memory_backed
+        echo 1 > "$dev"/zoned
+
+        echo $cap > "$dev"/size
+        echo $zs > "$dev"/zone_size
+        echo $zc > "$dev"/zone_capacity
+        echo $nr_conv > "$dev"/zone_nr_conv
+        echo $max_active_zones > "$dev"/zone_max_active
+        echo $max_open_zones > "$dev"/zone_max_open
+
+        echo 1 > "$dev"/power
+
+        echo mq-deadline > /sys/block/nullb$nid/queue/scheduler
+
+        echo "$nid"
+        "#,
+        &vec![
+            nid.to_string(),
+            block_size.to_string(),
+            zone_size.to_string(),
+            zone_cap.to_string(),
+            nr_conv_zones.to_string(),
+            nr_seq_zones.to_string(),
+            max_active_zones.to_string(),
+            max_open_zones.to_string(),
+        ],
+        &run_script::ScriptOptions::new(),
+    )
+    .unwrap();
+    if exit != 0 {
+        return Err((exit, stderr));
+    }
+    return Ok(NullBlk(stdout.trim().parse::<u32>().unwrap()));
+}
+
+pub fn delete_nullblk_device(nid: u32) -> i32 {
+    let (exit, _, _) = run_script::run(
+        r#"
+        set -e
+        nid=$1
+        dev="/sys/kernel/config/nullb/nullb$nid"
+
+        echo 0 > "$dev"/power
+        rmdir $dev
+        "#,
+        &vec![nid.to_string()],
+        &run_script::ScriptOptions::new(),
+    )
+    .unwrap();
+    exit
+}
+
 pub use io_engine_tests_macros::spdk_test;
diff --git a/io-engine/Cargo.toml b/io-engine/Cargo.toml
index e4e1df05a..644170c8a 100644
--- a/io-engine/Cargo.toml
+++ b/io-engine/Cargo.toml
@@ -98,6 +98,7 @@ async-process = { version = "1.8.1" }
 rstack = { version = "0.3.3" }
 tokio-stream = "0.1.14"
 rustls = "0.21.12"
+jemalloc-sys = "0.5.2+5.3.0-patched"
 devinfo = { path = "../utils/dependencies/devinfo" }
 jsonrpc = { path = "../jsonrpc"}
diff --git a/io-engine/src/bdev/device.rs b/io-engine/src/bdev/device.rs
index f0fdde754..f714611f2 100644
--- a/io-engine/src/bdev/device.rs
+++ b/io-engine/src/bdev/device.rs
@@ -4,6 +4,7 @@
 use std::{
     collections::HashMap,
     convert::TryFrom,
+    mem,
     os::raw::c_void,
     sync::{Arc, Mutex},
 };
@@ -17,14 +18,45 @@ use spdk_rs::{
         spdk_bdev_comparev_blocks,
         spdk_bdev_flush,
         spdk_bdev_free_io,
+        spdk_bdev_get_zone_info,
         spdk_bdev_io,
+        spdk_bdev_nvme_io_passthru,
         spdk_bdev_readv_blocks_with_flags,
         spdk_bdev_reset,
        spdk_bdev_unmap_blocks,
         spdk_bdev_write_zeroes_blocks,
         spdk_bdev_writev_blocks,
+        spdk_bdev_zone_info,
+        spdk_bdev_zone_management,
+        SPDK_BDEV_ZONE_CLOSE,
+        SPDK_BDEV_ZONE_FINISH,
+        SPDK_BDEV_ZONE_OFFLINE,
+        SPDK_BDEV_ZONE_OPEN,
+        SPDK_BDEV_ZONE_RESET,
+        SPDK_BDEV_ZONE_STATE_CLOSED,
+        SPDK_BDEV_ZONE_STATE_EMPTY,
+        SPDK_BDEV_ZONE_STATE_EXP_OPEN,
+        SPDK_BDEV_ZONE_STATE_FULL,
+        SPDK_BDEV_ZONE_STATE_IMP_OPEN,
+        SPDK_BDEV_ZONE_STATE_OFFLINE,
+        SPDK_BDEV_ZONE_STATE_READ_ONLY,
         SPDK_NVME_IO_FLAGS_UNWRITTEN_READ_FAIL,
         SPDK_NVME_IO_FLAG_CURRENT_UNWRITTEN_READ_FAIL,
+        SPDK_NVME_ZONE_STATE_CLOSED,
+        SPDK_NVME_ZONE_STATE_EMPTY,
+        SPDK_NVME_ZONE_STATE_EOPEN,
+        SPDK_NVME_ZONE_STATE_FULL,
+        SPDK_NVME_ZONE_STATE_IOPEN,
+        SPDK_NVME_ZONE_STATE_OFFLINE,
+        SPDK_NVME_ZONE_STATE_RONLY,
+        SPDK_NVME_ZRA_LIST_ALL,
+        SPDK_NVME_ZRA_LIST_ZSC,
+        SPDK_NVME_ZRA_LIST_ZSE,
+        SPDK_NVME_ZRA_LIST_ZSEO,
+        SPDK_NVME_ZRA_LIST_ZSF,
+        SPDK_NVME_ZRA_LIST_ZSIO,
+        SPDK_NVME_ZRA_LIST_ZSO,
+        SPDK_NVME_ZRA_LIST_ZSRO,
     },
     nvme_admin_opc,
     AsIoVecPtr,
@@ -53,6 +85,7 @@ use crate::{
         IoCompletionCallback,
         IoCompletionCallbackArg,
         IoCompletionStatus,
+        NvmeCmdOpc,
         NvmeStatus,
         ReadOptions,
         SnapshotParams,
@@ -60,8 +93,10 @@
         UntypedBdev,
         UntypedBdevHandle,
         UntypedDescriptorGuard,
+        ZonedBlockDevice,
     },
     lvs::Lvol,
+    ffihelper::FfiResult,
 };

 #[cfg(feature = "fault-injection")]
@@ -72,6 +107,8 @@ use crate::core::fault_injection::{
     InjectIoCtx,
 };

+use jemalloc_sys::{calloc, free};
+
 /// TODO
 type EventDispatcherMap = HashMap<String, DeviceEventDispatcher>;

@@ -82,6 +119,69 @@
 static BDEV_EVENT_DISPATCHER: Lazy<Mutex<EventDispatcherMap>> =

 // Memory pool for bdev I/O context.
 static BDEV_IOCTX_POOL: OnceCell<MemoryPool<IoCtx>> = OnceCell::new();

+/// TODO
+fn bdev_zone_state_to_nvme_zns_zone_state(
+    bdev_zone_state: u32,
+) -> Result<u32, CoreError> {
+    match bdev_zone_state {
+        SPDK_BDEV_ZONE_STATE_EMPTY => Ok(SPDK_NVME_ZONE_STATE_EMPTY),
+        SPDK_BDEV_ZONE_STATE_IMP_OPEN => Ok(SPDK_NVME_ZONE_STATE_IOPEN),
+        SPDK_BDEV_ZONE_STATE_FULL => Ok(SPDK_NVME_ZONE_STATE_FULL),
+        SPDK_BDEV_ZONE_STATE_CLOSED => Ok(SPDK_NVME_ZONE_STATE_CLOSED),
+        SPDK_BDEV_ZONE_STATE_READ_ONLY => Ok(SPDK_NVME_ZONE_STATE_RONLY),
+        SPDK_BDEV_ZONE_STATE_OFFLINE => Ok(SPDK_NVME_ZONE_STATE_OFFLINE),
+        SPDK_BDEV_ZONE_STATE_EXP_OPEN => Ok(SPDK_NVME_ZONE_STATE_EOPEN),
+        _ => {
+            error!("Can't map SPDK_BDEV_ZONE_STATE {bdev_zone_state} to any SPDK_NVME_ZONE_STATE");
+            Err(CoreError::NvmeIoPassthruDispatch {
+                source: Errno::EINVAL,
+                opcode: NvmeCmdOpc::ZoneMgmtReceive as u16,
+            })
+        }
+    }
+}
+
+/// TODO
+fn zone_send_action_to_bdev_zone_action(
+    zone_send_action: u8,
+) -> Result<u32, CoreError> {
+    match zone_send_action {
+        0x01 => Ok(SPDK_BDEV_ZONE_CLOSE),
+        0x02 => Ok(SPDK_BDEV_ZONE_FINISH),
+        0x03 => Ok(SPDK_BDEV_ZONE_OPEN),
+        0x04 => Ok(SPDK_BDEV_ZONE_RESET),
+        0x05 => Ok(SPDK_BDEV_ZONE_OFFLINE),
+        _ => {
+            error!(
+                "Cannot map Zone Send Action {} to any spdk_bdev_zone_action",
+                zone_send_action
+            );
+            Err(CoreError::NvmeIoPassthruDispatch {
+                source: Errno::EINVAL,
+                opcode: NvmeCmdOpc::ZoneMgmtSend as u16,
+            })
+        }
+    }
+}
+
+/// TODO
+fn is_zra_list_matching_zone_state(
+    zra_report_opt: u32,
+    zns_zone_state: u32,
+) -> bool {
+    match (zra_report_opt, zns_zone_state) {
+        (SPDK_NVME_ZRA_LIST_ALL, _) => true,
+        (SPDK_NVME_ZRA_LIST_ZSE, SPDK_NVME_ZONE_STATE_EMPTY) => true,
+        (SPDK_NVME_ZRA_LIST_ZSIO, SPDK_NVME_ZONE_STATE_IOPEN) => true,
+        (SPDK_NVME_ZRA_LIST_ZSEO, SPDK_NVME_ZONE_STATE_EOPEN) => true,
+        (SPDK_NVME_ZRA_LIST_ZSC, SPDK_NVME_ZONE_STATE_CLOSED) => true,
+        (SPDK_NVME_ZRA_LIST_ZSF, SPDK_NVME_ZONE_STATE_FULL) => true,
+        (SPDK_NVME_ZRA_LIST_ZSRO, SPDK_NVME_ZONE_STATE_RONLY) => true,
+        (SPDK_NVME_ZRA_LIST_ZSO, SPDK_NVME_ZONE_STATE_OFFLINE) => true,
+        _ => false,
+    }
+}
+
 /// Wrapper around native SPDK block devices, which mimics target SPDK block
 /// device as an abstract BlockDevice instance.
 #[derive(Copy, Clone)]
@@ -96,7 +196,7 @@ impl SpdkBlockDevice {
     pub fn lookup_by_name(name: &str) -> Option<Box<dyn BlockDevice>> {
         debug!("Searching SPDK devices for '{}'...", name);
         let bdev = UntypedBdev::lookup_by_name(name)?;
-        debug!("SPDK {} device found: '{}'", bdev.driver(), name);
+        debug!("SPDK {} device found: '{}'", bdev.driver(), bdev.name());
         Some(Box::new(SpdkBlockDevice::new(bdev)))
     }

@@ -146,8 +246,16 @@ impl BlockDevice for SpdkBlockDevice {
     }
     /// returns true if the IO type is supported
     fn io_type_supported(&self, io_type: IoType) -> bool {
+        match io_type {
+            IoType::NvmeIo => true,
+            _ => self.io_type_supported_by_device(io_type),
+        }
+    }
+
+    fn io_type_supported_by_device(&self, io_type: IoType) -> bool {
         self.0.io_type_supported(io_type)
     }
+
     /// returns the IO statistics
     async fn io_stats(&self) -> Result<BlockDeviceIoStats, CoreError> {
         self.0.stats_async().await
@@ -177,6 +285,37 @@
     }
 }

+#[async_trait(?Send)]
+impl ZonedBlockDevice for SpdkBlockDevice {
+    fn is_zoned(&self) -> bool {
+        self.0.is_zoned()
+    }
+
+    fn zone_size(&self) -> u64 {
+        self.0.zone_size()
+    }
+
+    fn num_zones(&self) -> u64 {
+        self.0.num_zones()
+    }
+
+    fn max_zone_append_size(&self) -> u32 {
+        self.0.max_zone_append_size()
+    }
+
+    fn max_open_zones(&self) -> u32 {
+        self.0.max_open_zones()
+    }
+
+    fn max_active_zones(&self) -> u32 {
+        self.0.max_active_zones()
+    }
+
+    fn optimal_open_zones(&self) -> u32 {
+        self.0.optimal_open_zones()
+    }
+}
+
 /// Wrapper around native SPDK block device descriptor, which mimics target SPDK
 /// descriptor as an abstract BlockDeviceDescriptor instance.
 struct SpdkBlockDeviceDescriptor(Arc<UntypedDescriptorGuard>);
@@ -587,6 +726,360 @@ impl BlockDeviceHandle for SpdkBlockDeviceHandle {
         })
     }

+    fn emulate_zone_mgmt_send_io_passthru(
+        &self,
+        nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd,
+        buffer: *mut c_void,
+        buffer_size: u64,
+        cb: IoCompletionCallback,
+        cb_arg: IoCompletionCallbackArg,
+    ) -> Result<(), CoreError> {
+        unsafe { buffer.write_bytes(0, buffer_size as usize) };
+
+        // Read relevant fields for a 'Zone Management Send' command, see 'NVMe Zoned Namespace Command Set Specification, Revision 1.1c'
+        // Bit 63:00 Dword11:Dword10 > Starting LBA
+        let mut slba = unsafe {
+            ((nvme_cmd.__bindgen_anon_2.cdw11 as u64) << 32)
+                | nvme_cmd.__bindgen_anon_1.cdw10 as u64
+        };
+
+        // Bit 07:00 Dword 13 > Zone Send Action
+        let zsa =
+            zone_send_action_to_bdev_zone_action(nvme_cmd.cdw13 as u8)?;
+
+        // Bit 08 Dword 13 > Select All
+        let select_all = nvme_cmd.cdw13 & (1 << 8) != 0;
+
+        if select_all {
+            slba = 0;
+        }
+
+        let ctx = alloc_bdev_io_ctx(
+            IoType::NvmeIo,
+            IoCtx {
+                device: self.device,
+                cb,
+                cb_arg,
+            },
+            0,
+            0,
+        )?;
+
+        let (desc, ch) = self.handle.io_tuple();
+
+        let num_zones = self.device.num_zones();
+        let zone_size = self.device.zone_size();
+
+        let mut result;
+        loop {
+            result = unsafe {
+                spdk_bdev_zone_management(
+                    desc,
+                    ch,
+                    slba,
+                    zsa,
+                    Some(bdev_io_completion),
+                    ctx as *mut c_void,
+                )
+            }
+            .to_result(|e| CoreError::NvmeIoPassthruDispatch {
+                source: Errno::from_i32(e),
+                opcode: nvme_cmd.opc(),
+            });
+            let continue_next_zone =
+                select_all && slba == num_zones * zone_size;
+            if !continue_next_zone || result.is_err() {
+                break result;
+            }
+            slba += zone_size;
+        }
+    }
+
+    fn emulate_zone_mgmt_recv_io_passthru(
+        &self,
+        nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd,
+        buffer: *mut c_void,
+        buffer_size: u64,
+        cb: IoCompletionCallback,
+        cb_arg: IoCompletionCallbackArg,
+    ) -> Result<(), CoreError> {
+        let ctx = alloc_bdev_io_ctx(
+            IoType::NvmeIo,
+            IoCtx {
+                device: self.device,
+                cb,
+                cb_arg,
+            },
+            0,
+            0,
+        )?;
+
+        let (desc, ch) = self.handle.io_tuple();
+
+        let size_of_spdk_bdev_zone_info =
+            mem::size_of::<spdk_bdev_zone_info>() as usize;
+
+        // Bit 63:00 Dword11:Dword10 > Starting LBA
+        let slba = unsafe {
+            ((nvme_cmd.__bindgen_anon_2.cdw11 as u64) << 32)
+                | nvme_cmd.__bindgen_anon_1.cdw10 as u64
+        };
+
+        // Bit 07:00 Dword13 > Zone Receive Action
+        let zra = nvme_cmd.cdw13 as u8;
+        if zra != 0x0u8 {
+            error!("Zone Management Receive 'Zone Receive Action' (cdw13) != 00h (Report Zones) not implemented");
+            return Err(CoreError::NvmeIoPassthruDispatch {
+                source: Errno::EOPNOTSUPP,
+                opcode: nvme_cmd.opc(),
+            });
+        }
+
+        // Bit 16 Dword13 > Partial Report
+        let partial_report = nvme_cmd.cdw13 & (1 << 16) != 0;
+        if !partial_report {
+            error!("Zone Management Receive 'Partial Report' (cdw13) == 0 not implemented");
+            return Err(CoreError::NvmeIoPassthruDispatch {
+                source: Errno::EOPNOTSUPP,
+                opcode: nvme_cmd.opc(),
+            });
+        }
+
+        // Bit 15:08 Dword13 > Reporting Options
+        let zra_report_opt = (nvme_cmd.cdw13 >> 8) as u8;
+
+        let max_num_zones = self.device.num_zones();
+        let zone_size = self.device.zone_size();
+        let zone_report_offset = slba / zone_size;
+        let max_num_zones_to_report = max_num_zones - zone_report_offset;
+
+        // Bit 31:00 Dword12 > Number of Dwords
+        let num_of_dwords = unsafe { nvme_cmd.__bindgen_anon_3.cdw12 } + 1;
+        if u64::from(((num_of_dwords * 4) - 64) / 64) < max_num_zones_to_report
+        {
+            error!("Zone Management Receive 'Number of Dwords' (cdw12) indicates too little space for the number of zones ({}) that will be reported.", max_num_zones_to_report);
+            return Err(CoreError::NvmeIoPassthruDispatch {
+                source: Errno::EOPNOTSUPP,
+                opcode: nvme_cmd.opc(),
+            });
+        }
+
+        let bdev_zone_infos;
+
+        let ret = unsafe {
+            bdev_zone_infos = calloc(
+                max_num_zones_to_report as usize,
+                size_of_spdk_bdev_zone_info,
+            );
+            spdk_bdev_get_zone_info(
+                desc,
+                ch,
+                slba,
+                max_num_zones_to_report,
+                bdev_zone_infos as *mut spdk_bdev_zone_info,
+                Some(bdev_io_completion),
+                ctx as *mut c_void,
+            )
+        }
+        .to_result(|e| CoreError::NvmeIoPassthruDispatch {
+            source: Errno::from_i32(e),
+            opcode: nvme_cmd.opc(),
+        });
+
+        // Populate the buffer with the 'Extended Report Zones Data Structure' of the 'NVMe Zoned Namespace Command Set Specification, Revision 1.1c'
+        unsafe { buffer.write_bytes(0, buffer_size as usize) };
+
+        if ret.is_err() {
+            unsafe { free(bdev_zone_infos) };
+            return ret;
+        }
+        // Bytes 07:00 > Number of Zones
+        // Deferred until we know how many zones we actually reported
+
+        // Bytes 63:08 > Reserved
+        let erzds_rsvd_offset: isize = 64;
+
+        // Bytes 127:64 and the following 64 * (max_num_zones - 1) bytes > Zone Descriptor
+        let zone_desc_size: isize = 64;
+
+        // Zone Descriptor Extension not needed
+        let zone_desc_ext_size: isize = 0;
+
+        let mut zone = 0u64;
+        let mut num_zones_reported = 0u64;
+
+        let bdev_zone_info_c_void =
+            unsafe { calloc(1, size_of_spdk_bdev_zone_info) };
+        loop {
+            if zone >= max_num_zones_to_report {
+                break;
+            }
+            unsafe {
+                // Fetch and cast the current zone info
+                std::ptr::copy_nonoverlapping(
+                    bdev_zone_infos.offset(
+                        (zone as usize * size_of_spdk_bdev_zone_info) as isize,
+                    ),
+                    bdev_zone_info_c_void,
+                    size_of_spdk_bdev_zone_info,
+                );
+                let bdev_zone_info: *mut spdk_bdev_zone_info =
+                    std::ptr::slice_from_raw_parts_mut(
+                        bdev_zone_info_c_void,
+                        size_of_spdk_bdev_zone_info,
+                    ) as _;
+
+                if !is_zra_list_matching_zone_state(
+                    zra_report_opt as u32,
+                    (*bdev_zone_info).state,
+                ) {
+                    zone += 1;
+                    continue;
+                }
+
+                // Byte 00 of Zone Descriptor > Zone Type (always sequential = 0x2u8)
+                let mut byte_offset: isize = 0;
+                let mut zt = 0x2u8;
+                std::ptr::copy_nonoverlapping(
+                    &mut zt as *mut _ as *mut c_void,
+                    buffer.offset(
+                        erzds_rsvd_offset
+                            + (zone as isize
+                                * (zone_desc_size + zone_desc_ext_size))
+                            + byte_offset,
+                    ),
+                    1,
+                );
+                byte_offset += 1;
+
+                // Byte 01, bits 7:4 > Zone State
+                let mut zs = bdev_zone_state_to_nvme_zns_zone_state(
+                    (*bdev_zone_info).state,
+                )? as u8;
+                zs = zs << 4;
+                std::ptr::copy_nonoverlapping(
+                    &mut zs as *mut _ as *mut c_void,
+                    buffer.offset(
+                        erzds_rsvd_offset
+                            + (zone as isize
+                                * (zone_desc_size + zone_desc_ext_size))
+                            + byte_offset,
+                    ),
+                    1,
+                );
+                byte_offset += 1;
+
+                //Byte 02 > Zone Attributes (always 0x0u8)
+                byte_offset += 1;
+
+                //Byte 03 > Zone Attributes Information (always 0x0u8)
+                byte_offset += 1;
+
+                //Byte 07:04 > Reserved (always 0x0u32)
+                byte_offset += 4;
+
+                //Byte 15:08 > Zone Capacity
+                let mut zcap = (*bdev_zone_info).capacity;
+                std::ptr::copy_nonoverlapping(
+                    &mut zcap as *mut _ as *mut c_void,
+                    buffer.offset(
+                        erzds_rsvd_offset
+                            + (zone as isize
+                                * (zone_desc_size + zone_desc_ext_size))
+                            + byte_offset,
+                    ),
+                    8,
+                );
+                byte_offset += 8;
+
+                //Byte 23:16 > Zone Start Logical Block Address
+                let mut zslba = (*bdev_zone_info).zone_id as u64;
+                std::ptr::copy_nonoverlapping(
+                    &mut zslba as *mut _ as *mut c_void,
+                    buffer.offset(
+                        erzds_rsvd_offset
+                            + (zone as isize
+                                * (zone_desc_size + zone_desc_ext_size))
+                            + byte_offset,
+                    ),
+                    8,
+                );
+                byte_offset += 8;
+
+                //Byte 31:24 > Write Pointer
+                let mut wp = (*bdev_zone_info).write_pointer as u64;
+                std::ptr::copy_nonoverlapping(
+                    &mut wp as *mut _ as *mut c_void,
+                    buffer.offset(
+                        erzds_rsvd_offset
+                            + (zone as isize
+                                * (zone_desc_size + zone_desc_ext_size))
+                            + byte_offset,
+                    ),
+                    8,
+                );
+                //byte_offset += 8;
+
+                // Byte 32:63 > Reserved
+                zone += 1;
+                num_zones_reported += 1;
+            }
+        }
+
+        // Bytes 07:00 > Number of Zones
+        unsafe {
+            std::ptr::copy_nonoverlapping(
+                &mut num_zones_reported as *mut _ as *mut c_void,
+                buffer,
+                mem::size_of::<u64>() as usize,
+            );
+        }
+
+        unsafe {
+            free(bdev_zone_info_c_void);
+            free(bdev_zone_infos);
+        }
+        ret
+    }
+
+    fn submit_io_passthru(
+        &self,
+        nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd,
+        buffer: *mut c_void,
+        buffer_size: u64,
+        cb: IoCompletionCallback,
+        cb_arg: IoCompletionCallbackArg,
+    ) -> Result<(), CoreError> {
+        let ctx = alloc_bdev_io_ctx(
+            IoType::NvmeIo,
+            IoCtx {
+                device: self.device,
+                cb,
+                cb_arg,
+            },
+            0,
+            0,
+        )?;
+
+        let (desc, ch) = self.handle.io_tuple();
+
+        unsafe {
+            spdk_bdev_nvme_io_passthru(
+                desc,
+                ch,
+                nvme_cmd,
+                buffer,
+                buffer_size,
+                Some(bdev_io_completion),
+                ctx as *mut c_void,
+            )
+        }
+        .to_result(|e| CoreError::NvmeIoPassthruDispatch {
+            source: Errno::from_i32(e),
+            opcode: nvme_cmd.opc(),
+        })
+    }
+
     // NVMe commands are not applicable for non-NVMe devices.
     async fn create_snapshot(
         &self,
@@ -695,14 +1188,10 @@ pub fn io_type_to_err(
             offset,
             len,
         },
-        IoType::Reset => CoreError::ResetDispatch {
-            source,
-        },
+        IoType::Reset => CoreError::ResetDispatch { source },
         _ => {
             warn!("Unsupported I/O operation: {:?}", op);
-            CoreError::NotSupported {
-                source,
-            }
+            CoreError::NotSupported { source }
         }
     }
 }
diff --git a/io-engine/src/bdev/nexus/mod.rs b/io-engine/src/bdev/nexus/mod.rs
index 4f8624cd0..9e511cfa7 100644
--- a/io-engine/src/bdev/nexus/mod.rs
+++ b/io-engine/src/bdev/nexus/mod.rs
@@ -8,7 +8,7 @@ use futures::{future::Future, FutureExt};

 mod nexus_bdev;
 mod nexus_bdev_children;
-mod nexus_bdev_error;
+pub mod nexus_bdev_error;
 mod nexus_bdev_rebuild;
 mod nexus_bdev_snapshot;
 mod nexus_channel;
diff --git a/io-engine/src/bdev/nexus/nexus_bdev.rs b/io-engine/src/bdev/nexus/nexus_bdev.rs
index 0ac9526a0..53a0f2416 100644
--- a/io-engine/src/bdev/nexus/nexus_bdev.rs
+++ b/io-engine/src/bdev/nexus/nexus_bdev.rs
@@ -38,7 +38,9 @@ use super::{

 use crate::{
     bdev::{
+        device_create,
         device_destroy,
+        device_lookup,
         nexus::{
             nexus_io_subsystem::NexusPauseState,
             nexus_persistence::PersistentNexusInfo,
@@ -73,11 +75,13 @@ use spdk_rs::{
     libspdk::spdk_bdev_notify_blockcnt_change,
     BdevIo,
     BdevOps,
+    BdevZoneInfo,
     ChannelTraverseStatus,
     IoChannel,
     IoDevice,
     IoDeviceChannelTraverse,
     JsonWriteContext,
+    libspdk::spdk_bdev_is_zoned,
 };

 pub static NVME_MIN_CNTLID: u16 = 1;
@@ -370,6 +374,7 @@ impl<'n> Nexus<'n> {
         nexus_uuid: Option<uuid::Uuid>,
         nvme_params: NexusNvmeParams,
         nexus_info_key: Option<String>,
+        bdev_zone_info: BdevZoneInfo,
     ) -> spdk_rs::Bdev<Nexus<'n>> {
         let n = Nexus {
             name: name.to_string(),
@@ -403,6 +408,7 @@
             .with_block_count(0)
             .with_required_alignment(9)
             .with_data(n)
+            .with_zoned_info(bdev_zone_info)
             .build();

         unsafe {
@@ -573,6 +579,11 @@ impl<'n> Nexus<'n> {
         unsafe { self.bdev().required_alignment() }
     }

+    /// Check if the bdev is a zoned block device (ZBD)
+    pub fn is_zoned(&self) -> bool {
+        unsafe { spdk_bdev_is_zoned(self.bdev().unsafe_inner_ptr()) }
+    }
+
     /// TODO
     pub fn children(&self) -> &Vec<NexusChild<'n>> {
         &self.children
@@ -772,6 +783,13 @@
                 })
             }
         }
+
+        if dev.is_zoned() {
+            //TODO: Implement partitioning zoned block devices. This requires handling drive resources like max active/open zones.
+            warn!("The device '{}' is zoned. Partitioning zoned block devices into smaller devices is not implemented. Using the whole device.", dev.device_name());
+            start_blk = 0;
+            end_blk = nb;
+        }
     }

     unsafe {
@@ -1428,7 +1446,12 @@ impl<'n> BdevOps for Nexus<'n> {
             IoType::Flush
             | IoType::Reset
             | IoType::Unmap
-            | IoType::WriteZeros => {
+            | IoType::WriteZeros
+            | IoType::ZoneAppend
+            | IoType::ZoneInfo
+            | IoType::ZoneManagement
+            | IoType::NvmeIo
+            | IoType::ZeroCopy => {
                 let supported = self.io_is_supported(io_type);
                 if !supported {
                     if io_type == IoType::Flush {
@@ -1572,6 +1595,84 @@ pub async fn nexus_create_v2(
     }
 }

+async fn prepare_nexus_zone_info_from_children(
+    children_devices: &mut Vec<(String, String)>,
+    nexus_name: &str,
+) -> Result<BdevZoneInfo, Error> {
+    // Whether we find non-zoned block devices
+    let mut found_conventional = false;
+    let mut nexus_zone_info: Option<BdevZoneInfo> = None;
+
+    for (_uri, device_name) in &*children_devices {
+        let dev = device_lookup(&device_name).ok_or(Error::ChildMissing {
+            child: device_name.clone(),
+            name: nexus_name.to_string(),
+        })?;
+        if dev.is_zoned() {
+            if let Some(nexus_zone_info) = nexus_zone_info {
+                if nexus_zone_info != dev.bdev_zone_info() {
+                    error!("Cannot use ZBDs with different parameters as nexus children");
+                    return Err(Error::MixedZonedChild {
+                        child: device_name.to_string(),
+                    });
+                }
+            } else {
+                nexus_zone_info = Some(dev.bdev_zone_info().clone());
+            }
+        } else {
+            found_conventional = true;
+        }
+
+        if nexus_zone_info.is_some() && found_conventional {
+            error!("{nexus_name} - cannot handle conventional and zoned storage at the same time in a nexus");
+            return Err(Error::MixedZonedChild {
+                child: device_name.to_string(),
+            });
+        }
+    }
+
+    if let Some(nexus_zone_info) = nexus_zone_info {
+        return Ok(nexus_zone_info);
+    }
+
+    // For conventional devices return the default BdevZoneInfo where `zoned == false`
+    Ok(BdevZoneInfo::default())
+}
+
+async fn destroy_created_devices(devices: &[(String, String)]) {
+    for (uri, _) in devices {
+        if let Err(e) = device_destroy(uri).await {
+            error!("Destroying the device with {uri} was not successful. This device is dangling now. Error: {e:?}");
+        }
+    }
+}
+
+async fn create_children_devices(
+    children: &[String],
+) -> Result<Vec<(String, String)>, Error> {
+    let mut children_devices = Vec::new();
+
+    for uri in children {
+        let device_name = match device_create(uri).await {
+            Ok(d) => d,
+            Err(e) => {
+                destroy_created_devices(&children_devices).await;
+                return Err(e).context(nexus_err::CreateChild { name: uri });
+            }
+        };
+
+        children_devices.push((uri.clone(), device_name.clone()));
+
+        if device_lookup(&device_name).unwrap().is_zoned() && children.len() > 1
+        {
+            destroy_created_devices(&children_devices).await;
+            return Err(Error::ZonedReplicationNotImplemented {});
+        }
+    }
+
+    Ok(children_devices)
+}
+
 async fn nexus_create_internal(
     name: &str,
     size: u64,
@@ -1607,6 +1708,28 @@
         return Ok(());
     }

+    let mut children_devices = create_children_devices(children).await?;
+
+    let nexus_zone_info = match prepare_nexus_zone_info_from_children(
+        &mut children_devices,
+        name,
+    )
+    .await
+    {
+        Err(e) => {
+            destroy_created_devices(&children_devices).await;
+            return Err(e);
+        }
+        Ok(nexus_zone_info) => nexus_zone_info,
+    };
+
+    if nexus_zone_info.zoned {
+        info!(
+            "The Nexus will be zoned with the properties {:?}",
+            nexus_zone_info
+        );
+    }
+
     // Create a new Nexus object, and immediately add it to the global list.
     // This is necessary to ensure proper cleanup, as the code responsible for
     // closing a child assumes that the nexus to which it belongs will appear
@@ -1619,10 +1742,12 @@
         nexus_uuid,
         nvme_params,
         nexus_info_key,
+        nexus_zone_info,
     );

-    for uri in children {
-        if let Err(error) = nexus_bdev.data_mut().new_child(uri).await {
+
+    for (uri, device_name) in &children_devices {
+        if let Err(error) = nexus_bdev.data_mut().new_child(&uri, &device_name).await {
             error!(
                 "{n:?}: failed to add child '{uri}': {e}",
                 n = nexus_bdev.data(),
                 e = error
             );
@@ -1636,6 +1761,8 @@
                 uri
             );

+            destroy_created_devices(&children_devices).await;
+
             return Err(Error::CreateChild {
                 source: error,
                 name: name.to_owned(),
diff --git a/io-engine/src/bdev/nexus/nexus_bdev_children.rs b/io-engine/src/bdev/nexus/nexus_bdev_children.rs
index 2e5eca333..fd6a3381a 100644
--- a/io-engine/src/bdev/nexus/nexus_bdev_children.rs
+++ b/io-engine/src/bdev/nexus/nexus_bdev_children.rs
@@ -74,19 +74,20 @@ impl<'n> Nexus<'n> {
     /// nexus init phase
     pub async fn new_child(
         mut self: Pin<&mut Self>,
-        uri: &str,
+        device_uri: &str,
+        device_name: &str,
     ) -> Result<(), BdevError> {
         assert_eq!(*self.state.lock(), NexusState::Init);

-        info!("{:?}: adding child: '{}'...", self, uri);
+        info!("{:?}: adding child: '{}'...", self, device_uri);

         let nexus_name = self.nexus_name().to_owned();

-        let device_name = device_create(uri).await?;
+        let dev = device_lookup(device_name);

         let c = NexusChild::new(
-            uri.to_string(),
+            device_uri.to_string(),
             nexus_name,
-            device_lookup(&device_name),
+            dev,
         );

         info!("{:?}: added to nexus", c);
diff --git a/io-engine/src/bdev/nexus/nexus_bdev_error.rs b/io-engine/src/bdev/nexus/nexus_bdev_error.rs
index ca25065c8..1f3531b6b 100644
--- a/io-engine/src/bdev/nexus/nexus_bdev_error.rs
+++ b/io-engine/src/bdev/nexus/nexus_bdev_error.rs
@@ -15,7 +15,7 @@ use crate::{
 /// Common errors for nexus basic operations and child operations
 /// which are part of nexus object.
 #[derive(Debug, Snafu)]
-#[snafu(visibility(pub(crate)), context(suffix(false)), module(nexus_err))]
+#[snafu(visibility(pub), context(suffix(false)), module(nexus_err))]
 pub enum Error {
     #[snafu(display("Nexus {} does not exist", name))]
     NexusNotFound { name: String },
@@ -84,6 +84,8 @@ pub enum Error {
     },
     #[snafu(display("Children of nexus {} have mixed block sizes", name))]
     MixedBlockSizes { name: String },
+    #[snafu(display("Child {} is incompatible with its (zoned) siblings", child))]
+    MixedZonedChild { child: String },
     #[snafu(display(
         "Child {} of nexus {} has incompatible size or block size",
         child,
@@ -222,6 +224,8 @@ pub enum Error {
     UpdateShareProperties { source: CoreError, name: String },
     #[snafu(display("failed to save nexus state {}", name))]
     SaveStateFailed { source: StoreError, name: String },
+    #[snafu(display("Replication for zoned storage is not implemented. Consider adding a single zoned storage device to the nexus"))]
+    ZonedReplicationNotImplemented,
 }

 impl From for Error {
diff --git a/io-engine/src/bdev/nexus/nexus_io.rs b/io-engine/src/bdev/nexus/nexus_io.rs
index 317f18200..aad02ee47 100644
--- a/io-engine/src/bdev/nexus/nexus_io.rs
+++ b/io-engine/src/bdev/nexus/nexus_io.rs
@@ -12,6 +12,7 @@ use spdk_rs::{
     spdk_bdev_io,
     spdk_bdev_io_complete_nvme_status,
     spdk_io_channel,
+    spdk_nvme_cmd,
     SPDK_NVME_SC_ABORTED_SQ_DELETION,
     SPDK_NVME_SC_CAPACITY_EXCEEDED,
     SPDK_NVME_SC_INVALID_OPCODE,
@@ -28,11 +29,13 @@ use crate::core::{
     CoreError,
     Cores,
     IoCompletionStatus,
+    is_zoned_nvme_error,
     IoStatus,
     IoSubmissionFailure,
     IoType,
     LvolFailure,
     Mthread,
+    NvmeCmdOpc,
     NvmeStatus,
     ReadOptions,
 };
@@ -189,9 +192,17 @@ impl<'n> NexusBio<'n> {
             // these IOs are submitted to all the underlying children
             IoType::Write
             | IoType::WriteZeros
+            | IoType::NvmeIo
             | IoType::Reset
             | IoType::Unmap
             | IoType::Flush => self.submit_all(),
+            IoType::ZoneAppend => {
+                warn!("{self:?} - ZoneAppend is explicitly disallowed, otherwise reading from different replicas won't work.");
+                self.fail();
+                Err(CoreError::NotSupported {
+                    source: Errno::EOPNOTSUPP,
+                })
+            }
             IoType::NvmeAdmin => {
                 self.fail();
                 Err(CoreError::NotSupported {
@@ -261,9 +272,12 @@ impl<'n> NexusBio<'n> {
             self.ctx_mut().successful += 1;
         } else {
             self.ctx_mut().status = IoStatus::Failed;
-            self.ctx_mut().failed += 1;

-            self.completion_error(child, status);
+            // Don't take zoned child out on zoned related nvme errors
+            if !is_zoned_nvme_error(status) {
+                self.ctx_mut().failed += 1;
+                self.completion_error(child, status);
+            }
         }

         if self.ctx().in_flight > 0 {
@@ -502,6 +516,61 @@ impl<'n> NexusBio<'n> {
         }
     }

+    #[inline]
+    fn submit_io_passthru(
+        &self,
+        hdl: &dyn BlockDeviceHandle,
+    ) -> Result<(), CoreError> {
+        let orig_nvme_cmd = self.nvme_cmd();
+        let buffer = self.nvme_buf();
+        let buffer_size = self.nvme_nbytes();
+
+        let mut passthru_nvme_cmd = spdk_nvme_cmd::default();
+        passthru_nvme_cmd.set_opc(orig_nvme_cmd.opc());
+        unsafe {
+            passthru_nvme_cmd.__bindgen_anon_1.cdw10 = orig_nvme_cmd.__bindgen_anon_1.cdw10;
+            passthru_nvme_cmd.__bindgen_anon_2.cdw11 = orig_nvme_cmd.__bindgen_anon_2.cdw11;
+            passthru_nvme_cmd.__bindgen_anon_3.cdw12 = orig_nvme_cmd.__bindgen_anon_3.cdw12;
+        }
+        passthru_nvme_cmd.cdw13 = orig_nvme_cmd.cdw13;
+        passthru_nvme_cmd.cdw14 = orig_nvme_cmd.cdw14;
+        passthru_nvme_cmd.cdw15 = orig_nvme_cmd.cdw15;
+
+        if hdl.get_device().io_type_supported_by_device(self.io_type()) {
+            return hdl.submit_io_passthru(
+                &passthru_nvme_cmd,
+                buffer,
+                buffer_size,
+                Self::child_completion,
+                self.as_ptr().cast(),
+            );
+        } else {
+            let opc = orig_nvme_cmd.opc();
+            match opc {
+                // Zone Management Send
+                opc if opc == NvmeCmdOpc::ZoneMgmtSend as u16 => return hdl.emulate_zone_mgmt_send_io_passthru(
+                    &passthru_nvme_cmd,
+                    buffer,
+                    buffer_size,
+                    Self::child_completion,
+                    self.as_ptr().cast(),
+                ),
+                // Zone Management Receive
+                opc if opc == NvmeCmdOpc::ZoneMgmtReceive as u16 => return hdl.emulate_zone_mgmt_recv_io_passthru(
+                    &passthru_nvme_cmd,
+                    buffer,
+                    buffer_size,
+                    Self::child_completion,
+                    self.as_ptr().cast(),
+                ),
+                _ => return Err(CoreError::NvmeIoPassthruDispatch {
+                    source: Errno::EOPNOTSUPP,
+                    opcode: opc,
+                }),
+            }
+        }
+    }
+
     #[inline]
     fn submit_write(
         &self,
@@ -606,6 +675,7 @@ impl<'n> NexusBio<'n> {
             IoType::WriteZeros => self.submit_write_zeroes(h),
             IoType::Reset => self.submit_reset(h),
             IoType::Flush => self.submit_flush(h),
+            IoType::NvmeIo => self.submit_io_passthru(h),
             // we should never reach here, if we do it is a bug.
             _ => unreachable!(),
         }
diff --git a/io-engine/src/bdev/null_ng.rs b/io-engine/src/bdev/null_ng.rs
index a09970a57..3ca14bf6c 100644
--- a/io-engine/src/bdev/null_ng.rs
+++ b/io-engine/src/bdev/null_ng.rs
@@ -6,6 +6,7 @@ use spdk_rs::{
     BdevModule,
     BdevModuleBuild,
     BdevOps,
+    BdevZoneInfo,
     IoChannel,
     IoDevice,
     IoType,
@@ -142,6 +143,7 @@ impl<'a> NullIoDevice<'a> {
             .with_block_length(1 << 12)
             .with_block_count(1 << 20)
             .with_required_alignment(12)
+            .with_zoned_info(BdevZoneInfo::default())
             .build();

         bdev.data().register_io_device(Some(name));
diff --git a/io-engine/src/bdev/nvmx/device.rs b/io-engine/src/bdev/nvmx/device.rs
index 7ff985e01..ffa459138 100644
--- a/io-engine/src/bdev/nvmx/device.rs
+++ b/io-engine/src/bdev/nvmx/device.rs
@@ -24,6 +24,7 @@ use crate::{
         DeviceIoController,
         DeviceTimeoutAction,
         IoType,
+        ZonedBlockDevice,
     },
     ffihelper::{cb_arg, done_cb},
 };
@@ -204,10 +205,17 @@ impl BlockDevice for NvmeBlockDevice {
             IoType::Unmap => self.ns.supports_deallocate(),
             IoType::WriteZeros => self.ns.supports_write_zeroes(),
             IoType::CompareAndWrite => false,
+            IoType::ZoneAppend | IoType::ZoneInfo | IoType::ZoneManagement => {
+                true
+            }
             _ => false,
         }
     }

+    fn io_type_supported_by_device(&self, io_type: IoType) -> bool {
+        self.io_type_supported(io_type)
+    }
+
     async fn io_stats(&self) -> Result<BlockDeviceIoStats, CoreError> {
         let carc = NVME_CONTROLLERS.lookup_by_name(&self.name).ok_or(
             CoreError::BdevNotFound {
@@ -257,6 +265,37 @@ impl BlockDevice for NvmeBlockDevice {
     }
 }

+#[async_trait(?Send)]
+impl ZonedBlockDevice for NvmeBlockDevice {
+    fn is_zoned(&self) -> bool {
+        self.ns.is_zoned()
+    }
+
+    fn zone_size(&self) -> u64 {
+        self.ns.zone_size()
+    }
+
+    fn num_zones(&self) -> u64 {
+        self.ns.num_zones()
+    }
+
+    fn max_zone_append_size(&self) -> u32 {
+        self.ns.max_zone_append_size()
+    }
+
+    fn max_open_zones(&self) -> u32 {
+        self.ns.max_open_zones()
+    }
+
+    fn max_active_zones(&self) -> u32 {
+        self.ns.max_active_zones()
+    }
+
+    fn optimal_open_zones(&self) -> u32 {
+        self.ns.optimal_open_zones()
+    }
+}
+
 struct NvmeDeviceIoController {
     name: String,
 }
diff --git a/io-engine/src/bdev/nvmx/namespace.rs b/io-engine/src/bdev/nvmx/namespace.rs
index 8123d6a4c..8c51886d6 100644
--- a/io-engine/src/bdev/nvmx/namespace.rs
+++ b/io-engine/src/bdev/nvmx/namespace.rs
@@ -10,6 +10,13 @@ use spdk_rs::libspdk::{
     spdk_nvme_ns_get_size,
     spdk_nvme_ns_get_uuid,
     spdk_nvme_ns_supports_compare,
+    spdk_nvme_zns_ns_get_data,
+    spdk_nvme_zns_ns_get_zone_size,
+    spdk_nvme_zns_ns_get_num_zones,
+    spdk_nvme_ns_get_ctrlr,
+    spdk_nvme_zns_ctrlr_get_max_zone_append_size,
+    spdk_nvme_zns_ns_get_max_open_zones,
+    spdk_nvme_zns_ns_get_max_active_zones,
     SPDK_NVME_NS_DEALLOCATE_SUPPORTED,
     SPDK_NVME_NS_WRITE_ZEROES_SUPPORTED,
 };
@@ -78,4 +85,36 @@ impl NvmeNamespace {
     pub fn as_ptr(&self) -> *mut spdk_nvme_ns {
         self.0.as_ptr()
     }
+
+    pub fn is_zoned(&self) -> bool {
+        unsafe { !spdk_nvme_zns_ns_get_data(self.0.as_ptr()).is_null() }
+    }
+
+    pub fn zone_size(&self) -> u64 {
+        unsafe { spdk_nvme_zns_ns_get_zone_size(self.0.as_ptr()) }
+    }
+
+    pub fn num_zones(&self) -> u64 {
+        unsafe { spdk_nvme_zns_ns_get_num_zones(self.0.as_ptr()) }
+    }
+
+    pub fn max_zone_append_size(&self) -> u32 {
+        unsafe {
+            spdk_nvme_zns_ctrlr_get_max_zone_append_size(
+                spdk_nvme_ns_get_ctrlr(self.0.as_ptr()),
+            )
+        }
+    }
+
+    pub fn max_open_zones(&self) -> u32 {
+        unsafe { spdk_nvme_zns_ns_get_max_open_zones(self.0.as_ptr()) }
+    }
+
+    pub fn max_active_zones(&self) -> u32 {
+        unsafe {
+            spdk_nvme_zns_ns_get_max_active_zones(self.0.as_ptr())
+        }
+    }
+
+    pub fn optimal_open_zones(&self) -> u32 {
+        self.max_open_zones()
+    }
 }
diff --git a/io-engine/src/core/block_device.rs b/io-engine/src/core/block_device.rs
index 225e7178f..f06cb1de0 100644
--- a/io-engine/src/core/block_device.rs
+++ b/io-engine/src/core/block_device.rs
@@ -6,7 +6,7 @@ use super::{
     SnapshotParams,
 };

-use spdk_rs::{DmaBuf, DmaError, IoVec};
+use spdk_rs::{BdevZoneInfo, DmaBuf, DmaError, IoVec};

 use async_trait::async_trait;
 use futures::channel::oneshot;
@@ -56,7 +56,7 @@ pub struct BlockDeviceIoStats {
 /// Core trait that represents a block device.
 /// TODO: Add text.
 #[async_trait(?Send)]
-pub trait BlockDevice {
+pub trait BlockDevice: ZonedBlockDevice {
     /// Returns total size in bytes of the device.
     fn size_in_bytes(&self) -> u64;

@@ -81,9 +81,12 @@ pub trait BlockDevice {
     /// Returns aligment of the device.
     fn alignment(&self) -> u64;

-    /// Checks whether target I/O type is supported by the device.
+    /// Checks whether target I/O type is supported by the device or storage stack.
     fn io_type_supported(&self, io_type: IoType) -> bool;

+    /// Checks whether target I/O type is supported by the device.
+    fn io_type_supported_by_device(&self, io_type: IoType) -> bool;
+
     /// Obtains I/O statistics for the device.
     async fn io_stats(&self) -> Result<BlockDeviceIoStats, CoreError>;

@@ -103,6 +106,58 @@ pub trait BlockDevice {
     ) -> Result<(), CoreError>;
 }

+/// Trait to represent zoned storage related fields for zoned block devices.
+#[async_trait(?Send)]
+pub trait ZonedBlockDevice {
+    /// Returns whether the device to which this zone info is linked is a
+    /// zoned block device (ZBD) or not. If true, the following fields are
+    /// also relevant.
+    fn is_zoned(&self) -> bool;
+
+    /// Returns the size of each zone (in blocks). Typically aligned to a power of 2.
+    /// In SPDK the actual writable zone capacity has to be queried for each
+    /// individual zone through a zone report.
+    /// zone_capacity <= zone_size.
+    /// zone_capacity * num_zones = device capacity
+    fn zone_size(&self) -> u64;
+
+    /// Returns the number of zones available on the device.
+    fn num_zones(&self) -> u64;
+
+    /// Returns the maximum data transfer size for a single zone append command (in blocks).
+    /// Normal (seq) writes must respect the device's general max transfer size.
+    fn max_zone_append_size(&self) -> u32;
+
+    /// Returns the maximum number of open zones for a given device.
+    /// This essentially limits the number of zones that can be written to in parallel.
+    /// Refer to the NVMe ZNS specification (Figure 7 Zone State Machine) for more details.
+    /// https://nvmexpress.org/wp-content/uploads/NVM-Express-Zoned-Namespace-Command-Set-Specification-1.1d-2023.12.28-Ratified.pdf
+    fn max_open_zones(&self) -> u32;
+
+    /// Returns the maximum number of active zones for a given device.
+    /// max_open_zones is a subset of max_active_zones. Closed zones are still active until they
+    /// get finished (finished zones are in effect immutable until reset).
+    /// Refer to the NVMe ZNS specification (Figure 7 Zone State Machine) for more details.
+    /// https://nvmexpress.org/wp-content/uploads/NVM-Express-Zoned-Namespace-Command-Set-Specification-1.1d-2023.12.28-Ratified.pdf
+    fn max_active_zones(&self) -> u32;
+
+    /// Returns the drive's preferred number of open zones.
+    fn optimal_open_zones(&self) -> u32;
+
+    /// Returns all zoned storage relevant fields in a condensed BdevZoneInfo struct.
+    fn bdev_zone_info(&self) -> BdevZoneInfo {
+        BdevZoneInfo {
+            zoned: self.is_zoned(),
+            zone_size: self.zone_size(),
+            num_zones: self.num_zones(),
+            max_zone_append_size: self.max_zone_append_size(),
+            max_open_zones: self.max_open_zones(),
+            max_active_zones: self.max_active_zones(),
+            optimal_open_zones: self.optimal_open_zones(),
+        }
+    }
+}
+
 /// Core trait that represents a descriptor for an opened block device.
 /// TODO: Add text.
 #[async_trait(?Send)]
@@ -406,8 +461,73 @@ pub trait BlockDeviceHandle {
         cb_arg: IoCompletionCallbackArg,
     ) -> Result<(), CoreError>;

+    /// Emulates the zone management send NvmeIo command for devices that do not support this
+    /// command natively.
+    ///
+    /// * `nvme_cmd` - The nvme command to emulate.
+    /// * `_buffer` - The data buffer for the nvme command.
+    /// * `_buffer_size` - The size of the data buffer for the nvme command.
+    /// * `_cb` - The completion callback function for the nvme command.
+    /// * `_cb_arg` - The completion callback function arguments.
+    fn emulate_zone_mgmt_send_io_passthru(
+        &self,
+        nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd,
+        _buffer: *mut c_void,
+        _buffer_size: u64,
+        _cb: IoCompletionCallback,
+        _cb_arg: IoCompletionCallbackArg,
+    ) -> Result<(), CoreError> {
+        Err(CoreError::NvmeIoPassthruDispatch {
+            source: Errno::EOPNOTSUPP,
+            opcode: nvme_cmd.opc(),
+        })
+    }
+
+    /// Emulates the zone management receive NvmeIo command for devices that do not support this
+    /// command natively.
+    ///
+    /// * `nvme_cmd` - The nvme command to emulate.
+    /// * `_buffer` - The data buffer for the nvme command.
+    /// * `_buffer_size` - The size of the data buffer for the nvme command.
+    /// * `_cb` - The completion callback function for the nvme command.
+    /// * `_cb_arg` - The completion callback function arguments.
+    fn emulate_zone_mgmt_recv_io_passthru(
+        &self,
+        nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd,
+        _buffer: *mut c_void,
+        _buffer_size: u64,
+        _cb: IoCompletionCallback,
+        _cb_arg: IoCompletionCallbackArg,
+    ) -> Result<(), CoreError> {
+        Err(CoreError::NvmeIoPassthruDispatch {
+            source: Errno::EOPNOTSUPP,
+            opcode: nvme_cmd.opc(),
+        })
+    }
+
+    // NVMe only.
+    /// Submits an NVMe IO Passthrough command to the device.
+    ///
+    /// * `nvme_cmd` - The nvme command to submit.
+    /// * `_buffer` - The data buffer for the nvme command.
+    /// * `_buffer_size` - The size of the data buffer for the nvme command.
+    /// * `_cb` - The completion callback function for the nvme command.
+    /// * `_cb_arg` - The completion callback function arguments.
+    fn submit_io_passthru(
+        &self,
+        nvme_cmd: &spdk_rs::libspdk::spdk_nvme_cmd,
+        _buffer: *mut c_void,
+        _buffer_size: u64,
+        _cb: IoCompletionCallback,
+        _cb_arg: IoCompletionCallbackArg,
+    ) -> Result<(), CoreError> {
+        Err(CoreError::NvmeIoPassthruDispatch {
+            source: Errno::EOPNOTSUPP,
+            opcode: nvme_cmd.opc(),
+        })
+    }
+
     /// TODO
     async fn nvme_admin_custom(&self, opcode: u8) -> Result<(), CoreError>;
diff --git a/io-engine/src/core/env.rs b/io-engine/src/core/env.rs
index f76a08548..100697721 100644
--- a/io-engine/src/core/env.rs
+++ b/io-engine/src/core/env.rs
@@ -283,7 +283,7 @@ impl Default for MayastorCliArgs {
             reactor_mask: "0x1".into(),
             mem_size: 0,
             rpc_address: "/var/tmp/mayastor.sock".to_string(),
-            no_pci: true,
+            no_pci: false,
             log_components: vec![],
             log_format: None,
             mayastor_config: None,
@@ -328,6 +328,11 @@ pub static SIG_RECEIVED: Lazy =

 // FFI functions that are needed to initialize the environment
 extern "C" {
     pub fn rte_eal_init(argc: i32, argv: *mut *mut libc::c_char) -> i32;
+    pub fn spdk_trace_init(shm_name: *const c_char, num_entries: u64) -> i32;
+    pub fn spdk_trace_set_tpoints(group_id: u32, tpoint_mask: u64);
+    pub fn spdk_trace_create_tpoint_group_mask(
+        group_name: *const c_char,
+    ) -> u64;
     pub fn spdk_trace_cleanup();
     pub fn spdk_env_dpdk_post_init(legacy_mem: bool) -> i32;
     pub fn spdk_env_fini();
@@ -941,6 +946,39 @@ impl MayastorEnvironment {
         None
     }

+    fn init_spdk_tracing(&self) {
+        const MAX_GROUP_IDS: u32 = 16;
+        let cshm_name = if self.shm_id >= 0 {
+            CString::new(
+                format!("/{}_trace.{}", self.name, self.shm_id).as_str(),
+            )
+            .unwrap()
+        } else {
+            CString::new(
+                format!("/{}_trace.pid{}", self.name, std::process::id())
+                    .as_str(),
+            )
+            .unwrap()
+        };
+        unsafe {
+            if spdk_trace_init(cshm_name.as_ptr(), self.num_entries) != 0 {
+                error!("SPDK tracing init error");
+            }
+        }
+        let tpoint_group_name = CString::new("all").unwrap();
+        let tpoint_group_mask = unsafe {
+            spdk_trace_create_tpoint_group_mask(tpoint_group_name.as_ptr())
+        };
+
+        for group_id in 0..MAX_GROUP_IDS {
+            if (tpoint_group_mask & (1 << group_id) as u64) > 0 {
+                unsafe {
+                    spdk_trace_set_tpoints(group_id, u64::MAX);
+                }
+            }
+        }
+    }
+
     /// initialize the core, call this before all else
     pub fn init(mut self) -> Self {
         // setup the logger as soon as possible
@@ -1009,6 +1047,11 @@ impl MayastorEnvironment {
         // ensure we are within the context of a spdk thread from here
         Mthread::primary().set_current();

+        // To enable SPDK tracing, set self.num_entries (e.g. to 32768).
+        if self.num_entries > 0 {
+            self.init_spdk_tracing();
+        }
+
         Reactor::block_on(async {
             let (sender, receiver) = oneshot::channel::();
diff --git a/io-engine/src/core/mod.rs b/io-engine/src/core/mod.rs
index a225c5ec5..2bba3ba56 100644
--- a/io-engine/src/core/mod.rs
+++ b/io-engine/src/core/mod.rs
@@ -22,6 +22,7 @@ pub use block_device::{
     OpCompletionCallback,
     OpCompletionCallbackArg,
     ReadOptions,
+    ZonedBlockDevice,
 };
 pub use cpu_cores::{Core, Cores};
 pub use descriptor::{DescriptorGuard, UntypedDescriptorGuard};
@@ -85,7 +86,17 @@ pub use snapshot::{
     SnapshotXattrs,
 };

-use spdk_rs::libspdk::SPDK_NVME_SC_CAPACITY_EXCEEDED;
+use spdk_rs::libspdk::{
+    SPDK_NVME_SC_CAPACITY_EXCEEDED,
+    SPDK_NVME_SC_ZONED_BOUNDARY_ERROR,
+    SPDK_NVME_SC_ZONE_IS_FULL,
+    SPDK_NVME_SC_ZONE_IS_READ_ONLY,
+    SPDK_NVME_SC_ZONE_IS_OFFLINE,
+    SPDK_NVME_SC_ZONE_INVALID_WRITE,
+    SPDK_NVME_SC_TOO_MANY_ACTIVE_ZONES,
+    SPDK_NVME_SC_TOO_MANY_OPEN_ZONES,
+    SPDK_NVME_SC_INVALID_ZONE_STATE_TRANSITION,
+};

 mod bdev;
 mod block_device;
@@ -477,6 +488,16 @@ pub enum IoSubmissionFailure {
     Write,
 }

+/// Supported NVMe command passthrough opcodes
+#[derive(Debug, Copy, Clone, Eq, PartialOrd, PartialEq)]
+#[repr(u16)]
+pub enum NvmeCmdOpc {
+    // Zone Management Send opcode: 79h = 121
+    ZoneMgmtSend = 121,
+    // Zone Management Receive opcode: 7Ah = 122
+    ZoneMgmtReceive = 122,
+}
+
 // Generic I/O completion status for block devices, which supports per-protocol
 // error domains.
 #[derive(Copy, Clone, Eq, PartialOrd, PartialEq)]
@@ -517,6 +538,24 @@ impl From for IoCompletionStatus {
     }
 }

+/// Returns true if the given IoCompletionStatus NvmeError can be matched to a Zoned Namespace Command Specific Status Code
+pub fn is_zoned_nvme_error(status: IoCompletionStatus) -> bool {
+    match status {
+        IoCompletionStatus::NvmeError(NvmeStatus::CmdSpecific(cssc)) => match cssc {
+            SPDK_NVME_SC_ZONED_BOUNDARY_ERROR |
+            SPDK_NVME_SC_ZONE_IS_FULL |
+            SPDK_NVME_SC_ZONE_IS_READ_ONLY |
+            SPDK_NVME_SC_ZONE_IS_OFFLINE |
+            SPDK_NVME_SC_ZONE_INVALID_WRITE |
+            SPDK_NVME_SC_TOO_MANY_ACTIVE_ZONES |
+            SPDK_NVME_SC_TOO_MANY_OPEN_ZONES |
+            SPDK_NVME_SC_INVALID_ZONE_STATE_TRANSITION => true,
+            _ => false,
+        },
+        _ => false,
+    }
+}
+
 // TODO move this elsewhere ASAP
 pub static PAUSING: AtomicUsize = AtomicUsize::new(0);
 pub static PAUSED: AtomicUsize = AtomicUsize::new(0);
diff --git a/io-engine/tests/mount_fs.rs b/io-engine/tests/mount_fs.rs
index 014f5b0da..1faa8cfc5 100644
--- a/io-engine/tests/mount_fs.rs
+++ b/io-engine/tests/mount_fs.rs
@@ -124,6 +124,7 @@ async fn mount_fs_mirror() {

     mount_test(ms, "xfs").await;
     mount_test(ms, "ext4").await;
+    mount_test(ms, "btrfs").await;
 }

 #[tokio::test]
diff --git a/io-engine/tests/zns.rs b/io-engine/tests/zns.rs
new file mode 100644
index 000000000..d93b1e0f1
--- /dev/null
+++ b/io-engine/tests/zns.rs
@@ -0,0 +1,190 @@
+use lazy_static::lazy_static;
+use once_cell::sync::OnceCell;
+use std::{convert::TryFrom, sync::Mutex};
+
+extern crate libnvme_rs;
+
+use io_engine::{
+    bdev::nexus::{nexus_bdev_error::Error, nexus_create, nexus_lookup_mut},
+    core::{MayastorCliArgs, Protocol, UntypedBdevHandle},
+};
+
+pub mod common;
+use common::compose::MayastorTest;
+use io_engine_tests::NullBlk;
+use run_script::{self};
+
+//TODO: Also test pcie and nvmf
+//static BDEVNAME1: &str = "pcie:///0000:00:03.0";
+//static BDEVNAME1: &str = "nvmf://192.168.0.1:4420/nvmet-always";
+lazy_static! {
+    static ref BDEVNAME1: Mutex<String> = Mutex::new(String::new());
+}
+fn get_bdevname1() -> String {
+    BDEVNAME1.lock().unwrap().clone()
+}
+fn set_bdevname1(name: String) {
+    *BDEVNAME1.lock().unwrap() = name;
+}
+
+static DISKNAME2: &str = "/tmp/disk2.img";
+static BDEVNAME2: &str = "uring:///tmp/disk2.img?blk_size=4096";
+
+static MAYASTOR: OnceCell<MayastorTest<'static>> = OnceCell::new();
+
+fn prepare_storage() -> NullBlk {
+    common::delete_file(&[DISKNAME2.into()]);
+    common::truncate_file(DISKNAME2, 64 * 1024);
+    let ret =
+        common::create_zoned_nullblk_device(4096, 2048, 1077, 0, 16, 14, 14);
+    let nullblk_id = ret.unwrap();
+    set_bdevname1(format!("uring:///dev/nullb{}?blk_size=4096", nullblk_id));
+    nullblk_id
+}
+
+fn get_ms() -> &'static MayastorTest<'static> {
+    MAYASTOR.get_or_init(|| MayastorTest::new(MayastorCliArgs::default()))
+}
+
+async fn create_connected_nvmf_nexus(
+    ms: &'static MayastorTest<'static>,
+) -> (libnvme_rs::NvmeTarget, String) {
+    let uri = ms
+        .spawn(async {
+            create_nexus().await;
+            // Claim the bdev
+            let hdl = UntypedBdevHandle::open(&get_bdevname1(), true, true);
+            let nexus = nexus_lookup_mut("nexus").unwrap();
+            let ret = nexus.share(Protocol::Nvmf, None).await.unwrap();
+            drop(hdl);
+            ret
+        })
+        .await;
+
+    // Create and connect NVMF target.
+    let target = libnvme_rs::NvmeTarget::try_from(uri)
+        .unwrap()
+        .with_rand_hostnqn(true);
+
+    target.connect().unwrap();
+
+    let devices = target.block_devices(2).unwrap();
+
+    assert_eq!(devices.len(), 1);
+    (target, devices[0].to_string())
+}
+
+fn fio_run_zoned_verify(device: &str) -> Result<String, String> {
+    println!("Running fio workload ...");
+    // This writes two zones sequentially, resets them, writes them again and reads from them to do the crc32 check
+    let (exit, stdout, stderr) = run_script::run(
+        r#"
+        fio --name=zonedwrite --rw=write --ioengine=libaio --direct=1 --zonemode=zbd \
+        --size=2z --io_size=4z --bs=128k --verify=crc32 --filename=$1
+        "#,
+        &vec![device.into()],
+        &run_script::ScriptOptions::new(),
+    ).unwrap();
+
+    if exit == 0 {
+        Ok(stdout)
+    } else {
+        Err(stderr)
+    }
+}
+
+fn blkzone(device: &str, subcommand: &str) -> Result<String, String> {
+    let (exit, stdout, stderr) = run_script::run(
+        r#"
+        blkzone $1 $2
+        "#,
+        &vec![subcommand.into(), device.into()],
+        &run_script::ScriptOptions::new(),
+    )
+    .unwrap();
+
+    if exit == 0 {
+        Ok(stdout)
+    } else {
+        Err(stderr)
+    }
+}
+
+#[tokio::test]
+async fn zns_fio() {
+    let ms = get_ms();
+
+    let _nullblk_id = prepare_storage();
+    let (target, nvmf_dev) = create_connected_nvmf_nexus(ms).await;
+
+    let fio_result = fio_run_zoned_verify(&nvmf_dev);
+    match fio_result {
+        Ok(ref ok) => println!("{}", ok),
+        Err(ref err) => println!("{}", err),
+    }
+
+    target.disconnect().unwrap();
+
+    ms.spawn(async move {
+        let mut nexus = nexus_lookup_mut("nexus").unwrap();
+        nexus.as_mut().unshare_nexus().await.unwrap();
+        nexus.destroy().await.unwrap();
+    })
+    .await;
+
+    assert_eq!(true, fio_result.is_ok());
+}
+
+#[tokio::test]
+async fn zns_blkzone() {
+    let ms = get_ms();
+
+    let _nullblk_id = prepare_storage();
+    let (target, nvmf_dev) = create_connected_nvmf_nexus(ms).await;
+
+    let blkzone_report_result = blkzone(&nvmf_dev, "report");
+    match blkzone_report_result {
+        Ok(ref ok) => println!("{}", ok),
+        Err(ref err) => println!("{}", err),
+    }
+
+    let blkzone_reset_result = blkzone(&nvmf_dev, "reset");
+    match blkzone_reset_result {
+        Ok(ref ok) => println!("{}", ok),
+        Err(ref err) => println!("{}", err),
+    }
+
+    target.disconnect().unwrap();
+
+    ms.spawn(async move {
+        let mut nexus = nexus_lookup_mut("nexus").unwrap();
+        nexus.as_mut().unshare_nexus().await.unwrap();
+        nexus.destroy().await.unwrap();
+    })
+    .await;
+
+    assert_eq!(true, blkzone_report_result.is_ok());
+    assert_eq!(true, blkzone_reset_result.is_ok());
+}
+
+#[tokio::test]
+async fn zns_replicated() {
+    let ms = get_ms();
+
+    let _nullblk_id = prepare_storage();
+    let ret = ms.spawn(async { create_replicated_nexus().await }).await;
+
+    assert_eq!(true, ret.is_err());
+}
+
+async fn create_nexus() {
+    let ch = vec![get_bdevname1()];
+    //TODO: test different sizes and a split nexus
+    nexus_create("nexus", 1024 * 1024 * 1024 * 32, None, &ch)
+        .await
+        .unwrap();
+}
+
+async fn create_replicated_nexus() -> Result<(), Error> {
+    let ch = vec![get_bdevname1(), BDEVNAME2.to_string()];
+    nexus_create("nexus", 1024 * 1024 * 1024 * 32, None, &ch).await
+}
diff --git a/nix/overlay.nix b/nix/overlay.nix
index a23923cdc..99a87df15 100644
--- a/nix/overlay.nix
+++ b/nix/overlay.nix
@@ -4,6 +4,7 @@ let
   img_prefix = if product_prefix == "" then config.product_prefix else product_prefix;
 in
 self: super: rec {
+  btrfs-progs = super.callPackage ./pkgs/btrfs-progs { };
   fio = super.callPackage ./pkgs/fio { };
   sourcer = super.callPackage ./lib/sourcer.nix { };
   images = super.callPackage ./pkgs/images { inherit img_tag img_org img_prefix; };
diff --git a/nix/pkgs/btrfs-progs/default.nix b/nix/pkgs/btrfs-progs/default.nix
new file mode 100644
index 000000000..9eeab400c
--- /dev/null
+++ b/nix/pkgs/btrfs-progs/default.nix
@@ -0,0 +1,63 @@
+# Source: https://github.com/NixOS/nixpkgs/blob/b483ef3cc866ae6afed65115ba6f0c6b19efce49/pkgs/tools/filesystems/btrfs-progs/default.nix
+{ lib
+, stdenv
+, fetchurl
+, pkg-config
+, attr
+, acl
+, zlib
+, libuuid
+, e2fsprogs
+, lzo
+, asciidoc
+, xmlto
+, docbook_xml_dtd_45
+, docbook_xsl
+, libxslt
+, zstd
+, python3
+}:
+
+stdenv.mkDerivation rec {
+  pname = "btrfs-progs";
+  version = "5.14.1";
+
+  src = fetchurl {
+    url = "mirror://kernel/linux/kernel/people/kdave/btrfs-progs/btrfs-progs-v${version}.tar.xz";
+    sha256 = "sha256-1UqTRlRcpG3xKOPMt31gwJfZDJO34xSZAjbijPr4xVs=";
+  };
+
+  nativeBuildInputs = [
+    pkg-config
+    asciidoc
+    xmlto
+    docbook_xml_dtd_45
+    docbook_xsl
+    libxslt
+    python3
+    python3.pkgs.setuptools
+  ];
+
+  buildInputs = [ attr acl zlib libuuid e2fsprogs lzo zstd python3 ];
+
+  # for python cross-compiling
+  _PYTHON_HOST_PLATFORM = stdenv.hostPlatform.config;
+
+  # gcc bug with -O1 on ARM with gcc 4.8
+  # This should be fine on all platforms so apply universally
+  postPatch = "sed -i s/-O1/-O2/ configure";
+
+  postInstall = ''
+    install -v -m 444 -D btrfs-completion $out/share/bash-completion/completions/btrfs
+  '';
+
+  configureFlags = lib.optional stdenv.hostPlatform.isMusl "--disable-backtrace";
+
+  meta = with lib; {
+    description = "Utilities for the btrfs filesystem";
+    homepage = "https://btrfs.wiki.kernel.org/";
+    license = licenses.gpl2;
+    maintainers = with maintainers; [ raskin ];
+    platforms = platforms.linux;
+  };
+}
diff --git a/nix/pkgs/images/default.nix b/nix/pkgs/images/default.nix
index 867b94821..82aa7b62b 100644
--- a/nix/pkgs/images/default.nix
+++ b/nix/pkgs/images/default.nix
@@ -19,6 +19,7 @@
 , utillinux
 , writeScriptBin
 , xfsprogs
+, btrfs-progs
 , runCommand
 , tini
 , sourcer
diff --git a/nix/pkgs/io-engine/cargo-package.nix b/nix/pkgs/io-engine/cargo-package.nix
index bf03e2518..64bc6eb07 100644
--- a/nix/pkgs/io-engine/cargo-package.nix
+++ b/nix/pkgs/io-engine/cargo-package.nix
@@ -19,6 +19,7 @@
 , protobuf
 , sources
 , xfsprogs
+, btrfs-progs
 , utillinux
 , llvmPackages
 , targetPackages
diff --git a/nix/pkgs/io-engine/default.nix b/nix/pkgs/io-engine/default.nix
index 90bd6a585..ad32082c8 100644
--- a/nix/pkgs/io-engine/default.nix
+++ b/nix/pkgs/io-engine/default.nix
@@ -17,6 +17,7 @@
 , protobuf
 , sources
 , xfsprogs
+, btrfs-progs
 , utillinux
 , llvmPackages
 , targetPackages
diff --git a/nix/pkgs/libspdk/default.nix b/nix/pkgs/libspdk/default.nix
index 1f6797466..91ab8d208 100644
--- a/nix/pkgs/libspdk/default.nix
+++ b/nix/pkgs/libspdk/default.nix
@@ -116,7 +116,6 @@ let
   (if with-fio then [ "--with-fio=${fio-include}" ] else [ ])
   ++ [
     "--with-uring"
-    "--without-uring-zns"
     "--disable-unit-tests"
    "--disable-tests"
   ];
diff --git a/spdk-rs b/spdk-rs
index a1efae6c1..bc229bbfb 160000
--- a/spdk-rs
+++ b/spdk-rs
@@ -1 +1 @@
-Subproject commit a1efae6c1d8c6eaf4f6ce54b5c919f664fd466f3
+Subproject commit bc229bbfb631e8d79cc12e0fdce1aa925047efd7