Skip to content

Commit 1214d55

Browse files
committed
Merge remote-tracking branch 'remotes/nvme/tags/nvme-next-pull-request' into staging
Emulated NVMe device updates * deallocate or unwritten logical block error feature (me) * dataset management command (me) * compare command (Gollu Appalanaidu) * namespace types (Niklas Cassel) * zoned namespaces (Dmitry Fomichev) * smart critical warning toggle (Zhenwei Pi) * allow cmb and pmr to coexist (me) * pmr rds/wds support (Naveen Nagar) * cmb v1.4 logic (Padmakar Kalghatgi) And a lot of smaller fixes from Gollu Appalanaidu and Minwoo Im. # gpg: Signature made Tue 09 Feb 2021 07:25:18 GMT # gpg: using RSA key 522833AA75E2DCE6A24766C04DE1AF316D4F0DE9 # gpg: Good signature from "Klaus Jensen <[email protected]>" [unknown] # gpg: aka "Klaus Jensen <[email protected]>" [unknown] # gpg: WARNING: This key is not certified with a trusted signature! # gpg: There is no indication that the signature belongs to the owner. # Primary key fingerprint: DDCA 4D9C 9EF9 31CC 3468 4272 63D5 6FC5 E55D A838 # Subkey fingerprint: 5228 33AA 75E2 DCE6 A247 66C0 4DE1 AF31 6D4F 0DE9 * remotes/nvme/tags/nvme-next-pull-request: (56 commits) hw/block/nvme: refactor the logic for zone write checks hw/block/nvme: fix zone boundary check for append hw/block/nvme: fix wrong parameter name 'cross_read' hw/block/nvme: align with existing style hw/block/nvme: fix set feature save field check hw/block/nvme: fix set feature for error recovery hw/block/nvme: error if drive less than a zone size hw/block/nvme: lift cmb restrictions hw/block/nvme: bump to v1.4 hw/block/nvme: move cmb logic to v1.4 hw/block/nvme: add PMR RDS/WDS support hw/block/nvme: disable PMR at boot up hw/block/nvme: remove redundant zeroing of PMR registers hw/block/nvme: rename PMR/CMB shift/mask fields hw/block/nvme: allow cmb and pmr to coexist hw/block/nvme: move msix table and pba to BAR 0 hw/block/nvme: indicate CMB support through controller capabilities register hw/block/nvme: fix 64 bit register hi/lo split writes hw/block/nvme: add size to mmio read/write trace events hw/block/nvme: trigger async event during 
injecting smart warning ... Signed-off-by: Peter Maydell <[email protected]>
2 parents 41d306e + 3e22762 commit 1214d55

File tree

6 files changed

+2879
-389
lines changed

6 files changed

+2879
-389
lines changed

hw/block/nvme-ns.c

+277-13
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "qemu/units.h"
1717
#include "qemu/cutils.h"
1818
#include "qemu/log.h"
19+
#include "qemu/error-report.h"
1920
#include "hw/block/block.h"
2021
#include "hw/pci/pci.h"
2122
#include "sysemu/sysemu.h"
@@ -25,28 +26,47 @@
2526
#include "hw/qdev-properties.h"
2627
#include "hw/qdev-core.h"
2728

29+
#include "trace.h"
2830
#include "nvme.h"
2931
#include "nvme-ns.h"
3032

31-
static void nvme_ns_init(NvmeNamespace *ns)
33+
#define MIN_DISCARD_GRANULARITY (4 * KiB)
34+
35+
/*
 * Populate the Identify Namespace data structure from the block device
 * configuration.
 *
 * Returns 0 on success.  @errp is currently unused (no failure path yet);
 * it is kept so future checks can report errors without a signature change.
 */
static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
{
    BlockDriverInfo bdi;
    NvmeIdNs *id_ns = &ns->id_ns;
    int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    int npdg;

    /*
     * DLFEAT 0x9: per the NVMe spec encoding, deallocated blocks read back
     * as zeroes and the Deallocate bit in Write Zeroes is supported.
     */
    ns->id_ns.dlfeat = 0x9;

    /* LBA data size is stored as a power of two (e.g. 9 for 512 B) */
    id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);

    id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));

    /* default to the plain NVM command set; zoned setup may override later */
    ns->csi = NVME_CSI_NVM;

    /* no thin provisioning */
    id_ns->ncap = id_ns->nsze;
    id_ns->nuse = id_ns->ncap;

    /* support DULBE and I/O optimization fields */
    id_ns->nsfeat |= (0x4 | 0x10);

    /*
     * Derive the deallocation granularity (in logical blocks) from the
     * configured discard granularity, widened to the image cluster size
     * when the format (e.g. qcow2) allocates in larger units.
     */
    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;

    if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
        bdi.cluster_size > ns->blkconf.discard_granularity) {
        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
    }

    /* NPDG/NPDA are zeroes-based fields */
    id_ns->npda = id_ns->npdg = npdg - 1;

    return 0;
}
4868

49-
static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
69+
static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
5070
{
5171
bool read_only;
5272

@@ -59,19 +79,225 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
5979
return -1;
6080
}
6181

82+
if (ns->blkconf.discard_granularity == -1) {
83+
ns->blkconf.discard_granularity =
84+
MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
85+
}
86+
6287
ns->size = blk_getlength(ns->blkconf.blk);
6388
if (ns->size < 0) {
6489
error_setg_errno(errp, -ns->size, "could not get blockdev size");
6590
return -1;
6691
}
6792

68-
if (blk_enable_write_cache(ns->blkconf.blk)) {
69-
n->features.vwc = 0x1;
93+
return 0;
94+
}
95+
96+
/*
 * Validate the "zoned.*" properties and derive the zone geometry
 * (zone_size, zone_capacity, num_zones) in units of logical blocks.
 *
 * Returns 0 on success, -1 with *errp set when the configuration is
 * inconsistent (capacity > size, sizes smaller than a logical block,
 * limits exceeding the zone count, or a malformed descriptor extension
 * size).
 */
static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
{
    uint64_t zone_size, zone_cap;
    uint32_t lbasz = ns->blkconf.logical_block_size;

    /* Make sure that the values of ZNS properties are sane */
    if (ns->params.zone_size_bs) {
        zone_size = ns->params.zone_size_bs;
    } else {
        zone_size = NVME_DEFAULT_ZONE_SIZE;
    }
    if (ns->params.zone_cap_bs) {
        zone_cap = ns->params.zone_cap_bs;
    } else {
        /* unspecified capacity defaults to the full zone size */
        zone_cap = zone_size;
    }
    if (zone_cap > zone_size) {
        error_setg(errp, "zone capacity %"PRIu64"B exceeds "
                   "zone size %"PRIu64"B", zone_cap, zone_size);
        return -1;
    }
    if (zone_size < lbasz) {
        error_setg(errp, "zone size %"PRIu64"B too small, "
                   "must be at least %"PRIu32"B", zone_size, lbasz);
        return -1;
    }
    if (zone_cap < lbasz) {
        error_setg(errp, "zone capacity %"PRIu64"B too small, "
                   "must be at least %"PRIu32"B", zone_cap, lbasz);
        return -1;
    }

    /*
     * Save the main zone geometry values to avoid
     * calculating them later again.
     */
    ns->zone_size = zone_size / lbasz;
    ns->zone_capacity = zone_cap / lbasz;
    /* integer division: any partial tail zone is silently dropped */
    ns->num_zones = ns->size / lbasz / ns->zone_size;

    /* Do a few more sanity checks of ZNS properties */
    if (!ns->num_zones) {
        error_setg(errp,
                   "insufficient drive capacity, must be at least the size "
                   "of one zone (%"PRIu64"B)", zone_size);
        return -1;
    }

    if (ns->params.max_open_zones > ns->num_zones) {
        error_setg(errp,
                   "max_open_zones value %u exceeds the number of zones %u",
                   ns->params.max_open_zones, ns->num_zones);
        return -1;
    }
    if (ns->params.max_active_zones > ns->num_zones) {
        error_setg(errp,
                   "max_active_zones value %u exceeds the number of zones %u",
                   ns->params.max_active_zones, ns->num_zones);
        return -1;
    }

    if (ns->params.zd_extension_size) {
        /* the ZDES identify field holds the size in units of 64 bytes */
        if (ns->params.zd_extension_size & 0x3f) {
            error_setg(errp,
                "zone descriptor extension size must be a multiple of 64B");
            return -1;
        }
        /* ZDES is a single byte, so the 64B-unit count must fit in 0xff */
        if ((ns->params.zd_extension_size >> 6) > 0xff) {
            error_setg(errp, "zone descriptor extension size is too large");
            return -1;
        }
    }

    return 0;
}
74171

172+
/*
 * Allocate the in-memory zone array (and optional zone descriptor
 * extension buffer) and reset every zone to the Empty state with its
 * write pointer at the zone start.
 */
static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
{
    uint64_t start = 0, zone_size = ns->zone_size;
    uint64_t capacity = ns->num_zones * zone_size;
    NvmeZone *zone;
    int i;

    ns->zone_array = g_new0(NvmeZone, ns->num_zones);
    if (ns->params.zd_extension_size) {
        /*
         * NOTE(review): 32-bit multiply — could wrap for extreme
         * zd_extension_size * num_zones combinations; consider a
         * checked allocation (g_malloc0_n) — confirm upstream intent.
         */
        ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
                                      ns->num_zones);
    }

    QTAILQ_INIT(&ns->exp_open_zones);
    QTAILQ_INIT(&ns->imp_open_zones);
    QTAILQ_INIT(&ns->closed_zones);
    QTAILQ_INIT(&ns->full_zones);

    zone = ns->zone_array;
    for (i = 0; i < ns->num_zones; i++, zone++) {
        /* the last zone may be truncated to fit the device capacity */
        if (start + zone_size > capacity) {
            zone_size = capacity - start;
        }
        zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
        zone->d.za = 0;
        zone->d.zcap = ns->zone_capacity;
        zone->d.zslba = start;
        zone->d.wp = start;
        zone->w_ptr = start;
        start += zone_size;
    }

    /* 0 means "zone size is not a power of two"; callers must check */
    ns->zone_size_log2 = 0;
    if (is_power_of_2(ns->zone_size)) {
        ns->zone_size_log2 = 63 - clz64(ns->zone_size);
    }
}
210+
211+
/*
 * Build the zoned (ZNS) identify data and the runtime zone state.
 * Must run after nvme_ns_zoned_check_calc_geometry() has filled in
 * zone_size, zone_capacity and num_zones.
 */
static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index)
{
    NvmeIdNsZoned *id_ns_z;

    nvme_ns_zoned_init_state(ns);

    id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));

    /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
    id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
    id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
    id_ns_z->zoc = 0;
    /* OZCS bit 0: reads may cross zone boundaries */
    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;

    id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
    id_ns_z->lbafe[lba_index].zdes =
        ns->params.zd_extension_size >> 6; /* Units of 64B */

    /* switch the namespace over to the Zoned command set */
    ns->csi = NVME_CSI_ZONED;
    ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
    ns->id_ns.ncap = ns->id_ns.nsze;
    ns->id_ns.nuse = ns->id_ns.ncap;

    /*
     * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
     * status of logical blocks. Since the spec defines that logical blocks
     * SHALL be deallocated when the zone is in the Empty or Offline states,
     * we can only support DULBE if the zone size is a multiple of the
     * calculated NPDG.
     */
    if (ns->zone_size % (ns->id_ns.npdg + 1)) {
        warn_report("the zone size (%"PRIu64" blocks) is not a multiple of "
                    "the calculated deallocation granularity (%d blocks); "
                    "DULBE support disabled",
                    ns->zone_size, ns->id_ns.npdg + 1);

        /* clear the DULBE bit set in nvme_ns_init() */
        ns->id_ns.nsfeat &= ~0x4;
    }

    ns->id_ns_zoned = id_ns_z;
}
252+
253+
/*
 * Bring a single zone to a consistent resting state: zones with a write
 * pointer past the zone start (or with a valid zone descriptor extension)
 * are parked in the Closed state; untouched zones go back to Empty.
 */
static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
{
    uint8_t state;

    /* discard any speculative (uncommitted) write pointer advance */
    zone->w_ptr = zone->d.wp;
    state = nvme_get_zone_state(zone);
    if (zone->d.wp != zone->d.zslba ||
        (zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
        if (state != NVME_ZONE_STATE_CLOSED) {
            trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
            nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
        }
        /*
         * NOTE(review): assumes the caller already unlinked the zone and
         * dropped its active/open counts (see nvme_zoned_ns_shutdown).
         */
        nvme_aor_inc_active(ns);
        QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
    } else {
        trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
    }
}
272+
273+
/*
 * Close all the zones that are currently open.
 *
 * Each zone is unlinked from its list and its open/active accounting is
 * dropped before nvme_clear_zone() decides its final resting state
 * (Closed or Empty) and re-inserts it as needed.
 */
static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
{
    NvmeZone *zone, *next;

    QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
        QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
        nvme_aor_dec_active(ns);
        nvme_clear_zone(ns, zone);
    }
    QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
        QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
        nvme_aor_dec_open(ns);
        nvme_aor_dec_active(ns);
        nvme_clear_zone(ns, zone);
    }
    QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
        QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
        nvme_aor_dec_open(ns);
        nvme_aor_dec_active(ns);
        nvme_clear_zone(ns, zone);
    }

    /* every open zone must have been drained by the loops above */
    assert(ns->nr_open_zones == 0);
}
300+
75301
static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
76302
{
77303
if (!ns->blkconf.blk) {
@@ -82,20 +308,25 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
82308
return 0;
83309
}
84310

85-
int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
311+
/*
 * Namespace initialization entry point: validate configuration, attach
 * the backing block device, build the identify data and — when the
 * "zoned" property is set — the zoned command set state.
 *
 * Returns 0 on success, -1 with *errp set on failure.
 */
int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
{
    if (nvme_ns_check_constraints(ns, errp)) {
        return -1;
    }

    if (nvme_ns_init_blk(ns, errp)) {
        return -1;
    }

    if (nvme_ns_init(ns, errp)) {
        return -1;
    }
    if (ns->params.zoned) {
        if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
            return -1;
        }
        /* NOTE(review): lba_index hard-coded to 0 — presumably only a
         * single LBA format is supported here; confirm against nvme.c */
        nvme_ns_init_zoned(ns, 0);
    }

    return 0;
}
@@ -105,9 +336,21 @@ void nvme_ns_drain(NvmeNamespace *ns)
105336
blk_drain(ns->blkconf.blk);
106337
}
107338

108-
void nvme_ns_flush(NvmeNamespace *ns)
339+
/*
 * Flush the backing block device and, for zoned namespaces, bring every
 * zone to a consistent (Closed or Empty) state.
 */
void nvme_ns_shutdown(NvmeNamespace *ns)
{
    blk_flush(ns->blkconf.blk);
    if (ns->params.zoned) {
        nvme_zoned_ns_shutdown(ns);
    }
}
346+
347+
/*
 * Release the dynamically allocated zoned-namespace state.  Non-zoned
 * namespaces allocate none of these buffers, so there is nothing to do.
 */
void nvme_ns_cleanup(NvmeNamespace *ns)
{
    if (!ns->params.zoned) {
        return;
    }

    g_free(ns->id_ns_zoned);
    g_free(ns->zone_array);
    g_free(ns->zd_extensions);
}
112355

113356
static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -117,16 +360,37 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
117360
NvmeCtrl *n = NVME(s->parent);
118361
Error *local_err = NULL;
119362

120-
if (nvme_ns_setup(n, ns, &local_err)) {
363+
if (nvme_ns_setup(ns, &local_err)) {
121364
error_propagate_prepend(errp, local_err,
122365
"could not setup namespace: ");
123366
return;
124367
}
368+
369+
if (nvme_register_namespace(n, ns, errp)) {
370+
error_propagate_prepend(errp, local_err,
371+
"could not register namespace: ");
372+
return;
373+
}
374+
125375
}
126376

127377
/*
 * qdev properties for the nvme-ns device.  The "zoned.*" knobs only take
 * effect when "zoned" is true (see nvme_ns_setup), and sizes are given in
 * bytes.
 */
static Property nvme_ns_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
    DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
    DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
    DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs,
                     NVME_DEFAULT_ZONE_SIZE),
    /* 0 means "capacity equals zone size" */
    DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs,
                     0),
    DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
                     params.cross_zone_read, false),
    /* 0 means "no limit" for both active and open zone counts */
    DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace,
                       params.max_active_zones, 0),
    DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
                       params.max_open_zones, 0),
    /* must be a multiple of 64 bytes and at most 0xff * 64 bytes */
    DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
                       params.zd_extension_size, 0),
    DEFINE_PROP_END_OF_LIST(),
};
132396

0 commit comments

Comments
 (0)