@@ -16,6 +16,7 @@
 #include "qemu/units.h"
 #include "qemu/cutils.h"
 #include "qemu/log.h"
+#include "qemu/error-report.h"
 #include "hw/block/block.h"
 #include "hw/pci/pci.h"
 #include "sysemu/sysemu.h"
@@ -25,28 +26,47 @@
 #include "hw/qdev-properties.h"
 #include "hw/qdev-core.h"
 
+#include "trace.h"
 #include "nvme.h"
 #include "nvme-ns.h"
 
-static void nvme_ns_init(NvmeNamespace *ns)
+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 {
+    BlockDriverInfo bdi;
     NvmeIdNs *id_ns = &ns->id_ns;
     int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    int npdg;
 
-    if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
-        ns->id_ns.dlfeat = 0x9;
-    }
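+    /*
+     * Per NVMe 1.4, DLFEAT 0x9 means deallocated blocks read back as zeroes
+     * (bits 2:0 = 001b) and the Deallocate bit in Write Zeroes is supported
+     * (bit 3).
+     */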
+    ns->id_ns.dlfeat = 0x9;
 
     id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
 
     id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
+    ns->csi = NVME_CSI_NVM;
+
     /* no thin provisioning */
     id_ns->ncap = id_ns->nsze;
     id_ns->nuse = id_ns->ncap;
+
+    /* support DULBE and I/O optimization fields */
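+    /* (per NVMe 1.4, 0x4 is NSFEAT bit 2, DULBE, and 0x10 is bit 4, which
+     * marks the NPWG/NPWA/NPDG/NPDA/NOWS fields as valid) */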
+    id_ns->nsfeat |= (0x4 | 0x10);
+
+    npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+    if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
+        bdi.cluster_size > ns->blkconf.discard_granularity) {
+        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+    }
+
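+    /* NPDG and NPDA are 0's based values, expressed in logical blocks */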
+    id_ns->npda = id_ns->npdg = npdg - 1;
+
+    return 0;
 }
 
-static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
 {
     bool read_only;
 
@@ -59,19 +79,225 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
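+    /*
+     * A discard_granularity of -1 means the property was left unset; default
+     * it to the logical block size, but to no less than 4 KiB.
+     */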
+    if (ns->blkconf.discard_granularity == -1) {
+        ns->blkconf.discard_granularity =
+            MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
+    }
+
     ns->size = blk_getlength(ns->blkconf.blk);
     if (ns->size < 0) {
         error_setg_errno(errp, -ns->size, "could not get blockdev size");
         return -1;
     }
 
-    if (blk_enable_write_cache(ns->blkconf.blk)) {
-        n->features.vwc = 0x1;
+    return 0;
+}
+
+static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
+{
+    uint64_t zone_size, zone_cap;
+    uint32_t lbasz = ns->blkconf.logical_block_size;
+
+    /* Make sure that the values of ZNS properties are sane */
+    if (ns->params.zone_size_bs) {
+        zone_size = ns->params.zone_size_bs;
+    } else {
+        zone_size = NVME_DEFAULT_ZONE_SIZE;
+    }
+    if (ns->params.zone_cap_bs) {
+        zone_cap = ns->params.zone_cap_bs;
+    } else {
+        zone_cap = zone_size;
+    }
+    if (zone_cap > zone_size) {
+        error_setg(errp, "zone capacity %"PRIu64"B exceeds "
+                   "zone size %"PRIu64"B", zone_cap, zone_size);
+        return -1;
+    }
+    if (zone_size < lbasz) {
+        error_setg(errp, "zone size %"PRIu64"B too small, "
+                   "must be at least %"PRIu32"B", zone_size, lbasz);
+        return -1;
+    }
+    if (zone_cap < lbasz) {
+        error_setg(errp, "zone capacity %"PRIu64"B too small, "
+                   "must be at least %"PRIu32"B", zone_cap, lbasz);
+        return -1;
+    }
+
+    /*
+     * Save the main zone geometry values to avoid
+     * calculating them later again.
+     */
+    ns->zone_size = zone_size / lbasz;
+    ns->zone_capacity = zone_cap / lbasz;
+    ns->num_zones = ns->size / lbasz / ns->zone_size;
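+    /*
+     * Example: a 1 GiB backing image with 4096 B logical blocks and
+     * zoned.zone_size=64M yields zone_size = 16384 LBAs, zone_capacity =
+     * 16384 LBAs and num_zones = 16.
+     */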
+
+    /* Do a few more sanity checks of ZNS properties */
+    if (!ns->num_zones) {
+        error_setg(errp,
+                   "insufficient drive capacity, must be at least the size "
+                   "of one zone (%"PRIu64"B)", zone_size);
+        return -1;
+    }
+
+    if (ns->params.max_open_zones > ns->num_zones) {
+        error_setg(errp,
+                   "max_open_zones value %u exceeds the number of zones %u",
+                   ns->params.max_open_zones, ns->num_zones);
+        return -1;
+    }
+    if (ns->params.max_active_zones > ns->num_zones) {
+        error_setg(errp,
+                   "max_active_zones value %u exceeds the number of zones %u",
+                   ns->params.max_active_zones, ns->num_zones);
+        return -1;
+    }
+
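+    /*
+     * ZDES is an 8-bit field expressed in units of 64 bytes, hence both of
+     * the checks below.
+     */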
+    if (ns->params.zd_extension_size) {
+        if (ns->params.zd_extension_size & 0x3f) {
+            error_setg(errp,
+                "zone descriptor extension size must be a multiple of 64B");
+            return -1;
+        }
+        if ((ns->params.zd_extension_size >> 6) > 0xff) {
+            error_setg(errp, "zone descriptor extension size is too large");
+            return -1;
+        }
     }
 
     return 0;
 }
 
+static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
+{
+    uint64_t start = 0, zone_size = ns->zone_size;
+    uint64_t capacity = ns->num_zones * zone_size;
+    NvmeZone *zone;
+    int i;
+
+    ns->zone_array = g_new0(NvmeZone, ns->num_zones);
+    if (ns->params.zd_extension_size) {
+        ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+                                      ns->num_zones);
+    }
+
+    QTAILQ_INIT(&ns->exp_open_zones);
+    QTAILQ_INIT(&ns->imp_open_zones);
+    QTAILQ_INIT(&ns->closed_zones);
+    QTAILQ_INIT(&ns->full_zones);
+
+    zone = ns->zone_array;
+    for (i = 0; i < ns->num_zones; i++, zone++) {
+        if (start + zone_size > capacity) {
+            zone_size = capacity - start;
+        }
+        zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
+        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
+        zone->d.za = 0;
+        zone->d.zcap = ns->zone_capacity;
+        zone->d.zslba = start;
+        zone->d.wp = start;
+        zone->w_ptr = start;
+        start += zone_size;
+    }
+
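+    /*
+     * Cache log2(zone size) so zone lookups can use a shift instead of a
+     * division when the zone size is a power of two; 0 means "not a power
+     * of two".
+     */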
+    ns->zone_size_log2 = 0;
+    if (is_power_of_2(ns->zone_size)) {
+        ns->zone_size_log2 = 63 - clz64(ns->zone_size);
+    }
+}
+
+static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index)
+{
+    NvmeIdNsZoned *id_ns_z;
+
+    nvme_ns_zoned_init_state(ns);
+
+    id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
+
+    /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
+    id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+    id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
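+    /* (a max_*_zones value of 0 wraps to 0xffffffff here, i.e. no limit) */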
+    id_ns_z->zoc = 0;
+    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
+
+    id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
+    id_ns_z->lbafe[lba_index].zdes =
+        ns->params.zd_extension_size >> 6; /* Units of 64B */
+
+    ns->csi = NVME_CSI_ZONED;
+    ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
+    ns->id_ns.ncap = ns->id_ns.nsze;
+    ns->id_ns.nuse = ns->id_ns.ncap;
+
+    /*
+     * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
+     * status of logical blocks. Since the spec defines that logical blocks
+     * SHALL be deallocated when the zone is in the Empty or Offline states,
+     * we can only support DULBE if the zone size is a multiple of the
+     * calculated NPDG.
+     */
+    if (ns->zone_size % (ns->id_ns.npdg + 1)) {
+        warn_report("the zone size (%"PRIu64" blocks) is not a multiple of "
+                    "the calculated deallocation granularity (%d blocks); "
+                    "DULBE support disabled",
+                    ns->zone_size, ns->id_ns.npdg + 1);
+
+        ns->id_ns.nsfeat &= ~0x4;
+    }
+
+    ns->id_ns_zoned = id_ns_z;
+}
+
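+/*
+ * If a zone was written to or has a valid zone descriptor extension, move it
+ * to the Closed state (and onto the closed list); otherwise reset it to
+ * Empty.
+ */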
+static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+    uint8_t state;
+
+    zone->w_ptr = zone->d.wp;
+    state = nvme_get_zone_state(zone);
+    if (zone->d.wp != zone->d.zslba ||
+        (zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
+        if (state != NVME_ZONE_STATE_CLOSED) {
+            trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
+            nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
+        }
+        nvme_aor_inc_active(ns);
+        QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
+    } else {
+        trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
+        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
+    }
+}
+
+/*
+ * Close all the zones that are currently open.
+ */
+static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
+{
+    NvmeZone *zone, *next;
+
+    QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
+        QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
+        nvme_aor_dec_active(ns);
+        nvme_clear_zone(ns, zone);
+    }
+    QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
+        QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+        nvme_aor_dec_open(ns);
+        nvme_aor_dec_active(ns);
+        nvme_clear_zone(ns, zone);
+    }
+    QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
+        QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
+        nvme_aor_dec_open(ns);
+        nvme_aor_dec_active(ns);
+        nvme_clear_zone(ns, zone);
+    }
+
+    assert(ns->nr_open_zones == 0);
+}
+
 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
 {
     if (!ns->blkconf.blk) {
@@ -82,20 +308,25 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
     return 0;
 }
 
-int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
 {
     if (nvme_ns_check_constraints(ns, errp)) {
         return -1;
     }
 
-    if (nvme_ns_init_blk(n, ns, errp)) {
+    if (nvme_ns_init_blk(ns, errp)) {
         return -1;
     }
 
-    nvme_ns_init(ns);
-    if (nvme_register_namespace(n, ns, errp)) {
+    if (nvme_ns_init(ns, errp)) {
         return -1;
     }
+    if (ns->params.zoned) {
+        if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
+            return -1;
+        }
+        nvme_ns_init_zoned(ns, 0);
+    }
 
     return 0;
 }
@@ -105,9 +336,21 @@ void nvme_ns_drain(NvmeNamespace *ns)
     blk_drain(ns->blkconf.blk);
 }
 
-void nvme_ns_flush(NvmeNamespace *ns)
+void nvme_ns_shutdown(NvmeNamespace *ns)
 {
     blk_flush(ns->blkconf.blk);
+    if (ns->params.zoned) {
+        nvme_zoned_ns_shutdown(ns);
+    }
+}
+
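+/*
+ * g_free(NULL) is a no-op, so this is safe even when zd_extensions was never
+ * allocated (i.e. zd_extension_size is 0).
+ */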
+void nvme_ns_cleanup(NvmeNamespace *ns)
+{
+    if (ns->params.zoned) {
+        g_free(ns->id_ns_zoned);
+        g_free(ns->zone_array);
+        g_free(ns->zd_extensions);
+    }
 }
 
 static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -117,16 +360,37 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
     NvmeCtrl *n = NVME(s->parent);
     Error *local_err = NULL;
 
-    if (nvme_ns_setup(n, ns, &local_err)) {
+    if (nvme_ns_setup(ns, &local_err)) {
         error_propagate_prepend(errp, local_err,
                                 "could not setup namespace: ");
         return;
     }
+
+    if (nvme_register_namespace(n, ns, &local_err)) {
+        error_propagate_prepend(errp, local_err,
+                                "could not register namespace: ");
+        return;
+    }
+
 }
 
 static Property nvme_ns_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
     DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
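+    /*
+     * The zoned.* properties below would be set on a (hypothetical) command
+     * line such as:
+     *   -device nvme-ns,drive=nvm0,zoned=true,zoned.zone_size=64M,
+     *           zoned.max_open=16,zoned.max_active=32
+     * where "nvm0" names a previously defined drive.
+     */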
+    DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
+    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
+    DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs,
+                     NVME_DEFAULT_ZONE_SIZE),
+    DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs,
+                     0),
+    DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
+                     params.cross_zone_read, false),
+    DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace,
+                       params.max_active_zones, 0),
+    DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
+                       params.max_open_zones, 0),
+    DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+                       params.zd_extension_size, 0),
     DEFINE_PROP_END_OF_LIST(),
 };