Skip to content

Commit 0594de3

Browse files
committed
range_tree: Add zfs_recover_rt parameter and extra debug info
There are production cases where an unexpected range tree segment addition or removal leads to a panic. Investigating the root cause requires more debug info about the range tree and the segments in question at the moment it happens. In addition, the zfs_recover_rt parameter allows converting such panics into warnings, with a potential space leak as the trade-off. Signed-off-by: Igor Ostapenko <[email protected]>
1 parent fe67499 commit 0594de3

File tree

9 files changed

+257
-71
lines changed

9 files changed

+257
-71
lines changed

include/sys/range_tree.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,32 @@ typedef enum zfs_range_seg_type {
4848
ZFS_RANGE_SEG_NUM_TYPES,
4949
} zfs_range_seg_type_t;
5050

51+
/*
52+
* Range tree behavior flags.
53+
*
54+
* The UC (use case) flags are intended to support the zfs_recover_rt mode.
55+
* The range tree's logic needs to know the context in order to correctly
56+
* recover from an unexpected situation by exchanging potential data loss for
57+
* a potential space leak:
58+
*
59+
* - If it knows that the tree represents allocated space then it is better to
60+
* perform an unexpected addition to the tree.
61+
*
62+
* - Similarly, if it's about free space (aka allocatable) then it should
63+
* perform unexpected removals instead of silently ignoring the issue.
64+
*
65+
* The generic case means to simply ignore unexpected additions/removals as
66+
* a recovery mechanism, without special treatment.
67+
*
68+
* Unexpected actions are logged with extra details such as a range tree
69+
* name string, which can be marked as dynamic to be freed along with the tree
70+
* instance when it is destroyed.
71+
*/
72+
#define ZFS_RANGE_TREE_F_UC_GENERIC (1 << 0)
73+
#define ZFS_RANGE_TREE_F_UC_ALLOCATED_SPACE (1 << 1)
74+
#define ZFS_RANGE_TREE_F_UC_FREE_SPACE (1 << 2)
75+
#define ZFS_RANGE_TREE_F_DYN_NAME (1 << 3)
76+
5177
/*
5278
* Note: the range_tree may not be accessed concurrently; consumers
5379
* must provide external locking if required.
@@ -67,6 +93,9 @@ typedef struct zfs_range_tree {
6793
void *rt_arg;
6894
uint64_t rt_gap; /* allowable inter-segment gap */
6995

96+
uint64_t rt_flags;
97+
const char *rt_name; /* details for debugging */
98+
7099
/*
71100
* The rt_histogram maintains a histogram of ranges. Each bucket,
72101
* rt_histogram[i], contains the number of ranges whose size is:
@@ -280,6 +309,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
280309
uint64_t gap);
281310
zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
282311
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
312+
zfs_range_tree_t *zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
313+
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
314+
uint64_t flags, const char *name);
283315
void zfs_range_tree_destroy(zfs_range_tree_t *rt);
284316
boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
285317
uint64_t size);

man/man4/zfs.4

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1987,6 +1987,12 @@ Set to attempt to recover from fatal errors.
19871987
This should only be used as a last resort,
19881988
as it typically results in leaked space, or worse.
19891989
.
1990+
.It Sy zfs_recover_rt Ns = Ns Sy 0 Ns | Ns 1 Pq int
1991+
Set to attempt to recover from fatal errors while adding or removing
1992+
unexpected segments to or from a range tree.
1993+
This should only be used as a last resort,
1994+
as it typically results in leaked space.
1995+
.
19901996
.It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int
19911997
Ignore hard I/O errors during device removal.
19921998
When set, if a device encounters a hard I/O error during the removal process

module/zfs/dnode.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,8 +2435,10 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
24352435
{
24362436
int txgoff = tx->tx_txg & TXG_MASK;
24372437
if (dn->dn_free_ranges[txgoff] == NULL) {
2438-
dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
2439-
ZFS_RANGE_SEG64, NULL, 0, 0);
2438+
dn->dn_free_ranges[txgoff] =
2439+
zfs_range_tree_create_flags(
2440+
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
2441+
ZFS_RANGE_TREE_F_UC_FREE_SPACE, "dn_free_ranges");
24402442
}
24412443
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
24422444
zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);

module/zfs/metaslab.c

Lines changed: 55 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,17 @@ static metaslab_stats_t metaslab_stats = {
368368
#define METASLABSTAT_BUMP(stat) \
369369
atomic_inc_64(&metaslab_stats.stat.value.ui64);
370370

371+
static inline char *
372+
zfs_rt_name(metaslab_group_t *mg, metaslab_t *ms,
373+
const char *name)
374+
{
375+
return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}",
376+
mg->mg_vd->vdev_spa->spa_name,
377+
(u_longlong_t)mg->mg_vd->vdev_guid,
378+
(u_longlong_t)ms->ms_id,
379+
name));
380+
}
381+
371382

372383
static kstat_t *metaslab_ksp;
373384

@@ -2753,30 +2764,53 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
27532764
zfs_range_seg_type_t type =
27542765
metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
27552766

2756-
ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
2757-
shift);
2767+
ms->ms_allocatable = zfs_range_tree_create_flags(
2768+
NULL, type, NULL, start, shift,
2769+
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
2770+
zfs_rt_name(mg, ms, "ms_allocatable"));
27582771
for (int t = 0; t < TXG_SIZE; t++) {
2759-
ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
2760-
NULL, start, shift);
2761-
}
2762-
ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
2763-
ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
2772+
ms->ms_allocating[t] = zfs_range_tree_create_flags(
2773+
NULL, type, NULL, start, shift,
2774+
ZFS_RANGE_TREE_F_UC_ALLOCATED_SPACE |
2775+
ZFS_RANGE_TREE_F_DYN_NAME,
2776+
zfs_rt_name(mg, ms, "ms_allocating"));
2777+
}
2778+
ms->ms_freeing = zfs_range_tree_create_flags(
2779+
NULL, type, NULL, start, shift,
2780+
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
2781+
zfs_rt_name(mg, ms, "ms_freeing"));
2782+
ms->ms_freed = zfs_range_tree_create_flags(
2783+
NULL, type, NULL, start, shift,
2784+
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
2785+
zfs_rt_name(mg, ms, "ms_freed"));
27642786
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2765-
ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
2766-
start, shift);
2767-
}
2768-
ms->ms_checkpointing =
2769-
zfs_range_tree_create(NULL, type, NULL, start, shift);
2770-
ms->ms_unflushed_allocs =
2771-
zfs_range_tree_create(NULL, type, NULL, start, shift);
2787+
ms->ms_defer[t] = zfs_range_tree_create_flags(
2788+
NULL, type, NULL, start, shift,
2789+
ZFS_RANGE_TREE_F_UC_FREE_SPACE |
2790+
ZFS_RANGE_TREE_F_DYN_NAME,
2791+
zfs_rt_name(mg, ms, "ms_defer"));
2792+
}
2793+
ms->ms_checkpointing = zfs_range_tree_create_flags(
2794+
NULL, type, NULL, start, shift,
2795+
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
2796+
zfs_rt_name(mg, ms, "ms_checkpointing"));
2797+
ms->ms_unflushed_allocs = zfs_range_tree_create_flags(
2798+
NULL, type, NULL, start, shift,
2799+
ZFS_RANGE_TREE_F_UC_ALLOCATED_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
2800+
zfs_rt_name(mg, ms, "ms_unflushed_allocs"));
27722801

27732802
metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
27742803
mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
27752804
mrap->mra_floor_shift = metaslab_by_size_min_shift;
2776-
ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
2777-
type, mrap, start, shift);
2805+
ms->ms_unflushed_frees = zfs_range_tree_create_flags(
2806+
&metaslab_rt_ops, type, mrap, start, shift,
2807+
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
2808+
zfs_rt_name(mg, ms, "ms_unflushed_frees"));
27782809

2779-
ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
2810+
ms->ms_trim = zfs_range_tree_create_flags(
2811+
NULL, type, NULL, start, shift,
2812+
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
2813+
zfs_rt_name(mg, ms, "ms_trim"));
27802814

27812815
metaslab_group_add(mg, ms);
27822816
metaslab_set_fragmentation(ms, B_FALSE);
@@ -3750,7 +3784,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
37503784
type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
37513785
&start, &shift);
37523786

3753-
condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
3787+
condense_tree = zfs_range_tree_create_flags(
3788+
NULL, type, NULL, start, shift,
3789+
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
3790+
zfs_rt_name(msp->ms_group, msp, "condense_tree"));
37543791

37553792
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
37563793
zfs_range_tree_walk(msp->ms_defer[t],

0 commit comments

Comments
 (0)