Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 72 additions & 22 deletions bpf/estat/metaslab-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@
#define VD_NAME_SIZE 32
/*
 * Per-thread scratch record kept in data_map, keyed by the u32 tid taken
 * from bpf_get_current_pid_tgid(). The outer allocation entry probe creates
 * it, the metaslab_group_alloc probes fill in and consume the per-group
 * fields, and the outer return probe deletes it.
 *
 * NOTE(review): `size` is declared twice below and `alloc_time` is still
 * listed although the newer probe code no longer touches it -- this looks
 * like old and new diff lines rendered together; confirm against the real
 * file before relying on this layout.
 */
typedef struct {
/* bpf_ktime_get_ns() at outer entry; return probes bail out when it is 0. */
u64 ts;
u64 size;
u64 size; /* psize from metaslab_alloc_dva_range_entry */
/* asize captured by metaslab_group_alloc_entry; reported, then reset to 0. */
u64 asize;
/* bpf_ktime_get_ns() stored by metaslab_group_alloc_entry. */
u64 alloc_time;
/* vdev_path of the group's vdev, or vdev_op_type when the path is NULL. */
char vd_name[VD_NAME_SIZE];
} data_t;

Expand All @@ -40,9 +39,32 @@ equal_to_pool(char *str)
return (true);
}

// @@ kprobe|metaslab_alloc_dva|metaslab_alloc_dva_entry
/*
* metaslab_alloc_dva_range() is the per-DVA entry point in the write path
* for ZFS versions that have it (Delphix 2026.3 / ZFS 2.4.99+). It receives
* spa_t * as its first argument, making pool filtering straightforward.
*
* Call chain:
* metaslab_alloc()
* -> metaslab_alloc_range()
* -> metaslab_alloc_dva_range() <- outer probe (entry/exit)
* -> metaslab_group_alloc() <- inner probe (entry/exit)
*
* metaslab_alloc_dva_range() may call metaslab_group_alloc() multiple times
* (once per metaslab group tried). We therefore emit a metric on each
* metaslab_group_alloc_exit and reset the per-group fields so the next
* attempt gets a fresh vdev name, leaving the outer entry alive until
* metaslab_alloc_dva_range_exit cleans it up.
*
* metaslab_alloc_dva() (the old outer entry point, now only used in
* vdev_removal.c) is no longer probed. If this script is run on an older
* ZFS that lacks metaslab_alloc_dva_range, estat(8) will print a WARNING and
* skip those probes — no data will be collected.
*/

// @@ kprobe|metaslab_alloc_dva_range|metaslab_alloc_dva_range_entry
int
metaslab_alloc_dva_entry(struct pt_regs *ctx,
metaslab_alloc_dva_range_entry(struct pt_regs *ctx,
spa_t *spa, metaslab_class_t *mc, uint64_t psize)
{
u32 tid = bpf_get_current_pid_tgid();
Expand All @@ -51,7 +73,7 @@ metaslab_alloc_dva_entry(struct pt_regs *ctx,
if (!equal_to_pool(spa->spa_name))
return (0);

data.ts = bpf_ktime_get_ns();
data.ts = bpf_ktime_get_ns();
data.size = psize;

data_map.update(&tid, &data);
Expand All @@ -71,14 +93,14 @@ metaslab_group_alloc_entry(struct pt_regs *ctx,
return (0);

data->asize = asize;
data->alloc_time = bpf_ktime_get_ns();

if (mg->mg_vd->vdev_path != NULL) {
bpf_probe_read_str(data->vd_name,
sizeof(data->vd_name), mg->mg_vd->vdev_path);
} else {
bpf_probe_read_str(data->vd_name,
sizeof(data->vd_name), mg->mg_vd->vdev_ops->vdev_op_type);
sizeof(data->vd_name),
mg->mg_vd->vdev_ops->vdev_op_type);
}

return (0);
Expand All @@ -97,44 +119,72 @@ metaslab_group_alloc_exit(struct pt_regs *ctx)
if (data == NULL || data->ts == 0)
return (0);

if (PT_REGS_RC(ctx) == -1ULL) {
if (PT_REGS_RC(ctx) == 0) {
axis = failure;
} else {
axis = success;
}

/*
* Guard against garbage in vd_name (DLPX-88427): a kernel bug on
* some engine versions causes raw memory bytes to appear here.
* A single non-printable byte anywhere in the string breaks JSON
* output. Scan all bytes up to the first NUL; replace the whole
* name with "unknown" if empty or any byte is outside printable
* ASCII (0x20-0x7e).
*/
bool vd_valid = (data->vd_name[0] != '\0');
#pragma unroll
for (int _i = 0; _i < VD_NAME_SIZE; _i++) {
char _c = data->vd_name[_i];
if (_c == '\0')
break;
if (_c < 0x20 || _c > 0x7e)
vd_valid = false;
}
if (!vd_valid) {
char unknown[] = "unknown";
__builtin_memcpy(data->vd_name, unknown, sizeof(unknown));
}

AGGREGATE_DATA(data->vd_name, axis,
bpf_ktime_get_ns() - data->ts, data->asize);

data->asize = 0;
data->alloc_time = 0;
/*
* Reset per-group fields so that if metaslab_alloc_dva_range retries
* with another group, metaslab_group_alloc_entry gets a clean slate.
* Leave the entry alive — metaslab_alloc_dva_range_exit owns cleanup.
*/
data->asize = 0;
data->vd_name[0] = '\0';

return (0);
}

// @@ kretprobe|metaslab_alloc_dva|metaslab_alloc_dva_exit
// @@ kretprobe|metaslab_alloc_dva_range|metaslab_alloc_dva_range_exit
int
metaslab_alloc_dva_exit(struct pt_regs *ctx,
spa_t *spa, metaslab_class_t *mc, uint64_t psize)
metaslab_alloc_dva_range_exit(struct pt_regs *ctx)
{
u32 tid = bpf_get_current_pid_tgid();
data_t *data = data_map.lookup(&tid);

if (data == NULL || data->ts == 0)
return (0);
Comment thread
dbshah12 marked this conversation as resolved.

if (PT_REGS_RC(ctx) == 0)
return (0);

char name[] = "allocation failures";
char axis = 0;
AGGREGATE_DATA(name, &axis,
bpf_ktime_get_ns() - data->ts, data->size);
/*
* Non-zero return means the overall DVA allocation failed (no group
* succeeded). Emit an "allocation failures" metric using the psize
* stored at entry time; the kretprobe argument registers may be
* clobbered so we read from data->size instead.
*/
if (PT_REGS_RC(ctx) != 0) {
char name[] = "allocation failures";
char axis = 0;
AGGREGATE_DATA(name, &axis,
bpf_ktime_get_ns() - data->ts, data->size);
}

data->ts = 0;
data->size = 0;

data_map.delete(&tid);

return (0);
Expand Down
Loading