
Commit 6e4484a

ringbuf prototype for emitting sampled syscall completion records with latency numbers
1 parent e8d3165 commit 6e4484a

File tree: 3 files changed, +178 -34 lines changed

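
Before the per-file hunks, here is a condensed sketch of the userspace half of the pattern this commit wires up: a libbpf ring buffer consumer that drains the new completion_events map and prints the measured syscall latency. It is an illustration assembled from the hunks below, not the committed code; the helper names on_completion and consume_completions exist only in this sketch, and the actual version in next/src/xcapture.c does the same work inline in main(), polling the ring buffer alongside the task-iterator sampling loop.

#include <stdio.h>
#include <bpf/libbpf.h>
#include "xcapture.h"       /* struct sc_completion_event (added in this commit) */
#include "xcapture.skel.h"  /* generated skeleton, provides skel->maps.completion_events */

/* ringbuf callback: one sampled syscall completion per record */
static int on_completion(void *ctx, void *data, size_t len)
{
    const struct sc_completion_event *e = data;
    __u64 latency_us = (e->completed_sc_exit_time - e->completed_sc_enter_time) / 1000;

    printf("SC_END tid=%d tgid=%d syscall=%d seq=%llu latency_us=%llu\n",
           e->pid, e->tgid, e->completed_syscall_nr,
           e->completed_sc_sequence_nr, latency_us);
    return 0;
}

/* set up the consumer once, then poll it from the sampling loop */
static int consume_completions(struct xcapture_bpf *skel)
{
    struct ring_buffer *rb = ring_buffer__new(bpf_map__fd(skel->maps.completion_events),
                                              on_completion, NULL, NULL);
    if (!rb)
        return -1;

    int err = ring_buffer__poll(rb, 100 /* timeout, ms */);

    ring_buffer__free(rb);
    return err < 0 ? err : 0;
}
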

next/src/xcapture.bpf.c (+85 -23)

@@ -10,11 +10,11 @@
 char LICENSE[] SEC("license") = "Dual BSD/GPL";
 char VERSION[] = "3.0.0";
 
-// ? figure this out without including kernel .h files (platform and kconfig-dependent)
+// figure this out without including kernel .h files (platform and kconfig-dependent)
 #define PAGE_SIZE 4096
 #define THREAD_SIZE (PAGE_SIZE << 2)
 
-// for map_get lookups
+// for map_get lookups & empty map creations
 static __u32 zero = 0;
 
 
@@ -44,20 +44,14 @@ struct {
 } task_storage SEC(".maps");
 
 
-// static struct task_storage *get_task_storage(struct task_struct *task)
-// {
-//     struct task_storage *storage;
-//
-//     storage = bpf_task_storage_get(&task_storage, task, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
-//     bpf_printk("bpf_task_storage_get = %llx \n", storage);
-//
-//     if (!storage)
-//         return NULL;
-//
-//     return storage;
-// }
+// ringbuf for event completion events
+struct {
+    __uint(type, BPF_MAP_TYPE_RINGBUF);
+    __uint(max_entries, 256 * 1024);
+} completion_events SEC(".maps");
 
 
+// kernel version-adaptive task state retrieval
 static __u32 get_task_state(void *arg)
 {
     if (bpf_core_field_exists(struct task_struct___pre514, state)) {
@@ -70,6 +64,17 @@ static __u32 get_task_state(void *arg)
 }
 
 
+// xcapture start ktime for syscall duration sanitization later on
+__u64 program_start_time = 0;
+
+SEC("tp_btf/sys_enter")
+int handle_init(void *ctx)
+{
+    if (program_start_time == 0)
+        program_start_time = bpf_ktime_get_ns();
+    return 0;
+}
+
 
 SEC("raw_tracepoint/sys_enter")
 int handle_sys_enter(struct bpf_raw_tracepoint_args *ctx)
@@ -78,16 +83,17 @@ int handle_sys_enter(struct bpf_raw_tracepoint_args *ctx)
 
     storage = bpf_task_storage_get(&task_storage, bpf_get_current_task_btf(), 0,
                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
-
     if (!storage)
         return 0;
 
     storage->sc_enter_time = bpf_ktime_get_ns();
+    storage->sc_sampled = false; // new syscall, can't have been seen yet by sampler
     storage->sc_sequence_num += 1;
     storage->in_syscall_nr = (s32)ctx->args[1]; // syscall nr
     return 0;
 }
 
+
 SEC("raw_tracepoint/sys_exit")
 int handle_sys_exit(struct bpf_raw_tracepoint_args *ctx)
 {
@@ -97,9 +103,36 @@ int handle_sys_exit(struct bpf_raw_tracepoint_args *ctx)
                                    BPF_LOCAL_STORAGE_GET_F_CREATE);
     if (!storage)
         return 0;
-
+
+    // Only emit completion events for syscalls that were caught in a sample
+    if (storage->sc_sampled) {
+        struct sc_completion_event *event;
+
+        // Reserve space in the ringbuf
+        event = bpf_ringbuf_reserve(&completion_events, sizeof(*event), 0);
+        if (!event)
+            goto cleanup; // ringbuf full, skip this event
+
+        // Get current task info for pid/tgid
+        struct task_struct *task = bpf_get_current_task_btf();
+        event->pid = BPF_CORE_READ(task, pid);
+        event->tgid = BPF_CORE_READ(task, tgid);
+
+        // Fill in syscall completion details
+        event->completed_syscall_nr = storage->in_syscall_nr;
+        event->completed_sc_sequence_nr = storage->sc_sequence_num;
+        event->completed_sc_enter_time = storage->sc_enter_time;
+        event->completed_sc_exit_time = bpf_ktime_get_ns();
+
+        // Submit the event
+        bpf_ringbuf_submit(event, 0);
+    }
+
+cleanup:
+    // Reset syscall tracking state
     storage->in_syscall_nr = -1;
     storage->sc_enter_time = 0;
+
     return 0;
 }
 
@@ -120,7 +153,7 @@ int get_tasks(struct bpf_iter__task *ctx)
     __u32 task_state = get_task_state(task);
     __u32 task_flags = task->flags;
 
-    // idle kernel worker thread waiting for work or other kernel threads in S state
+    // idle kernel worker thread waiting for work or other kernel threads in S state (also, they make no syscalls)
     if ((task_state & TASK_NOLOAD) || ((task_flags & PF_KTHREAD) && (task_state & TASK_INTERRUPTIBLE)))
         return 0;
 
@@ -197,7 +230,8 @@ int get_tasks(struct bpf_iter__task *ctx)
             bpf_probe_read_kernel(&file, sizeof(file), &fd_array[t->syscall_args[0]]);
 
             if (file) {
-                // bpf_d_path() doesn't work in a non-tracing task iterator program context?
+                // bpf_d_path() doesn't work in a simple task iterator program context (?)
+                // task_vma, task_files should work since kernel #3d06f34 (bpf: Allow bpf_d_path in bpf_iter program)
                 struct path file_path;
                 bpf_probe_read_kernel(&file_path, sizeof(file_path), &file->f_path);
                 struct dentry *dentry = BPF_CORE_READ(file, f_path.dentry);
@@ -212,22 +246,50 @@ int get_tasks(struct bpf_iter__task *ctx)
     ret = bpf_get_task_stack(task, t->kstack, sizeof(__u64) * MAX_STACK_LEN, 0);
     t->kstack_len = ret <= 0 ? ret : ret / sizeof(t->kstack[0]);
 
-    // read task storage map experiment
+
+    // put all this into a task_storage map for emitting to userspace
     struct task_storage *storage;
 
     storage = bpf_task_storage_get(&task_storage, task, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
 
     if (!storage) // for the verifier
         return 0;
 
-    t->storage.in_syscall_nr = storage->in_syscall_nr;
-    t->storage.sample_ktime = sample_ktime;
-    t->storage.sc_enter_time = storage->sc_enter_time;
-    t->storage.sc_sequence_num = storage->sc_sequence_num;
+    // use "storage" here not "t->storage" (I need to come up with better naming of things)
+    storage->sc_sampled = true; // used for deciding when to emit syscall completion records
+
+    // if this is the first time we see this task and it's already in a syscall,
+    // set its sc_enter_time to program start time instead of 0
+    if (storage->sc_enter_time == 0 && storage->in_syscall_nr >= 0) {
+        storage->sc_enter_time = program_start_time;
+    }
+
+    // "t" is the final struct for emitting everything to userspace
+    t->storage.in_syscall_nr = storage->in_syscall_nr;
+    t->storage.sample_ktime = sample_ktime;
+    t->storage.sc_enter_time = storage->sc_enter_time;
+    t->storage.sc_sequence_num = storage->sc_sequence_num;
 
     bpf_seq_write(seq, t, sizeof(struct task_info));
     return 0;
 }
 
 
 
+// maybe for refactoring later
+
+// static struct task_storage *get_task_storage(struct task_struct *task)
+// {
+//     struct task_storage *storage;
+//
+//     storage = bpf_task_storage_get(&task_storage, task, NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
+//     bpf_printk("bpf_task_storage_get = %llx \n", storage);
+//
+//     if (!storage)
+//         return NULL;
+//
+//     return storage;
+// }
+
+
+

next/src/xcapture.c (+72 -10)

@@ -14,6 +14,9 @@
 #include "xcapture.skel.h"
 #include <syscall_names.h>
 
+// global, set once during startup arg checking
+bool output_csv = false;
+
 
 // handle CTRL+C and sigpipe etc
 static volatile bool exiting = false;
@@ -72,8 +75,9 @@ static const char *safe_syscall_name(__s32 syscall_nr)
 
 
 // ns is signed as sometimes there's small negative durations reported due to
-// concurrency of BPF task iterator vs event capture probes running on different CPUs
-struct timespec subtract_ns_from_timespec(struct timespec ts, __s64 ns) {
+// concurrency of BPF task iterator vs event capture probes running on different CPUs (?)
+struct timespec subtract_ns_from_timespec(struct timespec ts, __s64 ns)
+{
     struct timespec result = ts;
 
     if (result.tv_nsec < (long)(ns % 1000000000)) {
@@ -87,12 +91,44 @@ struct timespec subtract_ns_from_timespec(struct timespec ts, __s64 ns) {
     return result;
 }
 
+// handle ringbuf completion events, don't bother trying to format
+// wallclock time as these are ktime timestamps
+static int handle_event(void *ctx, void *data, size_t data_sz)
+{
+    const struct sc_completion_event *e = data;
+
+    // Calculate duration in microseconds
+    __u64 duration_us = (e->completed_sc_exit_time - e->completed_sc_enter_time) / 1000;
+
+    if (output_csv) {
+        printf("SC_END,%d,%d,%d,%llu,%llu,%llu,%llu\n",
+            e->pid,
+            e->tgid,
+            e->completed_syscall_nr,
+            e->completed_sc_sequence_nr,
+            e->completed_sc_enter_time,
+            e->completed_sc_exit_time,
+            duration_us);
+    } else {
+        printf("SC_END %7d %7d %-20s %12llu %26llu %26llu %'16llu\n",
+            e->pid,
+            e->tgid,
+            safe_syscall_name(e->completed_syscall_nr),
+            e->completed_sc_sequence_nr,
+            e->completed_sc_enter_time,
+            e->completed_sc_exit_time,
+            duration_us);
+    }
+
+    return 0;
+}
+
+
 
 int main(int argc, char **argv)
 {
     // super simple check to avoid proper argument handling for now
     // if any args are given to xcapture, output CSV
-    bool output_csv = false;
     if (argc > 1)
         output_csv = true;
 
@@ -124,6 +160,16 @@ int main(int argc, char **argv)
         goto cleanup;
     }
 
+
+    /* Set up ring buffer polling */
+    struct ring_buffer *rb = NULL;
+    rb = ring_buffer__new(bpf_map__fd(skel->maps.completion_events), handle_event, NULL, NULL);
+    if (!rb) {
+        fprintf(stderr, "Failed to create ring buffer\n");
+        goto cleanup;
+    }
+
+
     struct timespec sample_ts; // sample timestamp
     char timestamp[64];
 
@@ -143,13 +189,13 @@ int main(int argc, char **argv)
         // Print output (kernel pid printed as TID in userspace and kernel tgid as PID)
         if (output_csv) {
             if (!header_printed) {
-                printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
-                    "TIMESTAMP", "TID", "TGID", "STATE", "USER", "EXE", "COMM",
+                printf("%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
+                    "TYPE", "TIMESTAMP", "TID", "TGID", "STATE", "USER", "EXE", "COMM",
                     "SYSCALL_PASSIVE", "SYSCALL_ACTIVE", "SC_ENTRY_TIME", "SC_US_SO_FAR", "SC_SEQ_NUM", "ARG0", "FILENAME");
             }
         } else {
-            printf("%-26s %7s %7s %-6s %-16s %-20s %-16s %-20s %-20s %-26s %16s %12s %-16s %s\n",
-                "TIMESTAMP", "TID", "TGID", "STATE", "USER", "EXE", "COMM",
+            printf("%-6s %-26s %7s %7s %-6s %-16s %-20s %-16s %-20s %-20s %-26s %16s %12s %-16s %s\n",
+                "TYPE", "TIMESTAMP", "TID", "TGID", "STATE", "USER", "EXE", "COMM",
                 "SYSCALL_PASSIVE", "SYSCALL_ACTIVE", "SC_ENTRY_TIME", "SC_US_SO_FAR", "SC_SEQ_NUM", "ARG0", "FILENAME");
         }
 
@@ -174,6 +220,7 @@ int main(int argc, char **argv)
             if (ret == 0)
                 break;
 
+
             __s64 duration_ns = 0; // event duration so far, from its start to sampling point
             if (buf.storage.sc_enter_time)
                 duration_ns = buf.storage.sample_ktime - buf.storage.sc_enter_time;
@@ -188,7 +235,8 @@ int main(int argc, char **argv)
             snprintf(sc_start_time_str + 19, sizeof(sc_start_time_str) - 19, ".%06ld", sc_start_timespec.tv_nsec / 1000);
 
             if (output_csv) {
-                printf("%s,%d,%d,%s,\"%s\",\"%s\",\"%s\",%s,%s,%s,%lld,%lld,%llx,\"%s\"\n",
+                printf("%s,%s,%d,%d,%s,\"%s\",\"%s\",\"%s\",%s,%s,%s,%lld,%lld,%llx,\"%s\"\n",
+                    "SAMPLE",
                     timestamp,
                     buf.pid,
                     buf.tgid,
@@ -208,7 +256,8 @@ int main(int argc, char **argv)
                 );
             }
             else {
-                printf("%-26s %7d %7d %-6s %-16s %-20s %-16s %-20s %-20s %-26s %'16lld %12lld %-16llx %s\n",
+                printf("%-6s %-26s %7d %7d %-6s %-16s %-20s %-16s %-20s %-20s %-26s %'16lld %12lld %-16llx %s\n",
+                    "SAMPLE",
                     timestamp,
                     buf.pid,
                     buf.tgid,
@@ -227,7 +276,14 @@ int main(int argc, char **argv)
                    buf.filename[0] ? buf.filename : "-"
                );
            }
+        }
 
+
+        /* Ring buffer poll for event completions */
+        err = ring_buffer__poll(rb, 100 /* timeout, ms */);
+        if (err < 0) {
+            fprintf(stderr, "Error polling ring buffer: %d\n", err);
+            goto cleanup;
         }
 
 
@@ -249,7 +305,13 @@ int main(int argc, char **argv)
 cleanup:
     /* Clean up */
     fflush(stdout);
-    close(iter_fd); // the fd might already be closed above, but this would just return an EBADF then
+    close(iter_fd); /* the fd might already be closed above, but this would just return an EBADF then */
+
+    /* Release event completion ring buffer */
+    if (rb) {
+        ring_buffer__free(rb);
+    }
+
     xcapture_bpf__destroy(skel);
 
     return err < 0 ? -err : 0;

next/src/xcapture.h (+21 -1)

@@ -34,19 +34,39 @@
 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
 
 
+// global xcapture start time (for syscall duration sanitization later on)
+extern __u64 program_start_time;
+
 // to be used with BPF_MAP_TYPE_TASK_STORAGE
 struct task_storage {
     __u64 sample_ktime;
+
     __s32 in_syscall_nr;
     __u64 sc_enter_time;
     __u64 sc_sequence_num; // any syscall entry in a task will increment this single counter
 
+    bool sc_sampled:1; // task iterator will set the following fields only if it catches a task
+                       // in syscall during its sampling (later used for event completion records)
+
+    // block I/O latency sampling (not implemented yet)
     __u64 bio_queue_ktime;
     __u32 bio_dev;
     __u64 bio_sector;
 };
 
-// use kernel nomenclature in kernel side eBPF code (pid,tgid)
+
+// to be emitted via ringbuf only on *sampled* event completions, for getting the correct measured event times
+// so we are not going to emit millions of events per sec here (just up to num active threads * sampling rate)
+struct sc_completion_event {
+    pid_t pid; // pid, tgid, completed_sc_sequence_nr, completed_sc_enter_time are the unique identifier here
+    pid_t tgid;
+    __u64 completed_sc_sequence_nr;
+    __u64 completed_sc_enter_time;
+    __u64 completed_sc_exit_time;
+    __s32 completed_syscall_nr;
+};
+
+
+// use kernel nomenclature in kernel side eBPF code (pid=thread_id, tgid=process_id)
 struct task_info {
     struct task_struct * addr; // task kernel address for task storage lookups elsewhere
     pid_t pid; // task id (tid in userspace)
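
The header comment above calls out (pid, tgid, completed_sc_sequence_nr, completed_sc_enter_time) as the unique identifier of a completion record. A minimal sketch of how a consumer could use that, assuming the task_info / task_storage layout from this header; the helper names below are hypothetical and not part of the commit:

#include <stdbool.h>
#include <linux/types.h>  /* __u64, in case xcapture.h does not pull it in */
#include "xcapture.h"     /* struct task_info, struct sc_completion_event */

/* Hypothetical post-processing helper: does this SC_END record complete
 * the syscall that was still in flight when this SAMPLE row was taken? */
static bool completes_sampled_syscall(const struct task_info *sample,
                                      const struct sc_completion_event *e)
{
    return sample->pid == e->pid &&
           sample->tgid == e->tgid &&
           sample->storage.sc_sequence_num == e->completed_sc_sequence_nr &&
           sample->storage.sc_enter_time == e->completed_sc_enter_time;
}

/* Measured latency of a completed, sampled syscall in microseconds */
static __u64 measured_latency_us(const struct sc_completion_event *e)
{
    return (e->completed_sc_exit_time - e->completed_sc_enter_time) / 1000;
}

With that pairing, a SAMPLE row whose syscall was still in flight can later be upgraded from its "duration so far" estimate to the measured latency once the matching SC_END record arrives.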
