Add BPF Workqueues support and example

- Introduced BPF workqueues to enable asynchronous work from BPF programs, allowing deferred processing, non-blocking operations, and sleepable contexts for long-running tasks.
- Added README.md to document the BPF workqueues, including use cases, technical architecture, and code examples.
- Created bpf_experimental.h header file to define necessary BPF workqueue functions and structures.
- Implemented a simple BPF workqueue example (wq_simple) demonstrating the initialization, scheduling, and execution of work in a separate context.
- Developed a userspace test (wq_simple.c) to verify the functionality of the BPF workqueue by triggering a syscall and checking the execution results.
This commit is contained in:
yunwei37
2025-10-04 22:49:09 -07:00
parent ba1a6a472e
commit b88ab2ae0e
11 changed files with 1755 additions and 0 deletions

12
src/features/bpf_iters/.gitignore vendored Normal file
View File

@@ -0,0 +1,12 @@
# Build artifacts
.output/
*.o
*.skel.h
# Generated binaries
task_stack
# Editor files
*.swp
*~
.vscode/

View File

@@ -0,0 +1,112 @@
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
# Directory that receives all intermediate build products
OUTPUT := .output
# BPF compiler; `?=` keeps a value supplied by the caller
CLANG ?= clang
# Vendored libbpf and bpftool source trees, relative to this directory
LIBBPF_SRC := $(abspath ../../third_party/libbpf/src)
BPFTOOL_SRC := $(abspath ../../third_party/bpftool/src)
# Static libbpf archive that the application binaries are linked against
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
# bpftool is built under $(OUTPUT); only its bootstrap binary is used here
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
# Translate `uname -m` into the architecture name used to pick the vmlinux.h
# directory and to define __TARGET_ARCH_<arch> for the BPF compile
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
| sed 's/arm.*/arm/' \
| sed 's/aarch64/arm64/' \
| sed 's/ppc64le/powerpc/' \
| sed 's/mips.*/mips/' \
| sed 's/riscv64/riscv/' \
| sed 's/loongarch64/loongarch/')
# Per-architecture vmlinux.h included by the BPF program
VMLINUX := ../../third_party/vmlinux/$(ARCH)/vmlinux.h
# Use our own libbpf API headers and Linux UAPI headers distributed with
# libbpf to avoid dependency on system-wide headers, which could be missing or
# outdated
INCLUDES := -I$(OUTPUT) -I../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX)) -I.
# Flags for compiling and linking the user-space program
CFLAGS := -g -Wall
# Linker flags: caller-provided LDFLAGS plus optional EXTRA_LDFLAGS
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
# Programs to build; the rules below expect <app>.c and <app>.bpf.c for each
APPS = task_stack
# Get Clang's default includes on this system. We'll explicitly add these dirs
# to the includes list when compiling with `-target bpf` because otherwise some
# architecture-specific dirs will be "missing" on some architectures/distros -
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
# sys/cdefs.h etc. might be missing.
#
# Use '-idirafter': Don't interfere with include mechanics except where the
# build would have failed anyways.
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
# Output verbosity. With `make V=1` both Q and msg are empty, so every recipe
# command is echoed by make. Otherwise Q silences the commands and msg prints
# a short "<ACTION> <target> <extra>" line, with the absolute $(OUTPUT) prefix
# stripped from the target name.
ifeq ($(V),1)
Q =
msg =
else
Q = @
msg = @printf ' %-8s %s%s\n' \
"$(1)" \
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
"$(if $(3), $(3))";
MAKEFLAGS += --no-print-directory
endif
# allow-override(VAR, value): assign `value` to VAR unless VAR's origin is the
# environment or the command line, in which case the caller's value is kept.
define allow-override
$(if $(or $(findstring environment,$(origin $(1))),\
$(findstring command line,$(origin $(1)))),,\
$(eval $(1) = $(2)))
endef
# Default CC and LD to the $(CROSS_COMPILE)-prefixed tools
$(call allow-override,CC,$(CROSS_COMPILE)cc)
$(call allow-override,LD,$(CROSS_COMPILE)ld)
# Default goal: build every program listed in APPS
.PHONY: all
all: $(APPS)
# Remove the build directory and the generated application binaries
.PHONY: clean
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) $(APPS)
# Create the output directories on demand; other rules list them as
# order-only prerequisites
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
$(Q)mkdir -p $@
# Build libbpf as a static-only library from the vendored sources and run its
# `install` target with DESTDIR set to the output directory. INCLUDEDIR, LIBDIR
# and UAPIDIR are emptied, presumably so that headers and libbpf.a land
# directly under $(OUTPUT) - confirm against the libbpf Makefile.
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
$(call msg,LIB,$@)
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
INCLUDEDIR= LIBDIR= UAPIDIR= \
install
# Build the bootstrap bpftool from the vendored sources. ARCH and CROSS_COMPILE
# are cleared for the sub-make (presumably so the tool is built for the build
# host even when cross-compiling - confirm).
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
$(call msg,BPFTOOL,$@)
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
# Build BPF code: compile each %.bpf.c for the bpf target into a temporary
# %.tmp.bpf.o, then run `bpftool gen object` on it to produce the final
# %.bpf.o that the skeleton is generated from.
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
$(call msg,BPF,$@)
$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
# Generate BPF skeletons: one %.skel.h header per final BPF object
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
$(Q)$(BPFTOOL) gen skeleton $< > $@
# Build user-space code. Each application's object additionally depends on its
# generated skeleton header, so the BPF side is always built first.
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
$(call msg,CC,$@)
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
# Build application binary: link the object with the static libbpf archive
# plus the system libelf and zlib
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
$(call msg,BINARY,$@)
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
# delete failed targets
.DELETE_ON_ERROR:
# keep intermediate (.skel.h, .bpf.o, etc) targets
.SECONDARY:

View File

@@ -0,0 +1,205 @@
# BPF Iterators Tutorial
## What are BPF Iterators?
BPF iterators allow you to iterate over kernel data structures and export formatted data to userspace via `seq_file`. They're a modern replacement for traditional `/proc` files with **programmable, filterable, in-kernel data processing**.
## Real-World Example: Task Stack Iterator
### The Problem with Traditional Approach
**Traditional method** (using `/proc` or system tools):
```bash
# Show all process stack traces
cat /proc/*/stack
```
**Problems:**
1. **No filtering** - Must read ALL processes, parse in userspace
2. **Fixed format** - Cannot customize output
3. **High overhead** - Context switches, string formatting, massive output
4. **Post-processing** - All filtering/aggregation in userspace
5. **Inflexible** - Want different fields? Modify kernel!
### BPF Iterator Solution
**Our implementation** (`task_stack.bpf.c`):
```bash
# Show only systemd tasks with kernel stack traces
sudo ./task_stack systemd
```
**Benefits:**
1. **In-kernel filtering** - Only selected processes sent to userspace
2. **Custom format** - Choose exactly what fields to show
3. **Low overhead** - Filter before copying to userspace
4. **Programmable** - Add statistics, calculations, aggregations
5. **Dynamic** - Load different filters without kernel changes
### Performance Comparison
| Operation | Traditional `/proc` | BPF Iterator |
|-----------|-------------------|--------------|
| Read all stacks | Parse 1000+ files | Single read() call |
| Filter by name | Userspace loop | In-kernel filter |
| Data transfer | MB of text | KB of relevant data |
| CPU usage | High (parsing) | Low (pre-filtered) |
| Customization | Recompile kernel | Load new BPF program |
## Example Output
```
$ sudo ./task_stack systemd
Filtering for tasks matching: systemd
=== BPF Task Stack Iterator ===
=== Task: systemd (pid=1, tgid=1) ===
Stack depth: 6 frames
[ 0] ep_poll+0x447/0x460
[ 1] do_epoll_wait+0xc3/0xe0
[ 2] __x64_sys_epoll_wait+0x6d/0x110
[ 3] x64_sys_call+0x19b1/0x2310
[ 4] do_syscall_64+0x7e/0x170
[ 5] entry_SYSCALL_64_after_hwframe+0x76/0x7e
=== Summary: 2 task stacks shown ===
```
## How It Works
### 1. BPF Program (`task_stack.bpf.c`)
```c
SEC("iter/task")
int dump_task_stack(struct bpf_iter__task *ctx)
{
struct task_struct *task = ctx->task;
// In-kernel filtering by task name
if (target_comm[0] != '\0' && !match_name(task->comm))
return 0; // Skip this task
// Get kernel stack trace
bpf_get_task_stack(task, entries, MAX_DEPTH * SIZE_OF_ULONG, 0);
// Format and output to seq_file
BPF_SEQ_PRINTF(seq, "Task: %s (pid=%u)\n", task->comm, task->pid);
return 0;
}
```
### 2. Userspace Program (`task_stack.c`)
```c
// Attach iterator
link = bpf_program__attach_iter(skel->progs.dump_task_stack, NULL);
// Create iterator instance
iter_fd = bpf_iter_create(bpf_link__fd(link));
// Read output
while ((len = read(iter_fd, buf, sizeof(buf) - 1)) > 0) {
buf[len] = '\0'; // read() does not NUL-terminate the data
printf("%s", buf);
}
```
## Available Iterator Types
The kernel provides many iterator types:
### System Iterators
- `iter/task` - Iterate all tasks/processes
- `iter/ksym` - Kernel symbols (like `/proc/kallsyms`)
- `iter/bpf_map` - All BPF maps in system
- `iter/bpf_link` - All BPF links
### Network Iterators
- `iter/tcp` - TCP sockets (replaces `/proc/net/tcp`)
- `iter/udp` - UDP sockets
- `iter/unix` - Unix domain sockets
- `iter/netlink` - Netlink sockets
### Map Iterators
- `iter/bpf_map_elem` - Iterate map elements
- `iter/sockmap` - Socket map entries
### Task/Process Iterators
- `iter/task_file` - Task file descriptors (like `/proc/PID/fd`)
- `iter/task_vma` - Task memory mappings (like `/proc/PID/maps`)
## Use Cases
### 1. Performance Monitoring
- Track high-latency network connections
- Monitor stuck processes (long-running syscalls)
- Identify memory-hungry tasks
### 2. Debugging
- Capture stack traces of specific processes
- Dump kernel state for analysis
- Trace system calls in real-time
### 3. Security
- Monitor process creation patterns
- Track network connection attempts
- Audit file access patterns
### 4. Custom `/proc` Replacements
- Create application-specific views
- Filter and aggregate kernel data
- Reduce userspace processing overhead
## Building and Running
```bash
# Build
cd src/features/bpf_iters   # from the repository root
make
# Run - show all tasks
sudo ./task_stack
# Run - filter by task name
sudo ./task_stack systemd
sudo ./task_stack bash
```
## Key Differences: Iterator Types
### Kernel Iterators (`SEC("iter/...")`)
- **Purpose**: Export kernel data to userspace
- **Output**: seq_file (readable via read())
- **Activation**: Attach, create instance, read FD
- **Example**: Task stacks, TCP sockets, kernel symbols
### Open-Coded Iterators (`bpf_for`, `bpf_iter_num`)
- **Purpose**: Loop constructs within BPF programs
- **Output**: Internal program variables
- **Activation**: Execute during program run
- **Example**: Sum numbers, count elements, iterate arrays
## Advantages Over Traditional Approaches
| Feature | Traditional `/proc` | BPF Iterators |
|---------|-------------------|---------------|
| **Filtering** | Userspace only | In-kernel |
| **Performance** | High overhead | Minimal overhead |
| **Customization** | Kernel rebuild | Load BPF program |
| **Format** | Fixed | Fully programmable |
| **Statistics** | Userspace calc | In-kernel aggregation |
| **Security** | No filtering | LSM hooks available |
| **Deployment** | Static | Dynamic (load anytime) |
## Summary
BPF iterators are **game-changing** for system observability:
1. **Performance**: Filter in kernel, only send relevant data
2. **Flexibility**: Load different programs for different views
3. **Power**: Access raw kernel structures with type safety (BTF)
4. **Safety**: Verified by BPF verifier, can't crash kernel
5. **Portability**: CO-RE ensures binary works across kernel versions
They enable creating **custom, high-performance system monitoring tools** without modifying the kernel!

View File

@@ -0,0 +1,118 @@
// SPDX-License-Identifier: GPL-2.0
/* Kernel task stack and file descriptor iterator */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
/* License string placed in the "license" section of the BPF object */
char _license[] SEC("license") = "GPL";
/* Maximum number of stack entries captured for one task */
#define MAX_STACK_TRACE_DEPTH 64
/* Buffer that dump_task_stack passes to bpf_get_task_stack() and then prints */
unsigned long entries[MAX_STACK_TRACE_DEPTH] = {};
#define SIZE_OF_ULONG (sizeof(unsigned long))
/*
 * Filter: only show output for tasks with this name (empty = show all).
 * Used by both iterators below; the userspace loader writes it through
 * skel->bss before the object is loaded.
 */
char target_comm[16] = "";
/* Number of task stacks printed by dump_task_stack; reported in its summary */
__u32 stacks_shown = 0;
/* Number of file descriptors printed by dump_task_file; reported in its summary */
__u32 files_shown = 0;
/*
 * Task stack iterator.
 *
 * Called with one task per invocation; a NULL task marks the end of the
 * iteration and triggers the summary line. For each task that passes the
 * optional target_comm filter, the kernel stack is captured into entries[]
 * and written to the iterator's seq_file as a header line followed by one
 * line per captured frame. Always returns 0.
 */
SEC("iter/task")
int dump_task_stack(struct bpf_iter__task *ctx)
{
struct seq_file *seq = ctx->meta->seq;
struct task_struct *task = ctx->task;
long i, retlen;
int match = 1;
if (task == (void *)0) {
/* End of iteration - print summary */
if (stacks_shown > 0) {
BPF_SEQ_PRINTF(seq, "\n=== Summary: %u task stacks shown ===\n",
stacks_shown);
}
return 0;
}
/* Filter by task name if specified */
if (target_comm[0] != '\0') {
match = 0;
/*
 * Exact comparison: stop at the first differing byte and accept only
 * when both names reach their NUL terminator at the same index.
 * NOTE(review): 16 equals sizeof(target_comm); presumably it is also the
 * size of task->comm - confirm against vmlinux.h.
 */
for (i = 0; i < 16; i++) {
if (task->comm[i] != target_comm[i])
break;
if (task->comm[i] == '\0') {
match = 1;
break;
}
}
if (!match)
return 0;
}
/* Get kernel stack trace for this task */
retlen = bpf_get_task_stack(task, entries,
MAX_STACK_TRACE_DEPTH * SIZE_OF_ULONG, 0);
/* A negative result is treated as failure: the task is skipped silently */
if (retlen < 0)
return 0;
stacks_shown++;
/* Print task info and stack trace */
BPF_SEQ_PRINTF(seq, "=== Task: %s (pid=%u, tgid=%u) ===\n",
task->comm, task->pid, task->tgid);
/* retlen is used as a byte count, so dividing by the entry size gives frames */
BPF_SEQ_PRINTF(seq, "Stack depth: %u frames\n", retlen / SIZE_OF_ULONG);
/*
 * Walk the whole fixed-size buffer and print only the slots that lie
 * within the retlen bytes reported for this task.
 */
for (i = 0; i < MAX_STACK_TRACE_DEPTH; i++) {
if (retlen > i * SIZE_OF_ULONG)
BPF_SEQ_PRINTF(seq, " [%2ld] %pB\n", i, (void *)entries[i]);
}
BPF_SEQ_PRINTF(seq, "\n");
return 0;
}
/*
 * Task file descriptor iterator.
 *
 * Called with one (task, fd, file) triple per invocation. When either the
 * task or the file pointer is NULL (presumably the end-of-iteration call),
 * a summary line is printed instead. For each file whose owning task passes
 * the optional target_comm filter, one row with the task's comm, tgid, pid,
 * the fd number and the address of the file's f_op table is written to the
 * iterator's seq_file. Always returns 0.
 */
SEC("iter/task_file")
int dump_task_file(struct bpf_iter__task_file *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;
	struct file *file = ctx->file;
	__u32 fd = ctx->fd;
	long i;
	int match = 1;

	if (!task || !file) {
		/* Nothing to report for this call - emit the summary, if any */
		if (files_shown > 0 && ctx->meta->seq_num > 0) {
			BPF_SEQ_PRINTF(seq, "\n=== Summary: %u file descriptors shown ===\n",
				       files_shown);
		}
		return 0;
	}

	/* Filter by task name if specified */
	if (target_comm[0] != '\0') {
		match = 0;
		/*
		 * Exact comparison: stop at the first differing byte and
		 * accept only when both names end at the same index. The
		 * bound is a compile-time constant on purpose.
		 */
		for (i = 0; i < (long)sizeof(target_comm); i++) {
			if (task->comm[i] != target_comm[i])
				break;
			if (task->comm[i] == '\0') {
				match = 1;
				break;
			}
		}
		if (!match)
			return 0;
	}

	/*
	 * Print the column header right before the first row that is actually
	 * shown. Keying this on seq_num == 0 would lose the header whenever
	 * the first visited file belongs to a task rejected by the filter.
	 */
	if (files_shown == 0) {
		BPF_SEQ_PRINTF(seq, "%-16s %8s %8s %6s %s\n",
			       "COMM", "TGID", "PID", "FD", "FILE_OPS");
	}

	files_shown++;
	BPF_SEQ_PRINTF(seq, "%-16s %8d %8d %6d 0x%lx\n",
		       task->comm, task->tgid, task->pid, fd,
		       (long)file->f_op);
	return 0;
}

View File

@@ -0,0 +1,93 @@
// SPDX-License-Identifier: GPL-2.0
/* Userspace program for task stack and file iterator */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include <bpf/bpf.h>
#include "task_stack.skel.h"
/*
 * libbpf print callback: forwards every message to stderr, whatever its
 * level, and returns vfprintf()'s result.
 */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	int written;

	(void)level;
	written = vfprintf(stderr, format, args);
	return written;
}
/*
 * Attach @prog as a BPF iterator, create one iterator instance and copy
 * everything read from it to stdout.
 *
 * @name: iterator name, used only in diagnostics
 * @prog: iterator program to attach
 *
 * Failures are reported on stderr; nothing is returned to the caller. The
 * link and the iterator fd are released on every path.
 */
static void run_iterator(const char *name, struct bpf_program *prog)
{
	struct bpf_link *link;
	char buf[8192];
	ssize_t len;
	int iter_fd;

	link = bpf_program__attach_iter(prog, NULL);
	if (!link) {
		fprintf(stderr, "Failed to attach %s iterator\n", name);
		return;
	}

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0) {
		fprintf(stderr, "Failed to create %s iterator: %d\n", name, iter_fd);
		goto out_link;
	}

	for (;;) {
		len = read(iter_fd, buf, sizeof(buf));
		if (len > 0) {
			/* Write the exact byte count; no NUL terminator needed */
			fwrite(buf, 1, (size_t)len, stdout);
			continue;
		}
		if (len == 0)
			break; /* end of iterator output */
		/*
		 * Transient conditions: retry instead of silently truncating
		 * the output. NOTE(review): EAGAIN is presumably what the
		 * kernel returns after visiting many objects without
		 * producing output - confirm against the bpf_iter docs.
		 */
		if (errno == EINTR || errno == EAGAIN)
			continue;
		fprintf(stderr, "Failed to read %s iterator: %s\n", name, strerror(errno));
		break;
	}

	close(iter_fd);
out_link:
	bpf_link__destroy(link);
}
/*
 * Entry point: task_stack [--files] [comm]
 *
 * Opens the BPF skeleton, optionally stores a task-name filter in the BPF
 * object's target_comm variable before loading, loads the object and runs
 * either the task stack iterator (default) or the task file iterator
 * (--files). Returns 0 on success and 1 if the skeleton cannot be opened or
 * loaded.
 */
int main(int argc, char **argv)
{
	/* Remember the program name before "--files" is shifted off argv */
	const char *prog_name = argv[0];
	struct task_stack_bpf *skel;
	int show_files = 0;
	int err;

	libbpf_set_print(libbpf_print_fn);

	/* Parse arguments */
	if (argc > 1 && strcmp(argv[1], "--files") == 0) {
		show_files = 1;
		argc--;
		argv++;
	}

	/* Open BPF application */
	skel = task_stack_bpf__open();
	if (!skel) {
		fprintf(stderr, "Failed to open BPF skeleton\n");
		return 1;
	}

	/* Configure filter before loading */
	if (argc > 1) {
		/* snprintf always NUL-terminates and truncates over-long names */
		snprintf(skel->bss->target_comm, sizeof(skel->bss->target_comm),
			 "%s", argv[1]);
		printf("Filtering for tasks matching: %s\n\n", skel->bss->target_comm);
	} else {
		printf("Usage: %s [--files] [comm]\n", prog_name);
		printf(" --files Show open file descriptors instead of stacks\n");
		printf(" comm Filter by process name\n\n");
	}

	/* Load BPF program */
	err = task_stack_bpf__load(skel);
	if (err) {
		fprintf(stderr, "Failed to load BPF skeleton: %d\n", err);
		goto cleanup;
	}

	if (show_files) {
		printf("=== BPF Task File Descriptor Iterator ===\n\n");
		run_iterator("task_file", skel->progs.dump_task_file);
	} else {
		printf("=== BPF Task Stack Iterator ===\n\n");
		run_iterator("task", skel->progs.dump_task_stack);
	}

cleanup:
	task_stack_bpf__destroy(skel);
	/* err is a negative libbpf error code; map it to a conventional status */
	return err ? 1 : 0;
}