mirror of
https://github.com/eunomia-bpf/bpf-developer-tutorial.git
synced 2026-04-03 02:29:06 +08:00
Add BPF Workqueues support and example
- Introduced BPF workqueues to enable asynchronous work from BPF programs, allowing deferred processing, non-blocking operations, and sleepable contexts for long-running tasks. - Added README.md to document the BPF workqueues, including use cases, technical architecture, and code examples. - Created bpf_experimental.h header file to define necessary BPF workqueue functions and structures. - Implemented a simple BPF workqueue example (wq_simple) demonstrating the initialization, scheduling, and execution of work in a separate context. - Developed a userspace test (wq_simple.c) to verify the functionality of the BPF workqueue by triggering a syscall and checking the execution results.
This commit is contained in:
12
src/features/bpf_iters/.gitignore
vendored
Normal file
12
src/features/bpf_iters/.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Build artifacts
|
||||||
|
.output/
|
||||||
|
*.o
|
||||||
|
*.skel.h
|
||||||
|
|
||||||
|
# Generated binaries
|
||||||
|
task_stack
|
||||||
|
|
||||||
|
# Editor files
|
||||||
|
*.swp
|
||||||
|
*~
|
||||||
|
.vscode/
|
||||||
112
src/features/bpf_iters/Makefile
Normal file
112
src/features/bpf_iters/Makefile
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||||
|
OUTPUT := .output
|
||||||
|
CLANG ?= clang
|
||||||
|
LIBBPF_SRC := $(abspath ../../third_party/libbpf/src)
|
||||||
|
BPFTOOL_SRC := $(abspath ../../third_party/bpftool/src)
|
||||||
|
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
|
||||||
|
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
|
||||||
|
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
|
||||||
|
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
|
||||||
|
| sed 's/arm.*/arm/' \
|
||||||
|
| sed 's/aarch64/arm64/' \
|
||||||
|
| sed 's/ppc64le/powerpc/' \
|
||||||
|
| sed 's/mips.*/mips/' \
|
||||||
|
| sed 's/riscv64/riscv/' \
|
||||||
|
| sed 's/loongarch64/loongarch/')
|
||||||
|
VMLINUX := ../../third_party/vmlinux/$(ARCH)/vmlinux.h
|
||||||
|
# Use our own libbpf API headers and Linux UAPI headers distributed with
|
||||||
|
# libbpf to avoid dependency on system-wide headers, which could be missing or
|
||||||
|
# outdated
|
||||||
|
INCLUDES := -I$(OUTPUT) -I../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX)) -I.
|
||||||
|
CFLAGS := -g -Wall
|
||||||
|
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
|
||||||
|
|
||||||
|
APPS = task_stack
|
||||||
|
|
||||||
|
# Get Clang's default includes on this system. We'll explicitly add these dirs
|
||||||
|
# to the includes list when compiling with `-target bpf` because otherwise some
|
||||||
|
# architecture-specific dirs will be "missing" on some architectures/distros -
|
||||||
|
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
|
||||||
|
# sys/cdefs.h etc. might be missing.
|
||||||
|
#
|
||||||
|
# Use '-idirafter': Don't interfere with include mechanics except where the
|
||||||
|
# build would have failed anyways.
|
||||||
|
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
|
||||||
|
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
|
||||||
|
|
||||||
|
ifeq ($(V),1)
|
||||||
|
Q =
|
||||||
|
msg =
|
||||||
|
else
|
||||||
|
Q = @
|
||||||
|
msg = @printf ' %-8s %s%s\n' \
|
||||||
|
"$(1)" \
|
||||||
|
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
|
||||||
|
"$(if $(3), $(3))";
|
||||||
|
MAKEFLAGS += --no-print-directory
|
||||||
|
endif
|
||||||
|
|
||||||
|
define allow-override
|
||||||
|
$(if $(or $(findstring environment,$(origin $(1))),\
|
||||||
|
$(findstring command line,$(origin $(1)))),,\
|
||||||
|
$(eval $(1) = $(2)))
|
||||||
|
endef
|
||||||
|
|
||||||
|
$(call allow-override,CC,$(CROSS_COMPILE)cc)
|
||||||
|
$(call allow-override,LD,$(CROSS_COMPILE)ld)
|
||||||
|
|
||||||
|
.PHONY: all
|
||||||
|
all: $(APPS)
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
$(call msg,CLEAN)
|
||||||
|
$(Q)rm -rf $(OUTPUT) $(APPS)
|
||||||
|
|
||||||
|
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
|
||||||
|
$(call msg,MKDIR,$@)
|
||||||
|
$(Q)mkdir -p $@
|
||||||
|
|
||||||
|
# Build libbpf
|
||||||
|
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
|
||||||
|
$(call msg,LIB,$@)
|
||||||
|
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
|
||||||
|
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
|
||||||
|
INCLUDEDIR= LIBDIR= UAPIDIR= \
|
||||||
|
install
|
||||||
|
|
||||||
|
# Build bpftool
|
||||||
|
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
|
||||||
|
$(call msg,BPFTOOL,$@)
|
||||||
|
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
|
||||||
|
|
||||||
|
# Build BPF code
|
||||||
|
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
|
||||||
|
$(call msg,BPF,$@)
|
||||||
|
$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
|
||||||
|
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
|
||||||
|
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
|
||||||
|
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
|
||||||
|
|
||||||
|
# Generate BPF skeletons
|
||||||
|
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
|
||||||
|
$(call msg,GEN-SKEL,$@)
|
||||||
|
$(Q)$(BPFTOOL) gen skeleton $< > $@
|
||||||
|
|
||||||
|
# Build user-space code
|
||||||
|
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
|
||||||
|
|
||||||
|
$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
|
||||||
|
$(call msg,CC,$@)
|
||||||
|
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
|
||||||
|
|
||||||
|
# Build application binary
|
||||||
|
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
|
||||||
|
$(call msg,BINARY,$@)
|
||||||
|
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
|
||||||
|
|
||||||
|
# delete failed targets
|
||||||
|
.DELETE_ON_ERROR:
|
||||||
|
|
||||||
|
# keep intermediate (.skel.h, .bpf.o, etc) targets
|
||||||
|
.SECONDARY:
|
||||||
205
src/features/bpf_iters/README.md
Normal file
205
src/features/bpf_iters/README.md
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
# BPF Iterators Tutorial
|
||||||
|
|
||||||
|
## What are BPF Iterators?
|
||||||
|
|
||||||
|
BPF iterators allow you to iterate over kernel data structures and export formatted data to userspace via `seq_file`. They're a modern replacement for traditional `/proc` files with **programmable, filterable, in-kernel data processing**.
|
||||||
|
|
||||||
|
## Real-World Example: Task Stack Iterator
|
||||||
|
|
||||||
|
### The Problem with Traditional Approach
|
||||||
|
|
||||||
|
**Traditional method** (using `/proc` or system tools):
|
||||||
|
```bash
|
||||||
|
# Show all process stack traces
|
||||||
|
cat /proc/*/stack
|
||||||
|
```
|
||||||
|
|
||||||
|
**Problems:**
|
||||||
|
1. ❌ **No filtering** - Must read ALL processes, parse in userspace
|
||||||
|
2. ❌ **Fixed format** - Cannot customize output
|
||||||
|
3. ❌ **High overhead** - Context switches, string formatting, massive output
|
||||||
|
4. ❌ **Post-processing** - All filtering/aggregation in userspace
|
||||||
|
5. ❌ **Inflexible** - Want different fields? Modify kernel!
|
||||||
|
|
||||||
|
### BPF Iterator Solution
|
||||||
|
|
||||||
|
**Our implementation** (`task_stack.bpf.c`):
|
||||||
|
```bash
|
||||||
|
# Show only systemd tasks with kernel stack traces
|
||||||
|
sudo ./task_stack systemd
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits:**
|
||||||
|
1. ✅ **In-kernel filtering** - Only selected processes sent to userspace
|
||||||
|
2. ✅ **Custom format** - Choose exactly what fields to show
|
||||||
|
3. ✅ **Low overhead** - Filter before copying to userspace
|
||||||
|
4. ✅ **Programmable** - Add statistics, calculations, aggregations
|
||||||
|
5. ✅ **Dynamic** - Load different filters without kernel changes
|
||||||
|
|
||||||
|
### Performance Comparison
|
||||||
|
|
||||||
|
| Operation | Traditional `/proc` | BPF Iterator |
|
||||||
|
|-----------|-------------------|--------------|
|
||||||
|
| Read all stacks | Parse 1000+ files | Single read() call |
|
||||||
|
| Filter by name | Userspace loop | In-kernel filter |
|
||||||
|
| Data transfer | MB of text | KB of relevant data |
|
||||||
|
| CPU usage | High (parsing) | Low (pre-filtered) |
|
||||||
|
| Customization | Recompile kernel | Load new BPF program |
|
||||||
|
|
||||||
|
## Example Output
|
||||||
|
|
||||||
|
```
|
||||||
|
$ sudo ./task_stack systemd
|
||||||
|
Filtering for tasks matching: systemd
|
||||||
|
|
||||||
|
=== BPF Task Stack Iterator ===
|
||||||
|
|
||||||
|
=== Task: systemd (pid=1, tgid=1) ===
|
||||||
|
Stack depth: 6 frames
|
||||||
|
[ 0] ep_poll+0x447/0x460
|
||||||
|
[ 1] do_epoll_wait+0xc3/0xe0
|
||||||
|
[ 2] __x64_sys_epoll_wait+0x6d/0x110
|
||||||
|
[ 3] x64_sys_call+0x19b1/0x2310
|
||||||
|
[ 4] do_syscall_64+0x7e/0x170
|
||||||
|
[ 5] entry_SYSCALL_64_after_hwframe+0x76/0x7e
|
||||||
|
|
||||||
|
=== Summary: 2 task stacks shown ===
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### 1. BPF Program (`task_stack.bpf.c`)
|
||||||
|
|
||||||
|
```c
|
||||||
|
SEC("iter/task")
|
||||||
|
int dump_task_stack(struct bpf_iter__task *ctx)
|
||||||
|
{
|
||||||
|
struct task_struct *task = ctx->task;
|
||||||
|
|
||||||
|
// In-kernel filtering by task name
|
||||||
|
if (target_comm[0] != '\0' && !match_name(task->comm))
|
||||||
|
return 0; // Skip this task
|
||||||
|
|
||||||
|
// Get kernel stack trace
|
||||||
|
bpf_get_task_stack(task, entries, MAX_DEPTH * SIZE_OF_ULONG, 0);
|
||||||
|
|
||||||
|
// Format and output to seq_file
|
||||||
|
BPF_SEQ_PRINTF(seq, "Task: %s (pid=%u)\n", task->comm, task->pid);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Userspace Program (`task_stack.c`)
|
||||||
|
|
||||||
|
```c
|
||||||
|
// Attach iterator
|
||||||
|
link = bpf_program__attach_iter(skel->progs.dump_task_stack, NULL);
|
||||||
|
|
||||||
|
// Create iterator instance
|
||||||
|
iter_fd = bpf_iter_create(bpf_link__fd(link));
|
||||||
|
|
||||||
|
// Read output
|
||||||
|
while ((len = read(iter_fd, buf, sizeof(buf))) > 0) {
|
||||||
|
printf("%s", buf);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Available Iterator Types
|
||||||
|
|
||||||
|
The kernel provides many iterator types:
|
||||||
|
|
||||||
|
### System Iterators
|
||||||
|
- `iter/task` - Iterate all tasks/processes
|
||||||
|
- `iter/ksym` - Kernel symbols (like `/proc/kallsyms`)
|
||||||
|
- `iter/bpf_map` - All BPF maps in system
|
||||||
|
- `iter/bpf_link` - All BPF links
|
||||||
|
|
||||||
|
### Network Iterators
|
||||||
|
- `iter/tcp` - TCP sockets (replaces `/proc/net/tcp`)
|
||||||
|
- `iter/udp` - UDP sockets
|
||||||
|
- `iter/unix` - Unix domain sockets
|
||||||
|
- `iter/netlink` - Netlink sockets
|
||||||
|
|
||||||
|
### Map Iterators
|
||||||
|
- `iter/bpf_map_elem` - Iterate map elements
|
||||||
|
- `iter/sockmap` - Socket map entries
|
||||||
|
|
||||||
|
### Task/Process Iterators
|
||||||
|
- `iter/task_file` - Task file descriptors (like `/proc/PID/fd`)
|
||||||
|
- `iter/task_vma` - Task memory mappings (like `/proc/PID/maps`)
|
||||||
|
|
||||||
|
## Use Cases
|
||||||
|
|
||||||
|
### 1. Performance Monitoring
|
||||||
|
- Track high-latency network connections
|
||||||
|
- Monitor stuck processes (long-running syscalls)
|
||||||
|
- Identify memory-hungry tasks
|
||||||
|
|
||||||
|
### 2. Debugging
|
||||||
|
- Capture stack traces of specific processes
|
||||||
|
- Dump kernel state for analysis
|
||||||
|
- Trace system calls in real-time
|
||||||
|
|
||||||
|
### 3. Security
|
||||||
|
- Monitor process creation patterns
|
||||||
|
- Track network connection attempts
|
||||||
|
- Audit file access patterns
|
||||||
|
|
||||||
|
### 4. Custom `/proc` Replacements
|
||||||
|
- Create application-specific views
|
||||||
|
- Filter and aggregate kernel data
|
||||||
|
- Reduce userspace processing overhead
|
||||||
|
|
||||||
|
## Building and Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build
|
||||||
|
cd /home/yunwei37/workspace/bpf-developer-tutorial/src/features/bpf_iters
|
||||||
|
make
|
||||||
|
|
||||||
|
# Run - show all tasks
|
||||||
|
sudo ./task_stack
|
||||||
|
|
||||||
|
# Run - filter by task name
|
||||||
|
sudo ./task_stack systemd
|
||||||
|
sudo ./task_stack bash
|
||||||
|
```
|
||||||
|
|
||||||
|
## Key Differences: Iterator Types
|
||||||
|
|
||||||
|
### Kernel Iterators (`SEC("iter/...")`)
|
||||||
|
- **Purpose**: Export kernel data to userspace
|
||||||
|
- **Output**: seq_file (readable via read())
|
||||||
|
- **Activation**: Attach, create instance, read FD
|
||||||
|
- **Example**: Task stacks, TCP sockets, kernel symbols
|
||||||
|
|
||||||
|
### Open-Coded Iterators (`bpf_for`, `bpf_iter_num`)
|
||||||
|
- **Purpose**: Loop constructs within BPF programs
|
||||||
|
- **Output**: Internal program variables
|
||||||
|
- **Activation**: Execute during program run
|
||||||
|
- **Example**: Sum numbers, count elements, iterate arrays
|
||||||
|
|
||||||
|
## Advantages Over Traditional Approaches
|
||||||
|
|
||||||
|
| Feature | Traditional `/proc` | BPF Iterators |
|
||||||
|
|---------|-------------------|---------------|
|
||||||
|
| **Filtering** | Userspace only | In-kernel |
|
||||||
|
| **Performance** | High overhead | Minimal overhead |
|
||||||
|
| **Customization** | Kernel rebuild | Load BPF program |
|
||||||
|
| **Format** | Fixed | Fully programmable |
|
||||||
|
| **Statistics** | Userspace calc | In-kernel aggregation |
|
||||||
|
| **Security** | No filtering | LSM hooks available |
|
||||||
|
| **Deployment** | Static | Dynamic (load anytime) |
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
BPF iterators are **game-changing** for system observability:
|
||||||
|
|
||||||
|
1. **Performance**: Filter in kernel, only send relevant data
|
||||||
|
2. **Flexibility**: Load different programs for different views
|
||||||
|
3. **Power**: Access raw kernel structures with type safety (BTF)
|
||||||
|
4. **Safety**: Verified by BPF verifier, can't crash kernel
|
||||||
|
5. **Portability**: CO-RE ensures binary works across kernel versions
|
||||||
|
|
||||||
|
They enable creating **custom, high-performance system monitoring tools** without modifying the kernel!
|
||||||
118
src/features/bpf_iters/task_stack.bpf.c
Normal file
118
src/features/bpf_iters/task_stack.bpf.c
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/* Kernel task stack and file descriptor iterator */
|
||||||
|
#include <vmlinux.h>
|
||||||
|
#include <bpf/bpf_helpers.h>
|
||||||
|
|
||||||
|
char _license[] SEC("license") = "GPL";
|
||||||
|
|
||||||
|
#define MAX_STACK_TRACE_DEPTH 64
|
||||||
|
unsigned long entries[MAX_STACK_TRACE_DEPTH] = {};
|
||||||
|
#define SIZE_OF_ULONG (sizeof(unsigned long))
|
||||||
|
|
||||||
|
/* Filter: only show stacks for tasks with this name (empty = show all) */
|
||||||
|
char target_comm[16] = "";
|
||||||
|
__u32 stacks_shown = 0;
|
||||||
|
__u32 files_shown = 0;
|
||||||
|
|
||||||
|
/* Task stack iterator */
|
||||||
|
SEC("iter/task")
|
||||||
|
int dump_task_stack(struct bpf_iter__task *ctx)
|
||||||
|
{
|
||||||
|
struct seq_file *seq = ctx->meta->seq;
|
||||||
|
struct task_struct *task = ctx->task;
|
||||||
|
long i, retlen;
|
||||||
|
int match = 1;
|
||||||
|
|
||||||
|
if (task == (void *)0) {
|
||||||
|
/* End of iteration - print summary */
|
||||||
|
if (stacks_shown > 0) {
|
||||||
|
BPF_SEQ_PRINTF(seq, "\n=== Summary: %u task stacks shown ===\n",
|
||||||
|
stacks_shown);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Filter by task name if specified */
|
||||||
|
if (target_comm[0] != '\0') {
|
||||||
|
match = 0;
|
||||||
|
for (i = 0; i < 16; i++) {
|
||||||
|
if (task->comm[i] != target_comm[i])
|
||||||
|
break;
|
||||||
|
if (task->comm[i] == '\0') {
|
||||||
|
match = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!match)
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get kernel stack trace for this task */
|
||||||
|
retlen = bpf_get_task_stack(task, entries,
|
||||||
|
MAX_STACK_TRACE_DEPTH * SIZE_OF_ULONG, 0);
|
||||||
|
if (retlen < 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
stacks_shown++;
|
||||||
|
|
||||||
|
/* Print task info and stack trace */
|
||||||
|
BPF_SEQ_PRINTF(seq, "=== Task: %s (pid=%u, tgid=%u) ===\n",
|
||||||
|
task->comm, task->pid, task->tgid);
|
||||||
|
BPF_SEQ_PRINTF(seq, "Stack depth: %u frames\n", retlen / SIZE_OF_ULONG);
|
||||||
|
|
||||||
|
for (i = 0; i < MAX_STACK_TRACE_DEPTH; i++) {
|
||||||
|
if (retlen > i * SIZE_OF_ULONG)
|
||||||
|
BPF_SEQ_PRINTF(seq, " [%2ld] %pB\n", i, (void *)entries[i]);
|
||||||
|
}
|
||||||
|
BPF_SEQ_PRINTF(seq, "\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Task file descriptor iterator */
|
||||||
|
SEC("iter/task_file")
|
||||||
|
int dump_task_file(struct bpf_iter__task_file *ctx)
|
||||||
|
{
|
||||||
|
struct seq_file *seq = ctx->meta->seq;
|
||||||
|
struct task_struct *task = ctx->task;
|
||||||
|
struct file *file = ctx->file;
|
||||||
|
__u32 fd = ctx->fd;
|
||||||
|
long i;
|
||||||
|
int match = 1;
|
||||||
|
|
||||||
|
if (task == (void *)0 || file == (void *)0) {
|
||||||
|
if (files_shown > 0 && ctx->meta->seq_num > 0) {
|
||||||
|
BPF_SEQ_PRINTF(seq, "\n=== Summary: %u file descriptors shown ===\n",
|
||||||
|
files_shown);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Filter by task name if specified */
|
||||||
|
if (target_comm[0] != '\0') {
|
||||||
|
match = 0;
|
||||||
|
for (i = 0; i < 16; i++) {
|
||||||
|
if (task->comm[i] != target_comm[i])
|
||||||
|
break;
|
||||||
|
if (task->comm[i] == '\0') {
|
||||||
|
match = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!match)
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctx->meta->seq_num == 0) {
|
||||||
|
BPF_SEQ_PRINTF(seq, "%-16s %8s %8s %6s %s\n",
|
||||||
|
"COMM", "TGID", "PID", "FD", "FILE_OPS");
|
||||||
|
}
|
||||||
|
|
||||||
|
files_shown++;
|
||||||
|
|
||||||
|
BPF_SEQ_PRINTF(seq, "%-16s %8d %8d %6d 0x%lx\n",
|
||||||
|
task->comm, task->tgid, task->pid, fd,
|
||||||
|
(long)file->f_op);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
93
src/features/bpf_iters/task_stack.c
Normal file
93
src/features/bpf_iters/task_stack.c
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/* Userspace program for task stack and file iterator */
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <bpf/libbpf.h>
|
||||||
|
#include <bpf/bpf.h>
|
||||||
|
#include "task_stack.skel.h"
|
||||||
|
|
||||||
|
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
|
||||||
|
{
|
||||||
|
return vfprintf(stderr, format, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void run_iterator(const char *name, struct bpf_program *prog)
|
||||||
|
{
|
||||||
|
struct bpf_link *link;
|
||||||
|
int iter_fd, len;
|
||||||
|
char buf[8192];
|
||||||
|
|
||||||
|
link = bpf_program__attach_iter(prog, NULL);
|
||||||
|
if (!link) {
|
||||||
|
fprintf(stderr, "Failed to attach %s iterator\n", name);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
iter_fd = bpf_iter_create(bpf_link__fd(link));
|
||||||
|
if (iter_fd < 0) {
|
||||||
|
fprintf(stderr, "Failed to create %s iterator: %d\n", name, iter_fd);
|
||||||
|
bpf_link__destroy(link);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
while ((len = read(iter_fd, buf, sizeof(buf) - 1)) > 0) {
|
||||||
|
buf[len] = '\0';
|
||||||
|
printf("%s", buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
close(iter_fd);
|
||||||
|
bpf_link__destroy(link);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
struct task_stack_bpf *skel;
|
||||||
|
int err;
|
||||||
|
int show_files = 0;
|
||||||
|
|
||||||
|
libbpf_set_print(libbpf_print_fn);
|
||||||
|
|
||||||
|
/* Parse arguments */
|
||||||
|
if (argc > 1 && strcmp(argv[1], "--files") == 0) {
|
||||||
|
show_files = 1;
|
||||||
|
argc--;
|
||||||
|
argv++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Open BPF application */
|
||||||
|
skel = task_stack_bpf__open();
|
||||||
|
if (!skel) {
|
||||||
|
fprintf(stderr, "Failed to open BPF skeleton\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Configure filter before loading */
|
||||||
|
if (argc > 1) {
|
||||||
|
strncpy(skel->bss->target_comm, argv[1], sizeof(skel->bss->target_comm) - 1);
|
||||||
|
printf("Filtering for tasks matching: %s\n\n", argv[1]);
|
||||||
|
} else {
|
||||||
|
printf("Usage: %s [--files] [comm]\n", argv[0]);
|
||||||
|
printf(" --files Show open file descriptors instead of stacks\n");
|
||||||
|
printf(" comm Filter by process name\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Load BPF program */
|
||||||
|
err = task_stack_bpf__load(skel);
|
||||||
|
if (err) {
|
||||||
|
fprintf(stderr, "Failed to load BPF skeleton\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (show_files) {
|
||||||
|
printf("=== BPF Task File Descriptor Iterator ===\n\n");
|
||||||
|
run_iterator("task_file", skel->progs.dump_task_file);
|
||||||
|
} else {
|
||||||
|
printf("=== BPF Task Stack Iterator ===\n\n");
|
||||||
|
run_iterator("task", skel->progs.dump_task_stack);
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
task_stack_bpf__destroy(skel);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
12
src/features/bpf_wq/.gitignore
vendored
Normal file
12
src/features/bpf_wq/.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# Build artifacts
|
||||||
|
.output/
|
||||||
|
*.o
|
||||||
|
*.skel.h
|
||||||
|
|
||||||
|
# Generated binaries
|
||||||
|
wq_simple
|
||||||
|
|
||||||
|
# Editor files
|
||||||
|
*.swp
|
||||||
|
*~
|
||||||
|
.vscode/
|
||||||
112
src/features/bpf_wq/Makefile
Normal file
112
src/features/bpf_wq/Makefile
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
|
||||||
|
OUTPUT := .output
|
||||||
|
CLANG ?= clang
|
||||||
|
LIBBPF_SRC := $(abspath ../../third_party/libbpf/src)
|
||||||
|
BPFTOOL_SRC := $(abspath ../../third_party/bpftool/src)
|
||||||
|
LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
|
||||||
|
BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
|
||||||
|
BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
|
||||||
|
ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
|
||||||
|
| sed 's/arm.*/arm/' \
|
||||||
|
| sed 's/aarch64/arm64/' \
|
||||||
|
| sed 's/ppc64le/powerpc/' \
|
||||||
|
| sed 's/mips.*/mips/' \
|
||||||
|
| sed 's/riscv64/riscv/' \
|
||||||
|
| sed 's/loongarch64/loongarch/')
|
||||||
|
VMLINUX := ../../third_party/vmlinux/$(ARCH)/vmlinux.h
|
||||||
|
# Use our own libbpf API headers and Linux UAPI headers distributed with
|
||||||
|
# libbpf to avoid dependency on system-wide headers, which could be missing or
|
||||||
|
# outdated
|
||||||
|
INCLUDES := -I$(OUTPUT) -I../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX)) -I.
|
||||||
|
CFLAGS := -g -Wall
|
||||||
|
ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
|
||||||
|
|
||||||
|
APPS = wq_simple
|
||||||
|
|
||||||
|
# Get Clang's default includes on this system. We'll explicitly add these dirs
|
||||||
|
# to the includes list when compiling with `-target bpf` because otherwise some
|
||||||
|
# architecture-specific dirs will be "missing" on some architectures/distros -
|
||||||
|
# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
|
||||||
|
# sys/cdefs.h etc. might be missing.
|
||||||
|
#
|
||||||
|
# Use '-idirafter': Don't interfere with include mechanics except where the
|
||||||
|
# build would have failed anyways.
|
||||||
|
CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
|
||||||
|
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
|
||||||
|
|
||||||
|
ifeq ($(V),1)
|
||||||
|
Q =
|
||||||
|
msg =
|
||||||
|
else
|
||||||
|
Q = @
|
||||||
|
msg = @printf ' %-8s %s%s\n' \
|
||||||
|
"$(1)" \
|
||||||
|
"$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \
|
||||||
|
"$(if $(3), $(3))";
|
||||||
|
MAKEFLAGS += --no-print-directory
|
||||||
|
endif
|
||||||
|
|
||||||
|
define allow-override
|
||||||
|
$(if $(or $(findstring environment,$(origin $(1))),\
|
||||||
|
$(findstring command line,$(origin $(1)))),,\
|
||||||
|
$(eval $(1) = $(2)))
|
||||||
|
endef
|
||||||
|
|
||||||
|
$(call allow-override,CC,$(CROSS_COMPILE)cc)
|
||||||
|
$(call allow-override,LD,$(CROSS_COMPILE)ld)
|
||||||
|
|
||||||
|
.PHONY: all
|
||||||
|
all: $(APPS)
|
||||||
|
|
||||||
|
.PHONY: clean
|
||||||
|
clean:
|
||||||
|
$(call msg,CLEAN)
|
||||||
|
$(Q)rm -rf $(OUTPUT) $(APPS)
|
||||||
|
|
||||||
|
$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
|
||||||
|
$(call msg,MKDIR,$@)
|
||||||
|
$(Q)mkdir -p $@
|
||||||
|
|
||||||
|
# Build libbpf
|
||||||
|
$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
|
||||||
|
$(call msg,LIB,$@)
|
||||||
|
$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \
|
||||||
|
OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \
|
||||||
|
INCLUDEDIR= LIBDIR= UAPIDIR= \
|
||||||
|
install
|
||||||
|
|
||||||
|
# Build bpftool
|
||||||
|
$(BPFTOOL): | $(BPFTOOL_OUTPUT)
|
||||||
|
$(call msg,BPFTOOL,$@)
|
||||||
|
$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
|
||||||
|
|
||||||
|
# Build BPF code
|
||||||
|
$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
|
||||||
|
$(call msg,BPF,$@)
|
||||||
|
$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \
|
||||||
|
$(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \
|
||||||
|
-c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
|
||||||
|
$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
|
||||||
|
|
||||||
|
# Generate BPF skeletons
|
||||||
|
$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
|
||||||
|
$(call msg,GEN-SKEL,$@)
|
||||||
|
$(Q)$(BPFTOOL) gen skeleton $< > $@
|
||||||
|
|
||||||
|
# Build user-space code
|
||||||
|
$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
|
||||||
|
|
||||||
|
$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
|
||||||
|
$(call msg,CC,$@)
|
||||||
|
$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
|
||||||
|
|
||||||
|
# Build application binary
|
||||||
|
$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
|
||||||
|
$(call msg,BINARY,$@)
|
||||||
|
$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
|
||||||
|
|
||||||
|
# delete failed targets
|
||||||
|
.DELETE_ON_ERROR:
|
||||||
|
|
||||||
|
# keep intermediate (.skel.h, .bpf.o, etc) targets
|
||||||
|
.SECONDARY:
|
||||||
368
src/features/bpf_wq/README.md
Normal file
368
src/features/bpf_wq/README.md
Normal file
@@ -0,0 +1,368 @@
|
|||||||
|
# BPF Workqueues Tutorial
|
||||||
|
|
||||||
|
## What are BPF Workqueues?
|
||||||
|
|
||||||
|
BPF workqueues allow you to schedule **asynchronous work** from BPF programs. This enables:
|
||||||
|
- Deferred processing
|
||||||
|
- Non-blocking operations
|
||||||
|
- Background task execution
|
||||||
|
- Sleepable context for long-running operations
|
||||||
|
|
||||||
|
## The Problem
|
||||||
|
|
||||||
|
### Before bpf_wq: Limitations of bpf_timer
|
||||||
|
|
||||||
|
**bpf_timer** runs in **softirq context**, which has severe limitations:
|
||||||
|
- ❌ Cannot sleep
|
||||||
|
- ❌ Cannot use `kzalloc()` (memory allocation)
|
||||||
|
- ❌ Cannot wait for device I/O
|
||||||
|
- ❌ Cannot perform any blocking operations
|
||||||
|
|
||||||
|
### Real-World Use Case: HID Device Handling
|
||||||
|
|
||||||
|
**Problem**: HID (Human Interface Devices - keyboards, mice, tablets) devices need to:
|
||||||
|
1. **React to events asynchronously** - Transform input, inject new events
|
||||||
|
2. **Communicate with hardware** - Re-initialize devices after sleep/wake
|
||||||
|
3. **Perform device I/O** - Send commands, wait for responses
|
||||||
|
|
||||||
|
**These operations require sleepable context!**
|
||||||
|
|
||||||
|
## The Solution: bpf_wq
|
||||||
|
|
||||||
|
Developed by **Benjamin Tissoires** (Red Hat) in 2024 as part of HID-BPF work.
|
||||||
|
|
||||||
|
### Key Quote from Kernel Patches:
|
||||||
|
> "I need something similar to bpf_timers, but not in soft IRQ context...
|
||||||
|
> the bpf_timer functionality would prevent me to kzalloc and wait for the device"
|
||||||
|
|
||||||
|
### What bpf_wq Provides:
|
||||||
|
- ✅ **Sleepable context** - Can perform blocking operations
|
||||||
|
- ✅ **Memory allocation** - Can use `kzalloc()` safely
|
||||||
|
- ✅ **Device I/O** - Can wait for hardware responses
|
||||||
|
- ✅ **Asynchronous execution** - Deferred work without blocking main path
|
||||||
|
|
||||||
|
## Real-World Applications
|
||||||
|
|
||||||
|
### 1. HID Device Quirks and Fixes
|
||||||
|
|
||||||
|
**Problem**: Many HID devices have firmware bugs or quirks requiring workarounds.
|
||||||
|
|
||||||
|
**Before bpf_wq**: Write kernel drivers, recompile kernel
|
||||||
|
**With bpf_wq**: Load BPF program to fix device behavior dynamically
|
||||||
|
|
||||||
|
**Example Use Cases**:
|
||||||
|
- Transform single key press into macro sequence
|
||||||
|
- Fix devices that forget to send button release events
|
||||||
|
- Invert mouse coordinates for broken hardware
|
||||||
|
- Re-initialize device after wake from sleep
|
||||||
|
|
||||||
|
### 2. Network Packet Processing
|
||||||
|
|
||||||
|
**Problem**: Rate limiting requires tracking state and cleaning up old entries.
|
||||||
|
|
||||||
|
**Before**: Either block packet processing OR leak memory
|
||||||
|
**With bpf_wq**:
|
||||||
|
- Fast path: Check limits, drop packets (non-blocking)
|
||||||
|
- Slow path: Workqueue cleans up stale entries (async)
|
||||||
|
|
||||||
|
### 3. Security and Monitoring
|
||||||
|
|
||||||
|
**Problem**: Security decisions need to consult external services or databases.
|
||||||
|
|
||||||
|
**Before**: All decisions must be instant (no waiting)
|
||||||
|
**With bpf_wq**:
|
||||||
|
- Fast path: Apply known rules immediately
|
||||||
|
- Slow path: Query reputation databases, update policy
|
||||||
|
|
||||||
|
### 4. Resource Cleanup
|
||||||
|
|
||||||
|
**Problem**: Freeing resources (memory, connections) can be expensive.
|
||||||
|
|
||||||
|
**Before**: Block main path during cleanup
|
||||||
|
**With bpf_wq**: Defer cleanup to background workqueue
|
||||||
|
|
||||||
|
## Technical Architecture
|
||||||
|
|
||||||
|
### Comparison: bpf_timer vs bpf_wq
|
||||||
|
|
||||||
|
| Feature | bpf_timer | bpf_wq |
|
||||||
|
|---------|-----------|--------|
|
||||||
|
| **Context** | Softirq (interrupt) | Process (workqueue) |
|
||||||
|
| **Can sleep?** | ❌ No | ✅ Yes |
|
||||||
|
| **Memory allocation** | ❌ No | ✅ Yes |
|
||||||
|
| **Device I/O** | ❌ No | ✅ Yes |
|
||||||
|
| **Latency** | Very low (μs) | Higher (ms) |
|
||||||
|
| **Use case** | Time-critical | Sleepable operations |
|
||||||
|
|
||||||
|
### When to Use Each
|
||||||
|
|
||||||
|
**Use bpf_timer when:**
|
||||||
|
- You need microsecond-level precision
|
||||||
|
- Operations are fast and non-blocking
|
||||||
|
- You're just updating counters or state
|
||||||
|
|
||||||
|
**Use bpf_wq when:**
|
||||||
|
- You need to sleep or wait
|
||||||
|
- You need memory allocation
|
||||||
|
- You need device/network I/O
|
||||||
|
- Cleanup can happen later
|
||||||
|
|
||||||
|
## Code Example: Why Workqueue Matters
|
||||||
|
|
||||||
|
### ❌ Cannot Do with bpf_timer (softirq):
|
||||||
|
```c
|
||||||
|
// This FAILS in bpf_timer callback (softirq context)
|
||||||
|
static int timer_callback(void *map, int *key, void *value)
|
||||||
|
{
|
||||||
|
// ERROR: Cannot allocate in softirq!
|
||||||
|
struct data *d = kmalloc(sizeof(*d), GFP_KERNEL);
|
||||||
|
|
||||||
|
// ERROR: Cannot sleep in softirq!
|
||||||
|
send_device_command_and_wait(device);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### ✅ Works with bpf_wq (workqueue):
|
||||||
|
```c
|
||||||
|
// This WORKS in bpf_wq callback (process context).
// NOTE: kmalloc/kfree below are illustrative pseudocode only — real BPF
// programs allocate with bpf_obj_new()/bpf_obj_drop(), not kmalloc/kfree.
|
||||||
|
static int wq_callback(void *map, int *key, void *value)
|
||||||
|
{
|
||||||
|
// OK: Can allocate in process context
|
||||||
|
struct data *d = kmalloc(sizeof(*d), GFP_KERNEL);
|
||||||
|
|
||||||
|
// OK: Can sleep/wait in process context
|
||||||
|
send_device_command_and_wait(device);
|
||||||
|
|
||||||
|
// OK: Can do blocking I/O
|
||||||
|
write_to_file(log_file, data);
|
||||||
|
|
||||||
|
kfree(d);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Historical Timeline
|
||||||
|
|
||||||
|
1. **2022**: Benjamin Tissoires starts HID-BPF work
|
||||||
|
2. **2023**: Realizes bpf_timer limitations for HID device I/O
|
||||||
|
3. **Early 2024**: Proposes bpf_wq as "bpf_timer in process context"
|
||||||
|
4. **April 2024**: bpf_wq merged into kernel (v6.10+)
|
||||||
|
5. **2024-Present**: Used for HID quirks, rate limiting, async cleanup
|
||||||
|
|
||||||
|
## Key Takeaway
|
||||||
|
|
||||||
|
**bpf_wq exists because real-world device handling and resource management need sleepable, blocking operations that bpf_timer cannot provide.**
|
||||||
|
|
||||||
|
It enables BPF programs to:
|
||||||
|
- Fix hardware quirks without kernel drivers
|
||||||
|
- Perform async cleanup without blocking
|
||||||
|
- Wait for I/O without hanging the system
|
||||||
|
- Do "slow work" without impacting "fast path"
|
||||||
|
|
||||||
|
**Bottom line**: bpf_wq brings true asynchronous, sleepable programming to BPF!
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### 1. Workqueue Structure
|
||||||
|
|
||||||
|
Embed a `struct bpf_wq` in your map value:
|
||||||
|
|
||||||
|
```c
|
||||||
|
struct elem {
|
||||||
|
int value;
|
||||||
|
struct bpf_wq work; // Embedded workqueue
|
||||||
|
};
|
||||||
|
|
||||||
|
struct {
|
||||||
|
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||||
|
__type(value, struct elem);
|
||||||
|
} array SEC(".maps");
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Initialize and Schedule
|
||||||
|
|
||||||
|
```c
|
||||||
|
SEC("fentry/do_unlinkat")
|
||||||
|
int test_workqueue(void *ctx)
|
||||||
|
{
|
||||||
|
struct elem *val = bpf_map_lookup_elem(&array, &key);
|
||||||
|
struct bpf_wq *wq = &val->work;
|
||||||
|
|
||||||
|
// Initialize workqueue
|
||||||
|
bpf_wq_init(wq, &array, 0);
|
||||||
|
|
||||||
|
// Set callback function
|
||||||
|
bpf_wq_set_callback(wq, callback_fn, 0);
|
||||||
|
|
||||||
|
// Schedule async execution
|
||||||
|
bpf_wq_start(wq, 0);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Callback Execution
|
||||||
|
|
||||||
|
```c
|
||||||
|
static int callback_fn(void *map, int *key, void *value)
|
||||||
|
{
|
||||||
|
struct elem *val = value;
|
||||||
|
|
||||||
|
// This runs asynchronously in workqueue context
|
||||||
|
val->value = 42;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Examples
|
||||||
|
|
||||||
|
### 1. Simple Workqueue Test (`wq_simple`)
|
||||||
|
|
||||||
|
Basic demonstration:
|
||||||
|
- Workqueue initialization on syscall entry
|
||||||
|
- Async callback execution
|
||||||
|
- Verification of both sync and async paths
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ sudo ./wq_simple
|
||||||
|
BPF workqueue program attached. Triggering unlink syscall...
|
||||||
|
|
||||||
|
Results:
|
||||||
|
main_executed = 1 (expected: 1)
|
||||||
|
wq_executed = 1 (expected: 1)
|
||||||
|
|
||||||
|
✓ Test PASSED!
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Real-World: Rate Limiter with Async Cleanup (`rate_limiter`)
|
||||||
|
|
||||||
|
**Production-ready example** showing practical workqueue usage:
|
||||||
|
|
||||||
|
**Problem**:
|
||||||
|
- Track packet rates per source IP
|
||||||
|
- Drop packets exceeding 100 pps
|
||||||
|
- Clean up stale entries without blocking packet processing
|
||||||
|
|
||||||
|
**Solution with Workqueues**:
|
||||||
|
- **Fast path**: Check/update rate limits, drop if needed
|
||||||
|
- **Slow path (async)**: Workqueue removes entries older than 10 seconds
|
||||||
|
- **Zero blocking**: Cleanup runs in background
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ sudo ./rate_limiter eth0
|
||||||
|
=== BPF Rate Limiter with Workqueue Cleanup ===
|
||||||
|
Interface: eth0 (ifindex=2)
|
||||||
|
Rate limit: 100 packets/sec per IP
|
||||||
|
Cleanup: Async workqueue removes stale entries (>10s old)
|
||||||
|
|
||||||
|
Press Ctrl+C to stop...
|
||||||
|
|
||||||
|
Time Total Pkts Dropped Active IPs Cleanups
|
||||||
|
-----------------------------------------------------------------------
|
||||||
|
1234 45123 1234 150 12
|
||||||
|
1235 46789 1456 152 15
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Features**:
|
||||||
|
1. **In-kernel rate limiting** - No userspace involvement for packet decisions
|
||||||
|
2. **Per-IP tracking** - Hash map stores state for each source IP
|
||||||
|
3. **Async cleanup** - Workqueue prevents memory leaks without blocking packets
|
||||||
|
4. **Real-time stats** - Monitor performance and efficiency
|
||||||
|
|
||||||
|
## Use Cases
|
||||||
|
|
||||||
|
### 1. Rate Limiting
|
||||||
|
Schedule delayed actions to enforce rate limits:
|
||||||
|
```c
|
||||||
|
// Defer stale-entry cleanup; the drop decision itself stays synchronous in the fast path
|
||||||
|
bpf_wq_start(wq, 0); // Execute in background
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Batch Processing
|
||||||
|
Accumulate events and process in batches:
|
||||||
|
```c
|
||||||
|
// Collect events in map
|
||||||
|
// Workqueue processes batch periodically
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Heavy Computations
|
||||||
|
Offload expensive operations:
|
||||||
|
```c
|
||||||
|
// Main path: fast, non-blocking
|
||||||
|
// Workqueue: slow processing (parsing, crypto)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Cleanup Tasks
|
||||||
|
Defer resource cleanup:
|
||||||
|
```c
|
||||||
|
// Free memory, close connections in background
|
||||||
|
```
|
||||||
|
|
||||||
|
## Building and Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Build
|
||||||
|
cd src/features/bpf_wq   # from the repository root
|
||||||
|
make
|
||||||
|
|
||||||
|
# Run simple test
|
||||||
|
sudo ./wq_simple
|
||||||
|
|
||||||
|
# Run rate limiter (requires network interface)
|
||||||
|
sudo ./rate_limiter lo # Use loopback for testing
|
||||||
|
sudo ./rate_limiter eth0 # Use real interface
|
||||||
|
|
||||||
|
# Generate test traffic
|
||||||
|
ping -f localhost # Flood ping to trigger rate limiting
|
||||||
|
```
|
||||||
|
|
||||||
|
## Key APIs
|
||||||
|
|
||||||
|
| Function | Purpose |
|
||||||
|
|----------|---------|
|
||||||
|
| `bpf_wq_init(wq, map, flags)` | Initialize workqueue |
|
||||||
|
| `bpf_wq_set_callback(wq, fn, flags)` | Set callback function |
|
||||||
|
| `bpf_wq_start(wq, flags)` | Schedule async execution |
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Linux kernel 6.10+ (bpf_wq support, merged April 2024)
|
||||||
|
- Root/sudo access
|
||||||
|
- libbpf, clang, bpftool
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
```
|
||||||
|
bpf_wq/
|
||||||
|
├── wq_simple.bpf.c # BPF workqueue program
|
||||||
|
├── wq_simple.c # Userspace loader
|
||||||
|
├── bpf_experimental.h # Workqueue helper definitions
|
||||||
|
├── Makefile # Build system
|
||||||
|
├── README.md # This file
|
||||||
|
└── .gitignore # Ignore build artifacts
|
||||||
|
```
|
||||||
|
|
||||||
|
## Advantages Over Alternatives
|
||||||
|
|
||||||
|
| Approach | Blocking | Context Switches | Complexity |
|
||||||
|
|----------|----------|-----------------|------------|
|
||||||
|
| **Synchronous** | Yes | No | Low |
|
||||||
|
| **Userspace notification** | No | Yes (many) | High |
|
||||||
|
| **BPF workqueue** | No | Minimal | Medium |
|
||||||
|
|
||||||
|
BPF workqueues provide the best balance of performance and flexibility for async operations!
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
BPF workqueues enable **true asynchronous programming** in BPF:
|
||||||
|
- ✅ Non-blocking main path
|
||||||
|
- ✅ Deferred execution
|
||||||
|
- ✅ Sleepable context support
|
||||||
|
- ✅ Minimal overhead
|
||||||
|
- ✅ Type-safe callbacks
|
||||||
|
|
||||||
|
Perfect for scenarios where you need to do work later without blocking the fast path!
|
||||||
591
src/features/bpf_wq/bpf_experimental.h
Normal file
591
src/features/bpf_wq/bpf_experimental.h
Normal file
@@ -0,0 +1,591 @@
|
|||||||
|
#ifndef __BPF_EXPERIMENTAL__
|
||||||
|
#define __BPF_EXPERIMENTAL__
|
||||||
|
|
||||||
|
#include <vmlinux.h>
|
||||||
|
#include <bpf/bpf_tracing.h>
|
||||||
|
#include <bpf/bpf_helpers.h>
|
||||||
|
#include <bpf/bpf_core_read.h>
|
||||||
|
|
||||||
|
#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Allocates an object of the type represented by 'local_type_id' in
|
||||||
|
* program BTF. User may use the bpf_core_type_id_local macro to pass the
|
||||||
|
* type ID of a struct in program BTF.
|
||||||
|
*
|
||||||
|
* The 'local_type_id' parameter must be a known constant.
|
||||||
|
* The 'meta' parameter is rewritten by the verifier, no need for BPF
|
||||||
|
* program to set it.
|
||||||
|
* Returns
|
||||||
|
* A pointer to an object of the type corresponding to the passed in
|
||||||
|
* 'local_type_id', or NULL on failure.
|
||||||
|
*/
|
||||||
|
extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
|
||||||
|
|
||||||
|
/* Convenience macro to wrap over bpf_obj_new_impl */
|
||||||
|
#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Free an allocated object. All fields of the object that require
|
||||||
|
* destruction will be destructed before the storage is freed.
|
||||||
|
*
|
||||||
|
* The 'meta' parameter is rewritten by the verifier, no need for BPF
|
||||||
|
* program to set it.
|
||||||
|
* Returns
|
||||||
|
* Void.
|
||||||
|
*/
|
||||||
|
extern void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
|
||||||
|
|
||||||
|
/* Convenience macro to wrap over bpf_obj_drop_impl */
|
||||||
|
#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Increment the refcount on a refcounted local kptr, turning the
|
||||||
|
* non-owning reference input into an owning reference in the process.
|
||||||
|
*
|
||||||
|
* The 'meta' parameter is rewritten by the verifier, no need for BPF
|
||||||
|
* program to set it.
|
||||||
|
* Returns
|
||||||
|
* An owning reference to the object pointed to by 'kptr'
|
||||||
|
*/
|
||||||
|
extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
|
||||||
|
|
||||||
|
/* Convenience macro to wrap over bpf_refcount_acquire_impl */
|
||||||
|
#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Add a new entry to the beginning of the BPF linked list.
|
||||||
|
*
|
||||||
|
* The 'meta' and 'off' parameters are rewritten by the verifier, no need
|
||||||
|
* for BPF programs to set them
|
||||||
|
* Returns
|
||||||
|
* 0 if the node was successfully added
|
||||||
|
* -EINVAL if the node wasn't added because it's already in a list
|
||||||
|
*/
|
||||||
|
extern int bpf_list_push_front_impl(struct bpf_list_head *head,
|
||||||
|
struct bpf_list_node *node,
|
||||||
|
void *meta, __u64 off) __ksym;
|
||||||
|
|
||||||
|
/* Convenience macro to wrap over bpf_list_push_front_impl */
|
||||||
|
#define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0)
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Add a new entry to the end of the BPF linked list.
|
||||||
|
*
|
||||||
|
* The 'meta' and 'off' parameters are rewritten by the verifier, no need
|
||||||
|
* for BPF programs to set them
|
||||||
|
* Returns
|
||||||
|
* 0 if the node was successfully added
|
||||||
|
* -EINVAL if the node wasn't added because it's already in a list
|
||||||
|
*/
|
||||||
|
extern int bpf_list_push_back_impl(struct bpf_list_head *head,
|
||||||
|
struct bpf_list_node *node,
|
||||||
|
void *meta, __u64 off) __ksym;
|
||||||
|
|
||||||
|
/* Convenience macro to wrap over bpf_list_push_back_impl */
|
||||||
|
#define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0)
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Remove the entry at the beginning of the BPF linked list.
|
||||||
|
* Returns
|
||||||
|
* Pointer to bpf_list_node of deleted entry, or NULL if list is empty.
|
||||||
|
*/
|
||||||
|
extern struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Remove the entry at the end of the BPF linked list.
|
||||||
|
* Returns
|
||||||
|
* Pointer to bpf_list_node of deleted entry, or NULL if list is empty.
|
||||||
|
*/
|
||||||
|
extern struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Remove 'node' from rbtree with root 'root'
|
||||||
|
* Returns
|
||||||
|
* Pointer to the removed node, or NULL if 'root' didn't contain 'node'
|
||||||
|
*/
|
||||||
|
extern struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
|
||||||
|
struct bpf_rb_node *node) __ksym;
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Add 'node' to rbtree with root 'root' using comparator 'less'
|
||||||
|
*
|
||||||
|
* The 'meta' and 'off' parameters are rewritten by the verifier, no need
|
||||||
|
* for BPF programs to set them
|
||||||
|
* Returns
|
||||||
|
* 0 if the node was successfully added
|
||||||
|
* -EINVAL if the node wasn't added because it's already in a tree
|
||||||
|
*/
|
||||||
|
extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
|
||||||
|
bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
|
||||||
|
void *meta, __u64 off) __ksym;
|
||||||
|
|
||||||
|
/* Convenience macro to wrap over bpf_rbtree_add_impl */
|
||||||
|
#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Return the first (leftmost) node in input tree
|
||||||
|
* Returns
|
||||||
|
* Pointer to the node, which is _not_ removed from the tree. If the tree
|
||||||
|
* contains no nodes, returns NULL.
|
||||||
|
*/
|
||||||
|
extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Allocates a percpu object of the type represented by 'local_type_id' in
|
||||||
|
* program BTF. User may use the bpf_core_type_id_local macro to pass the
|
||||||
|
* type ID of a struct in program BTF.
|
||||||
|
*
|
||||||
|
* The 'local_type_id' parameter must be a known constant.
|
||||||
|
* The 'meta' parameter is rewritten by the verifier, no need for BPF
|
||||||
|
* program to set it.
|
||||||
|
* Returns
|
||||||
|
* A pointer to a percpu object of the type corresponding to the passed in
|
||||||
|
* 'local_type_id', or NULL on failure.
|
||||||
|
*/
|
||||||
|
extern void *bpf_percpu_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
|
||||||
|
|
||||||
|
/* Convenience macro to wrap over bpf_percpu_obj_new_impl */
|
||||||
|
#define bpf_percpu_obj_new(type) ((type __percpu_kptr *)bpf_percpu_obj_new_impl(bpf_core_type_id_local(type), NULL))
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Free an allocated percpu object. All fields of the object that require
|
||||||
|
* destruction will be destructed before the storage is freed.
|
||||||
|
*
|
||||||
|
* The 'meta' parameter is rewritten by the verifier, no need for BPF
|
||||||
|
* program to set it.
|
||||||
|
* Returns
|
||||||
|
* Void.
|
||||||
|
*/
|
||||||
|
extern void bpf_percpu_obj_drop_impl(void *kptr, void *meta) __ksym;
|
||||||
|
|
||||||
|
struct bpf_iter_task_vma;
|
||||||
|
|
||||||
|
extern int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it,
|
||||||
|
struct task_struct *task,
|
||||||
|
__u64 addr) __ksym;
|
||||||
|
extern struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) __ksym;
|
||||||
|
extern void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) __ksym;
|
||||||
|
|
||||||
|
/* Convenience macro to wrap over bpf_obj_drop_impl */
|
||||||
|
#define bpf_percpu_obj_drop(kptr) bpf_percpu_obj_drop_impl(kptr, NULL)
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Throw a BPF exception from the program, immediately terminating its
|
||||||
|
* execution and unwinding the stack. The supplied 'cookie' parameter
|
||||||
|
* will be the return value of the program when an exception is thrown,
|
||||||
|
* and the default exception callback is used. Otherwise, if an exception
|
||||||
|
* callback is set using the '__exception_cb(callback)' declaration tag
|
||||||
|
* on the main program, the 'cookie' parameter will be the callback's only
|
||||||
|
* input argument.
|
||||||
|
*
|
||||||
|
* Thus, in case of default exception callback, 'cookie' is subjected to
|
||||||
|
* constraints on the program's return value (as with R0 on exit).
|
||||||
|
* Otherwise, the return value of the marked exception callback will be
|
||||||
|
* subjected to the same checks.
|
||||||
|
*
|
||||||
|
* Note that throwing an exception with lingering resources (locks,
|
||||||
|
* references, etc.) will lead to a verification error.
|
||||||
|
*
|
||||||
|
* Note that callbacks *cannot* call this helper.
|
||||||
|
* Returns
|
||||||
|
* Never.
|
||||||
|
* Throws
|
||||||
|
* An exception with the specified 'cookie' value.
|
||||||
|
*/
|
||||||
|
extern void bpf_throw(u64 cookie) __ksym;
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Acquire a reference on the exe_file member field belonging to the
|
||||||
|
* mm_struct that is nested within the supplied task_struct. The supplied
|
||||||
|
* task_struct must be trusted/referenced.
|
||||||
|
* Returns
|
||||||
|
* A referenced file pointer pointing to the exe_file member field of the
|
||||||
|
* mm_struct nested in the supplied task_struct, or NULL.
|
||||||
|
*/
|
||||||
|
extern struct file *bpf_get_task_exe_file(struct task_struct *task) __ksym;
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Release a reference on the supplied file. The supplied file must be
|
||||||
|
* acquired.
|
||||||
|
*/
|
||||||
|
extern void bpf_put_file(struct file *file) __ksym;
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Resolve a pathname for the supplied path and store it in the supplied
|
||||||
|
* buffer. The supplied path must be trusted/referenced.
|
||||||
|
* Returns
|
||||||
|
* A positive integer corresponding to the length of the resolved pathname,
|
||||||
|
* including the NULL termination character, stored in the supplied
|
||||||
|
* buffer. On error, a negative integer is returned.
|
||||||
|
*/
|
||||||
|
extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __ksym;
|
||||||
|
|
||||||
|
/* This macro must be used to mark the exception callback corresponding to the
|
||||||
|
* main program. For example:
|
||||||
|
*
|
||||||
|
* int exception_cb(u64 cookie) {
|
||||||
|
* return cookie;
|
||||||
|
* }
|
||||||
|
*
|
||||||
|
* SEC("tc")
|
||||||
|
* __exception_cb(exception_cb)
|
||||||
|
* int main_prog(struct __sk_buff *ctx) {
|
||||||
|
* ...
|
||||||
|
* return TC_ACT_OK;
|
||||||
|
* }
|
||||||
|
*
|
||||||
|
* Here, exception callback for the main program will be 'exception_cb'. Note
|
||||||
|
* that this attribute can only be used once, and multiple exception callbacks
|
||||||
|
* specified for the main program will lead to verification error.
|
||||||
|
*/
|
||||||
|
#define __exception_cb(name) __attribute__((btf_decl_tag("exception_callback:" #name)))
|
||||||
|
|
||||||
|
#define __bpf_assert_signed(x) _Generic((x), \
|
||||||
|
unsigned long: 0, \
|
||||||
|
unsigned long long: 0, \
|
||||||
|
signed long: 1, \
|
||||||
|
signed long long: 1 \
|
||||||
|
)
|
||||||
|
|
||||||
|
#define __bpf_assert_check(LHS, op, RHS) \
|
||||||
|
_Static_assert(sizeof(&(LHS)), "1st argument must be an lvalue expression"); \
|
||||||
|
_Static_assert(sizeof(LHS) == 8, "Only 8-byte integers are supported\n"); \
|
||||||
|
_Static_assert(__builtin_constant_p(__bpf_assert_signed(LHS)), "internal static assert"); \
|
||||||
|
_Static_assert(__builtin_constant_p((RHS)), "2nd argument must be a constant expression")
|
||||||
|
|
||||||
|
#define __bpf_assert(LHS, op, cons, RHS, VAL) \
|
||||||
|
({ \
|
||||||
|
(void)bpf_throw; \
|
||||||
|
asm volatile ("if %[lhs] " op " %[rhs] goto +2; r1 = %[value]; call bpf_throw" \
|
||||||
|
: : [lhs] "r"(LHS), [rhs] cons(RHS), [value] "ri"(VAL) : ); \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define __bpf_assert_op_sign(LHS, op, cons, RHS, VAL, supp_sign) \
|
||||||
|
({ \
|
||||||
|
__bpf_assert_check(LHS, op, RHS); \
|
||||||
|
if (__bpf_assert_signed(LHS) && !(supp_sign)) \
|
||||||
|
__bpf_assert(LHS, "s" #op, cons, RHS, VAL); \
|
||||||
|
else \
|
||||||
|
__bpf_assert(LHS, #op, cons, RHS, VAL); \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define __bpf_assert_op(LHS, op, RHS, VAL, supp_sign) \
|
||||||
|
({ \
|
||||||
|
if (sizeof(typeof(RHS)) == 8) { \
|
||||||
|
const typeof(RHS) rhs_var = (RHS); \
|
||||||
|
__bpf_assert_op_sign(LHS, op, "r", rhs_var, VAL, supp_sign); \
|
||||||
|
} else { \
|
||||||
|
__bpf_assert_op_sign(LHS, op, "i", RHS, VAL, supp_sign); \
|
||||||
|
} \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define __cmp_cannot_be_signed(x) \
|
||||||
|
__builtin_strcmp(#x, "==") == 0 || __builtin_strcmp(#x, "!=") == 0 || \
|
||||||
|
__builtin_strcmp(#x, "&") == 0
|
||||||
|
|
||||||
|
#define __is_signed_type(type) (((type)(-1)) < (type)1)
|
||||||
|
|
||||||
|
#define __bpf_cmp(LHS, OP, PRED, RHS, DEFAULT) \
|
||||||
|
({ \
|
||||||
|
__label__ l_true; \
|
||||||
|
bool ret = DEFAULT; \
|
||||||
|
asm volatile goto("if %[lhs] " OP " %[rhs] goto %l[l_true]" \
|
||||||
|
:: [lhs] "r"((short)LHS), [rhs] PRED (RHS) :: l_true); \
|
||||||
|
ret = !DEFAULT; \
|
||||||
|
l_true: \
|
||||||
|
ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
/* C type conversions coupled with comparison operator are tricky.
|
||||||
|
* Make sure BPF program is compiled with -Wsign-compare then
|
||||||
|
* __lhs OP __rhs below will catch the mistake.
|
||||||
|
* Be aware that we check only __lhs to figure out the sign of compare.
|
||||||
|
*/
|
||||||
|
#define _bpf_cmp(LHS, OP, RHS, UNLIKELY) \
|
||||||
|
({ \
|
||||||
|
typeof(LHS) __lhs = (LHS); \
|
||||||
|
typeof(RHS) __rhs = (RHS); \
|
||||||
|
bool ret; \
|
||||||
|
_Static_assert(sizeof(&(LHS)), "1st argument must be an lvalue expression"); \
|
||||||
|
(void)(__lhs OP __rhs); \
|
||||||
|
if (__cmp_cannot_be_signed(OP) || !__is_signed_type(typeof(__lhs))) { \
|
||||||
|
if (sizeof(__rhs) == 8) \
|
||||||
|
/* "i" will truncate 64-bit constant into s32, \
|
||||||
|
* so we have to use extra register via "r". \
|
||||||
|
*/ \
|
||||||
|
ret = __bpf_cmp(__lhs, #OP, "r", __rhs, UNLIKELY); \
|
||||||
|
else \
|
||||||
|
ret = __bpf_cmp(__lhs, #OP, "ri", __rhs, UNLIKELY); \
|
||||||
|
} else { \
|
||||||
|
if (sizeof(__rhs) == 8) \
|
||||||
|
ret = __bpf_cmp(__lhs, "s"#OP, "r", __rhs, UNLIKELY); \
|
||||||
|
else \
|
||||||
|
ret = __bpf_cmp(__lhs, "s"#OP, "ri", __rhs, UNLIKELY); \
|
||||||
|
} \
|
||||||
|
ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
#ifndef bpf_cmp_unlikely
|
||||||
|
#define bpf_cmp_unlikely(LHS, OP, RHS) _bpf_cmp(LHS, OP, RHS, true)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef bpf_cmp_likely
|
||||||
|
#define bpf_cmp_likely(LHS, OP, RHS) \
|
||||||
|
({ \
|
||||||
|
bool ret = 0; \
|
||||||
|
if (__builtin_strcmp(#OP, "==") == 0) \
|
||||||
|
ret = _bpf_cmp(LHS, !=, RHS, false); \
|
||||||
|
else if (__builtin_strcmp(#OP, "!=") == 0) \
|
||||||
|
ret = _bpf_cmp(LHS, ==, RHS, false); \
|
||||||
|
else if (__builtin_strcmp(#OP, "<=") == 0) \
|
||||||
|
ret = _bpf_cmp(LHS, >, RHS, false); \
|
||||||
|
else if (__builtin_strcmp(#OP, "<") == 0) \
|
||||||
|
ret = _bpf_cmp(LHS, >=, RHS, false); \
|
||||||
|
else if (__builtin_strcmp(#OP, ">") == 0) \
|
||||||
|
ret = _bpf_cmp(LHS, <=, RHS, false); \
|
||||||
|
else if (__builtin_strcmp(#OP, ">=") == 0) \
|
||||||
|
ret = _bpf_cmp(LHS, <, RHS, false); \
|
||||||
|
else \
|
||||||
|
asm volatile("r0 " #OP " invalid compare"); \
|
||||||
|
ret; \
|
||||||
|
})
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note that cond_break can only be portably used in the body of a breakable
|
||||||
|
* construct, whereas can_loop can be used anywhere.
|
||||||
|
*/
|
||||||
|
#ifdef __BPF_FEATURE_MAY_GOTO
|
||||||
|
#define can_loop \
|
||||||
|
({ __label__ l_break, l_continue; \
|
||||||
|
bool ret = true; \
|
||||||
|
asm volatile goto("may_goto %l[l_break]" \
|
||||||
|
:::: l_break); \
|
||||||
|
goto l_continue; \
|
||||||
|
l_break: ret = false; \
|
||||||
|
l_continue:; \
|
||||||
|
ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define cond_break \
|
||||||
|
({ __label__ l_break, l_continue; \
|
||||||
|
asm volatile goto("may_goto %l[l_break]" \
|
||||||
|
:::: l_break); \
|
||||||
|
goto l_continue; \
|
||||||
|
l_break: break; \
|
||||||
|
l_continue:; \
|
||||||
|
})
|
||||||
|
#else
|
||||||
|
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||||
|
#define can_loop \
|
||||||
|
({ __label__ l_break, l_continue; \
|
||||||
|
bool ret = true; \
|
||||||
|
asm volatile goto("1:.byte 0xe5; \
|
||||||
|
.byte 0; \
|
||||||
|
.long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \
|
||||||
|
.short 0" \
|
||||||
|
:::: l_break); \
|
||||||
|
goto l_continue; \
|
||||||
|
l_break: ret = false; \
|
||||||
|
l_continue:; \
|
||||||
|
ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define cond_break \
|
||||||
|
({ __label__ l_break, l_continue; \
|
||||||
|
asm volatile goto("1:.byte 0xe5; \
|
||||||
|
.byte 0; \
|
||||||
|
.long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \
|
||||||
|
.short 0" \
|
||||||
|
:::: l_break); \
|
||||||
|
goto l_continue; \
|
||||||
|
l_break: break; \
|
||||||
|
l_continue:; \
|
||||||
|
})
|
||||||
|
#else
|
||||||
|
#define can_loop \
|
||||||
|
({ __label__ l_break, l_continue; \
|
||||||
|
bool ret = true; \
|
||||||
|
asm volatile goto("1:.byte 0xe5; \
|
||||||
|
.byte 0; \
|
||||||
|
.long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \
|
||||||
|
.short 0" \
|
||||||
|
:::: l_break); \
|
||||||
|
goto l_continue; \
|
||||||
|
l_break: ret = false; \
|
||||||
|
l_continue:; \
|
||||||
|
ret; \
|
||||||
|
})
|
||||||
|
|
||||||
|
#define cond_break \
|
||||||
|
({ __label__ l_break, l_continue; \
|
||||||
|
asm volatile goto("1:.byte 0xe5; \
|
||||||
|
.byte 0; \
|
||||||
|
.long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \
|
||||||
|
.short 0" \
|
||||||
|
:::: l_break); \
|
||||||
|
goto l_continue; \
|
||||||
|
l_break: break; \
|
||||||
|
l_continue:; \
|
||||||
|
})
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef bpf_nop_mov
|
||||||
|
#define bpf_nop_mov(var) \
|
||||||
|
asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* emit instruction:
|
||||||
|
* rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as
|
||||||
|
*/
|
||||||
|
#ifndef bpf_addr_space_cast
|
||||||
|
#define bpf_addr_space_cast(var, dst_as, src_as)\
|
||||||
|
asm volatile(".byte 0xBF; \
|
||||||
|
.ifc %[reg], r0; \
|
||||||
|
.byte 0x00; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r1; \
|
||||||
|
.byte 0x11; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r2; \
|
||||||
|
.byte 0x22; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r3; \
|
||||||
|
.byte 0x33; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r4; \
|
||||||
|
.byte 0x44; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r5; \
|
||||||
|
.byte 0x55; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r6; \
|
||||||
|
.byte 0x66; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r7; \
|
||||||
|
.byte 0x77; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r8; \
|
||||||
|
.byte 0x88; \
|
||||||
|
.endif; \
|
||||||
|
.ifc %[reg], r9; \
|
||||||
|
.byte 0x99; \
|
||||||
|
.endif; \
|
||||||
|
.short %[off]; \
|
||||||
|
.long %[as]" \
|
||||||
|
: [reg]"+r"(var) \
|
||||||
|
: [off]"i"(BPF_ADDR_SPACE_CAST) \
|
||||||
|
, [as]"i"((dst_as << 16) | src_as));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void bpf_preempt_disable(void) __weak __ksym;
|
||||||
|
void bpf_preempt_enable(void) __weak __ksym;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
} __bpf_preempt_t;
|
||||||
|
|
||||||
|
static inline __bpf_preempt_t __bpf_preempt_constructor(void)
|
||||||
|
{
|
||||||
|
__bpf_preempt_t ret = {};
|
||||||
|
|
||||||
|
bpf_preempt_disable();
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
static inline void __bpf_preempt_destructor(__bpf_preempt_t *t)
|
||||||
|
{
|
||||||
|
bpf_preempt_enable();
|
||||||
|
}
|
||||||
|
#define bpf_guard_preempt() \
|
||||||
|
__bpf_preempt_t ___bpf_apply(preempt, __COUNTER__) \
|
||||||
|
__attribute__((__unused__, __cleanup__(__bpf_preempt_destructor))) = \
|
||||||
|
__bpf_preempt_constructor()
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Assert that a conditional expression is true.
|
||||||
|
* Returns
|
||||||
|
* Void.
|
||||||
|
* Throws
|
||||||
|
* An exception with the value zero when the assertion fails.
|
||||||
|
*/
|
||||||
|
#define bpf_assert(cond) if (!(cond)) bpf_throw(0);
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Assert that a conditional expression is true.
|
||||||
|
* Returns
|
||||||
|
* Void.
|
||||||
|
* Throws
|
||||||
|
* An exception with the specified value when the assertion fails.
|
||||||
|
*/
|
||||||
|
#define bpf_assert_with(cond, value) if (!(cond)) bpf_throw(value);
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Assert that LHS is in the range [BEG, END] (inclusive of both). This
|
||||||
|
* statement updates the known bounds of LHS during verification. Note
|
||||||
|
* that both BEG and END must be constant values, and must fit within the
|
||||||
|
* data type of LHS.
|
||||||
|
* Returns
|
||||||
|
* Void.
|
||||||
|
* Throws
|
||||||
|
* An exception with the value zero when the assertion fails.
|
||||||
|
*/
|
||||||
|
#define bpf_assert_range(LHS, BEG, END) \
|
||||||
|
({ \
|
||||||
|
_Static_assert(BEG <= END, "BEG must be <= END"); \
|
||||||
|
barrier_var(LHS); \
|
||||||
|
__bpf_assert_op(LHS, >=, BEG, 0, false); \
|
||||||
|
__bpf_assert_op(LHS, <=, END, 0, false); \
|
||||||
|
})
|
||||||
|
|
||||||
|
/* Description
|
||||||
|
* Assert that LHS is in the range [BEG, END] (inclusive of both). This
|
||||||
|
* statement updates the known bounds of LHS during verification. Note
|
||||||
|
* that both BEG and END must be constant values, and must fit within the
|
||||||
|
* data type of LHS.
|
||||||
|
* Returns
|
||||||
|
* Void.
|
||||||
|
* Throws
|
||||||
|
* An exception with the specified value when the assertion fails.
|
||||||
|
*/
|
||||||
|
#define bpf_assert_range_with(LHS, BEG, END, value) \
|
||||||
|
({ \
|
||||||
|
_Static_assert(BEG <= END, "BEG must be <= END"); \
|
||||||
|
barrier_var(LHS); \
|
||||||
|
__bpf_assert_op(LHS, >=, BEG, value, false); \
|
||||||
|
__bpf_assert_op(LHS, <=, END, value, false); \
|
||||||
|
})
|
||||||
|
|
||||||
|
struct bpf_iter_css_task;
|
||||||
|
struct cgroup_subsys_state;
|
||||||
|
extern int bpf_iter_css_task_new(struct bpf_iter_css_task *it,
|
||||||
|
struct cgroup_subsys_state *css, unsigned int flags) __weak __ksym;
|
||||||
|
extern struct task_struct *bpf_iter_css_task_next(struct bpf_iter_css_task *it) __weak __ksym;
|
||||||
|
extern void bpf_iter_css_task_destroy(struct bpf_iter_css_task *it) __weak __ksym;
|
||||||
|
|
||||||
|
struct bpf_iter_task;
|
||||||
|
extern int bpf_iter_task_new(struct bpf_iter_task *it,
|
||||||
|
struct task_struct *task, unsigned int flags) __weak __ksym;
|
||||||
|
extern struct task_struct *bpf_iter_task_next(struct bpf_iter_task *it) __weak __ksym;
|
||||||
|
extern void bpf_iter_task_destroy(struct bpf_iter_task *it) __weak __ksym;
|
||||||
|
|
||||||
|
struct bpf_iter_css;
|
||||||
|
extern int bpf_iter_css_new(struct bpf_iter_css *it,
|
||||||
|
struct cgroup_subsys_state *start, unsigned int flags) __weak __ksym;
|
||||||
|
extern struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
|
||||||
|
extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
|
||||||
|
|
||||||
|
extern int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags) __weak __ksym;
|
||||||
|
extern int bpf_wq_start(struct bpf_wq *wq, unsigned int flags) __weak __ksym;
|
||||||
|
extern int bpf_wq_set_callback_impl(struct bpf_wq *wq,
|
||||||
|
int (callback_fn)(void *map, int *key, void *value),
|
||||||
|
unsigned int flags__k, void *aux__ign) __ksym;
|
||||||
|
#define bpf_wq_set_callback(timer, cb, flags) \
|
||||||
|
bpf_wq_set_callback_impl(timer, cb, flags, NULL)
|
||||||
|
|
||||||
|
struct bpf_iter_kmem_cache;
|
||||||
|
extern int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it) __weak __ksym;
|
||||||
|
extern struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it) __weak __ksym;
|
||||||
|
extern void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it) __weak __ksym;
|
||||||
|
|
||||||
|
#endif
|
||||||
69
src/features/bpf_wq/wq_simple.bpf.c
Normal file
69
src/features/bpf_wq/wq_simple.bpf.c
Normal file
@@ -0,0 +1,69 @@
// SPDX-License-Identifier: GPL-2.0
/* Simple BPF workqueue example: schedule asynchronous work from a BPF
 * program and observe its effects from userspace. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

/* GPL license is required for the bpf_wq kfuncs used below. */
char LICENSE[] SEC("license") = "GPL";
|
||||||
|
|
||||||
|
/* Element with embedded workqueue */
|
||||||
|
struct elem {
|
||||||
|
int value;
|
||||||
|
struct bpf_wq work;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Array to store our element */
|
||||||
|
struct {
|
||||||
|
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||||
|
__uint(max_entries, 1);
|
||||||
|
__type(key, int);
|
||||||
|
__type(value, struct elem);
|
||||||
|
} array SEC(".maps");
|
||||||
|
|
||||||
|
/* Result variables */
|
||||||
|
__u32 wq_executed = 0;
|
||||||
|
__u32 main_executed = 0;
|
||||||
|
|
||||||
|
/* Workqueue callback - runs asynchronously in workqueue context */
|
||||||
|
static int wq_callback(void *map, int *key, void *value)
|
||||||
|
{
|
||||||
|
struct elem *val = value;
|
||||||
|
/* This runs later in workqueue context */
|
||||||
|
wq_executed = 1;
|
||||||
|
val->value = 42; /* Modify the value asynchronously */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Main program - schedules work */
|
||||||
|
SEC("fentry/do_unlinkat")
|
||||||
|
int test_workqueue(void *ctx)
|
||||||
|
{
|
||||||
|
struct elem init = {.value = 0}, *val;
|
||||||
|
struct bpf_wq *wq;
|
||||||
|
int key = 0;
|
||||||
|
|
||||||
|
main_executed = 1;
|
||||||
|
|
||||||
|
/* Initialize element in map */
|
||||||
|
bpf_map_update_elem(&array, &key, &init, 0);
|
||||||
|
|
||||||
|
/* Get element from map */
|
||||||
|
val = bpf_map_lookup_elem(&array, &key);
|
||||||
|
if (!val)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Initialize workqueue */
|
||||||
|
wq = &val->work;
|
||||||
|
if (bpf_wq_init(wq, &array, 0) != 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Set callback function */
|
||||||
|
if (bpf_wq_set_callback(wq, wq_callback, 0))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Schedule work to run asynchronously */
|
||||||
|
if (bpf_wq_start(wq, 0))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
63
src/features/bpf_wq/wq_simple.c
Normal file
63
src/features/bpf_wq/wq_simple.c
Normal file
// SPDX-License-Identifier: GPL-2.0
/* Userspace test for the BPF workqueue example: loads the skeleton,
 * triggers do_unlinkat(), then checks the flags the BPF side set. */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/resource.h>
#include <bpf/libbpf.h>
#include "wq_simple.skel.h"
|
||||||
|
|
||||||
|
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
|
||||||
|
{
|
||||||
|
return vfprintf(stderr, format, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
struct wq_simple_bpf *skel;
|
||||||
|
int err, fd;
|
||||||
|
|
||||||
|
libbpf_set_print(libbpf_print_fn);
|
||||||
|
|
||||||
|
/* Open and load BPF application */
|
||||||
|
skel = wq_simple_bpf__open_and_load();
|
||||||
|
if (!skel) {
|
||||||
|
fprintf(stderr, "Failed to open and load BPF skeleton\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Attach tracepoint handler */
|
||||||
|
err = wq_simple_bpf__attach(skel);
|
||||||
|
if (err) {
|
||||||
|
fprintf(stderr, "Failed to attach BPF skeleton\n");
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("BPF workqueue program attached. Triggering unlink syscall...\n");
|
||||||
|
|
||||||
|
/* Create a temporary file to trigger do_unlinkat */
|
||||||
|
fd = open("/tmp/wq_test_file", O_CREAT | O_WRONLY, 0644);
|
||||||
|
if (fd >= 0) {
|
||||||
|
close(fd);
|
||||||
|
unlink("/tmp/wq_test_file");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Give workqueue time to execute */
|
||||||
|
sleep(1);
|
||||||
|
|
||||||
|
/* Check results */
|
||||||
|
printf("\nResults:\n");
|
||||||
|
printf(" main_executed = %u (expected: 1)\n", skel->bss->main_executed);
|
||||||
|
printf(" wq_executed = %u (expected: 1)\n", skel->bss->wq_executed);
|
||||||
|
|
||||||
|
if (skel->bss->main_executed == 1 && skel->bss->wq_executed == 1) {
|
||||||
|
printf("\n✓ Test PASSED!\n");
|
||||||
|
} else {
|
||||||
|
printf("\n✗ Test FAILED!\n");
|
||||||
|
err = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
wq_simple_bpf__destroy(skel);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user