diff --git a/src/cgroup/.gitignore b/src/cgroup/.gitignore new file mode 100644 index 0000000..e32dbea --- /dev/null +++ b/src/cgroup/.gitignore @@ -0,0 +1,20 @@ +# Build artifacts +*.o +*.skel.h +*.skel.json +.output/ +cgroup_guard + +# Test artifacts +*.tmp +*.err + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db diff --git a/src/cgroup/Makefile b/src/cgroup/Makefile new file mode 100644 index 0000000..a6c27a9 --- /dev/null +++ b/src/cgroup/Makefile @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang +LIBBPF_SRC := $(abspath ../third_party/libbpf/src) +BPFTOOL_SRC := $(abspath ../third_party/bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../third_party/vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../third_party/libbpf/include/uapi -I$(dir $(VMLINUX)) +CFLAGS := -g -Wall +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = cgroup_guard + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - < /dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' \ + "$(1)" \ + "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \ + "$(if $(3), $(3))"; + MAKEFLAGS += --no-print-directory +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +.PHONY: all +all: $(APPS) + +.PHONY: clean +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) $(APPS) + +$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +# Build libbpf +$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf + $(call msg,LIB,$@) + $(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \ + OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \ + INCLUDEDIR= LIBDIR= UAPIDIR= \ + install + +# Build bpftool +$(BPFTOOL): | $(BPFTOOL_OUTPUT) + $(call msg,BPFTOOL,$@) + $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap + +# Build BPF code +$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \ + $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \ + -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + $(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + +# Generate BPF skeletons +$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + +# Build user-space code +$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h + +$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) + $(call msg,CC,$@) + $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + +# Build application binary 
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@ + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/src/cgroup/README.md b/src/cgroup/README.md new file mode 100644 index 0000000..b109378 --- /dev/null +++ b/src/cgroup/README.md @@ -0,0 +1,193 @@ +# eBPF Tutorial: cgroup-based Policy Control + +This tutorial demonstrates how to use cgroup eBPF programs to implement per-cgroup policy controls for networking, device access, and sysctl operations. + +## What is cgroup eBPF? + +**cgroup eBPF** allows you to attach eBPF programs to cgroups (control groups) to enforce policies based on process/container membership. Unlike XDP/tc which work on network interfaces, cgroup eBPF works at the process level: + +- Policies only affect processes in the target cgroup +- Perfect for container/multi-tenant/sandbox isolation +- Covers: network access control, socket options, sysctl access, device access + +When a cgroup eBPF program denies an operation, userspace typically sees `EPERM` (Operation not permitted). + +## cgroup eBPF Hook Points + +### 1. 
`BPF_PROG_TYPE_CGROUP_SOCK_ADDR` - Socket Address Hooks + +Triggered on socket address syscalls (bind/connect/sendmsg/recvmsg): + +| Hook | Section Name | Description | +|------|--------------|-------------| +| IPv4 bind | `cgroup/bind4` | Filter bind() calls | +| IPv6 bind | `cgroup/bind6` | Filter bind() calls | +| IPv4 connect | `cgroup/connect4` | Filter connect() calls | +| IPv6 connect | `cgroup/connect6` | Filter connect() calls | +| UDP sendmsg | `cgroup/sendmsg4`, `cgroup/sendmsg6` | Filter UDP sends | +| UDP recvmsg | `cgroup/recvmsg4`, `cgroup/recvmsg6` | Filter UDP receives | +| Unix connect | `cgroup/connect_unix` | Filter Unix socket connect | + +**Context**: `struct bpf_sock_addr` - contains `user_ip4`, `user_port` (network byte order) + +**Return semantics**: `return 1` = allow, `return 0` = deny (EPERM) + +### 2. `BPF_PROG_TYPE_CGROUP_DEVICE` - Device Access Control + +| Hook | Section Name | Description | +|------|--------------|-------------| +| Device access | `cgroup/dev` | Filter device open/read/write/mknod | + +**Context**: `struct bpf_cgroup_dev_ctx` - contains `major`, `minor`, `access_type` + +**Return semantics**: `return 0` = deny (EPERM), non-zero = allow + +### 3. `BPF_PROG_TYPE_CGROUP_SYSCTL` - Sysctl Access Control + +| Hook | Section Name | Description | +|------|--------------|-------------| +| Sysctl access | `cgroup/sysctl` | Filter /proc/sys reads/writes | + +**Context**: `struct bpf_sysctl` - use `bpf_sysctl_get_name()` to get sysctl name + +**Return semantics**: `return 0` = reject (EPERM), `return 1` = proceed + +### 4. Other cgroup Hooks + +- `cgroup_skb/ingress`, `cgroup_skb/egress` - Packet-level filtering +- `cgroup/getsockopt`, `cgroup/setsockopt` - Socket option filtering +- `cgroup/sock_create`, `cgroup/sock_release` - Socket lifecycle +- `sockops` - TCP-level optimization (attached via `BPF_CGROUP_SOCK_OPS`) + +## This Tutorial: cgroup Policy Guard + +We implement a single eBPF object with three programs: + +1. 
**Network (TCP)**: Block `connect()` to a specified destination port +2. **Device**: Block access to a specified `major:minor` device +3. **Sysctl**: Block reading a specified sysctl (read-only, safer for testing) + +Events are sent to userspace via ringbuf for observability. + +## Building + +```bash +cd src/cgroup +make +``` + +## Running + +### Terminal A: Start the loader + +```bash +# Block: TCP port 9090, /dev/null (1:3), reading kernel/hostname +sudo ./cgroup_guard \ + --cgroup /sys/fs/cgroup/ebpf_demo \ + --block-port 9090 \ + --deny-device 1:3 \ + --deny-sysctl kernel/hostname +``` + +You should see: +``` +Attached to cgroup: /sys/fs/cgroup/ebpf_demo +Config: block_port=9090, deny_device=1:3, deny_sysctl_read=kernel/hostname +Press Ctrl-C to stop. +``` + +### Terminal B: Start test servers (outside cgroup) + +```bash +# Start two HTTP servers +python3 -m http.server 8080 --bind 127.0.0.1 & +python3 -m http.server 9090 --bind 127.0.0.1 & +``` + +### Terminal C: Test from within the cgroup + +```bash +sudo bash -c ' +echo $$ > /sys/fs/cgroup/ebpf_demo/cgroup.procs + +echo "== TCP test ==" +curl -s http://127.0.0.1:8080 >/dev/null && echo "8080 OK" +curl -s http://127.0.0.1:9090 >/dev/null && echo "9090 OK (unexpected)" || echo "9090 BLOCKED (expected)" + +echo +echo "== Device test ==" +cat /dev/null && echo "/dev/null OK (unexpected)" || echo "/dev/null BLOCKED (expected)" + +echo +echo "== Sysctl test ==" +cat /proc/sys/kernel/hostname && echo "sysctl read OK (unexpected)" || echo "sysctl read BLOCKED (expected)" +' +``` + +Expected output: +- `8080 OK` - Port 8080 is allowed +- `9090 BLOCKED (expected)` - Port 9090 is blocked +- `/dev/null BLOCKED (expected)` - Device 1:3 is blocked +- `sysctl read BLOCKED (expected)` - Reading kernel/hostname is blocked + +### Terminal A output (events) + +``` +[DENY connect4] pid=12345 comm=curl daddr=127.0.0.1 dport=9090 proto=6 +[DENY device] pid=12346 comm=cat major=1 minor=3 access_type=0x... 
+[DENY sysctl] pid=12347 comm=cat write=0 name=kernel/hostname +``` + +## Verifying with bpftool + +```bash +sudo bpftool cgroup tree /sys/fs/cgroup/ebpf_demo +``` + +## Key Implementation Details + +### 1. Network byte order for sock_addr + +```c +// user_port is network byte order, must convert +__u16 dport = bpf_ntohs((__u16)ctx->user_port); +``` + +### 2. Return value semantics + +```c +// For sock_addr (connect4/bind4/etc): +return 1; // allow +return 0; // deny -> EPERM + +// For device: +return 0; // deny -> EPERM +return 1; // allow + +// For sysctl: +return 0; // reject -> EPERM +return 1; // proceed +``` + +### 3. Configuration via .rodata + +```c +// BPF side - const volatile for CO-RE +const volatile __u16 blocked_tcp_dport = 0; + +// Userspace - set before load +skel->rodata->blocked_tcp_dport = (__u16)port; +``` + +## Files + +- `cgroup_guard.h` - Shared data structures +- `cgroup_guard.bpf.c` - eBPF programs (connect4, device, sysctl hooks) +- `cgroup_guard.c` - Userspace loader +- `Makefile` - Build system + +## References + +- [Kernel docs: libbpf program types](https://docs.kernel.org/bpf/libbpf/program_types.html) +- [eBPF docs: CGROUP_SOCK_ADDR](https://docs.ebpf.io/linux/program-type/BPF_PROG_TYPE_CGROUP_SOCK_ADDR/) +- [eBPF docs: CGROUP_DEVICE](https://docs.ebpf.io/linux/program-type/BPF_PROG_TYPE_CGROUP_DEVICE/) diff --git a/src/cgroup/README.zh.md b/src/cgroup/README.zh.md new file mode 100644 index 0000000..f64c3c4 --- /dev/null +++ b/src/cgroup/README.zh.md @@ -0,0 +1,606 @@ +# eBPF 实例教程:基于 cgroup 的策略控制 + +你是否需要对容器或特定进程组实施网络访问控制,但又不想影响整个系统?或者你需要限制某些进程访问特定设备,同时允许其他进程正常使用?传统的 iptables 和设备权限是全局生效的,无法做到按进程组精细控制。 + +这就是 **cgroup eBPF** 解决的问题。通过将 eBPF 程序挂载到 cgroup(控制组),你可以实现按进程归属的策略控制,只有属于特定 cgroup 的进程才会受到影响。这使得容器隔离、多租户安全和沙箱环境成为可能。在本教程中,我们将构建一个完整的"策略守卫"程序,同时演示 TCP 连接过滤、设备访问控制和 sysctl 读取限制三种 cgroup eBPF 用法。 + +## cgroup eBPF 简介:按进程组做策略 + +cgroup eBPF 的核心思想很简单:把 eBPF 程序挂到 cgroup 上,这个 cgroup 里的所有进程都会受到这个程序的控制。与 XDP/tc 按网卡过滤流量不同,cgroup eBPF 
按进程归属过滤,你把容器放进一个 cgroup,挂上策略程序,这个容器的网络访问、设备访问、sysctl 读写就都在你的控制之下了。其他 cgroup 里的进程完全不受影响。 + +这种模型非常适合容器和多租户场景。Kubernetes 的 NetworkPolicy 底层就用了 cgroup eBPF。你也可以用它来做设备隔离(比如限制哪些容器能访问 GPU)、安全沙箱(禁止读取敏感 sysctl)等。当 cgroup eBPF 程序拒绝一个操作时,用户态的系统调用会返回 `EPERM`(操作不允许)。 + +## cgroup eBPF 挂载点 + +### 1. `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` - Socket 地址钩子 + +在 socket 地址相关的系统调用(bind/connect/sendmsg/recvmsg)上触发: + +| 钩子 | Section 名称 | 描述 | +|------|--------------|------| +| IPv4 bind | `cgroup/bind4` | 过滤 bind() 调用 | +| IPv6 bind | `cgroup/bind6` | 过滤 bind() 调用 | +| IPv4 connect | `cgroup/connect4` | 过滤 connect() 调用 | +| IPv6 connect | `cgroup/connect6` | 过滤 connect() 调用 | +| UDP sendmsg | `cgroup/sendmsg4`, `cgroup/sendmsg6` | 过滤 UDP 发送 | +| UDP recvmsg | `cgroup/recvmsg4`, `cgroup/recvmsg6` | 过滤 UDP 接收 | +| Unix connect | `cgroup/connect_unix` | 过滤 Unix socket 连接 | + +**上下文**:`struct bpf_sock_addr` - 包含 `user_ip4`、`user_port`(网络字节序) + +**返回语义**:`return 1` = 允许,`return 0` = 拒绝(EPERM) + +### 2. `BPF_PROG_TYPE_CGROUP_DEVICE` - 设备访问控制 + +| 钩子 | Section 名称 | 描述 | +|------|--------------|------| +| 设备访问 | `cgroup/dev` | 过滤设备 open/read/write/mknod | + +**上下文**:`struct bpf_cgroup_dev_ctx` - 包含 `major`、`minor`、`access_type` + +**返回语义**:`return 0` = 拒绝(EPERM),非零 = 允许 + +### 3. `BPF_PROG_TYPE_CGROUP_SYSCTL` - Sysctl 访问控制 + +| 钩子 | Section 名称 | 描述 | +|------|--------------|------| +| Sysctl 访问 | `cgroup/sysctl` | 过滤 /proc/sys 的读写 | + +**上下文**:`struct bpf_sysctl` - 使用 `bpf_sysctl_get_name()` 获取 sysctl 名称 + +**返回语义**:`return 0` = 拒绝(EPERM),`return 1` = 允许 + +### 4. 其他 cgroup 钩子 + +- `cgroup_skb/ingress`、`cgroup_skb/egress` - 包级过滤 +- `cgroup/getsockopt`、`cgroup/setsockopt` - Socket 选项过滤 +- `cgroup/sock_create`、`cgroup/sock_release` - Socket 生命周期 +- `sockops` - TCP 层优化(通过 `BPF_CGROUP_SOCK_OPS` 挂载) + +## 本教程:cgroup 策略守卫 + +我们实现一个包含三个程序的 eBPF 对象: + +1. **网络(TCP)**:阻断到指定目的端口的 `connect()` +2. **设备**:阻断对指定 `major:minor` 设备的访问 +3. 
**Sysctl**:阻断读取指定的 sysctl(只读,测试更安全) + +事件通过 ringbuf 发送到用户态以便观测。 + +## 实现 + +### 共享头文件:cgroup_guard.h + +这个头文件定义了内核态和用户态共享的数据结构: + +```c +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +#ifndef __CGROUP_GUARD_H +#define __CGROUP_GUARD_H + +#ifndef TASK_COMM_LEN +#define TASK_COMM_LEN 16 +#endif + +#define SYSCTL_NAME_LEN 64 + +enum event_type { + EVENT_CONNECT4 = 1, + EVENT_DEVICE = 2, + EVENT_SYSCTL = 3, +}; + +struct event { + __u64 ts_ns; + __u32 pid; + __u32 type; + char comm[TASK_COMM_LEN]; + + union { + struct { + __u32 daddr; /* IPv4, network order */ + __u16 dport; /* host order */ + __u16 proto; /* e.g. 6 for TCP */ + } connect4; + + struct { + __u32 major; + __u32 minor; + __u32 access_type; + } device; + + struct { + __u32 write; + char name[SYSCTL_NAME_LEN]; + } sysctl; + }; +}; + +#endif /* __CGROUP_GUARD_H */ +``` + +`event` 结构使用 union 来存储不同类型事件的特定数据,这样可以节省空间并保持统一的事件格式。 + +### eBPF 程序:cgroup_guard.bpf.c + +```c +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* cgroup_guard.bpf.c - cgroup eBPF policy guard + * + * This program demonstrates three types of cgroup eBPF hooks: + * 1. cgroup/connect4 - TCP connection filtering + * 2. cgroup/dev - Device access control + * 3. 
cgroup/sysctl - Sysctl read/write control + */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#include "cgroup_guard.h" + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +/* ===== Configurable options: set by userspace before load ===== */ +#define IPPROTO_TCP 6 + +const volatile __u16 blocked_tcp_dport = 0; /* host order */ +const volatile __u32 blocked_dev_major = 0; +const volatile __u32 blocked_dev_minor = 0; +const volatile char denied_sysctl_name[SYSCTL_NAME_LEN] = {}; /* NUL-terminated */ + +/* ===== ringbuf: send denied events to userspace ===== */ +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 24); /* 16MB */ +} events SEC(".maps"); + +static __always_inline void fill_common(struct event *e, __u32 type) +{ + e->ts_ns = bpf_ktime_get_ns(); + e->type = type; + e->pid = (__u32)(bpf_get_current_pid_tgid() >> 32); + bpf_get_current_comm(&e->comm, sizeof(e->comm)); +} + +/* Compare two strings, return 1 if equal, 0 if not + * Note: b is volatile to handle const volatile rodata arrays correctly */ +static __always_inline int str_eq(const char *a, const volatile char *b, int max_len) +{ +#pragma unroll + for (int i = 0; i < SYSCTL_NAME_LEN; i++) { + char ca = a[i]; + char cb = b[i]; + if (ca != cb) + return 0; + if (ca == '\0') + return 1; + } + return 1; +} + +/* ===== 1) Network: block TCP connect4 to specified port ===== + * ctx: struct bpf_sock_addr + * user_ip4/user_port: network byte order (need conversion) + * + * Return semantics: + * - return 1: allow + * - return 0: deny (userspace gets EPERM) + */ +SEC("cgroup/connect4") +int cg_connect4(struct bpf_sock_addr *ctx) +{ + if (blocked_tcp_dport == 0) + return 1; + + if (ctx->protocol != IPPROTO_TCP) + return 1; + + __u16 dport = bpf_ntohs((__u16)ctx->user_port); + if (dport != blocked_tcp_dport) + return 1; + + struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0); + if (e) { + fill_common(e, EVENT_CONNECT4); + e->connect4.daddr = ctx->user_ip4; /* network order */ + 
e->connect4.dport = dport; /* host order */ + e->connect4.proto = ctx->protocol; + bpf_ringbuf_submit(e, 0); + } + + return 0; /* deny -> userspace gets EPERM on connect */ +} + +/* ===== 2) Device: block access to specified major:minor ===== + * ctx: struct bpf_cgroup_dev_ctx { access_type, major, minor } + * + * Return semantics: + * - return 0: deny (userspace gets EPERM) + * - return non-zero: allow + */ +SEC("cgroup/dev") +int cg_dev(struct bpf_cgroup_dev_ctx *ctx) +{ + if (blocked_dev_major == 0 && blocked_dev_minor == 0) + return 1; + + if (ctx->major != blocked_dev_major || ctx->minor != blocked_dev_minor) + return 1; + + struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0); + if (e) { + fill_common(e, EVENT_DEVICE); + e->device.major = ctx->major; + e->device.minor = ctx->minor; + e->device.access_type = ctx->access_type; + bpf_ringbuf_submit(e, 0); + } + + return 0; /* deny -> -EPERM */ +} + +/* ===== 3) Sysctl: block reading specified sysctl ===== + * ctx: struct bpf_sysctl + * Use bpf_sysctl_get_name() to get name + * + * Return semantics: + * - return 0: reject + * - return 1: proceed + * If return 0, userspace read/write returns -1 with errno=EPERM + */ +SEC("cgroup/sysctl") +int cg_sysctl(struct bpf_sysctl *ctx) +{ + char name[SYSCTL_NAME_LEN]; + int ret = bpf_sysctl_get_name(ctx, name, sizeof(name), 0); + if (ret < 0) + return 1; + + if (denied_sysctl_name[0] == '\0') + return 1; + + /* Only deny reads, allow writes (safer for testing) */ + if (ctx->write) + return 1; + + if (!str_eq(name, denied_sysctl_name, SYSCTL_NAME_LEN)) + return 1; + + struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0); + if (e) { + fill_common(e, EVENT_SYSCTL); + e->sysctl.write = ctx->write; +#pragma unroll + for (int i = 0; i < SYSCTL_NAME_LEN; i++) { + e->sysctl.name[i] = name[i]; + if (name[i] == '\0') + break; + } + bpf_ringbuf_submit(e, 0); + } + + return 0; /* deny -> -EPERM */ +} +``` + +#### 理解 BPF 代码 + +这个程序的整体逻辑很清晰:三个 cgroup 钩子分别处理网络连接、设备访问和 
sysctl 读写。每个钩子的工作流程都是一样的,检查当前操作是否匹配配置的阻断规则,如果匹配就通过 ringbuf 上报事件并返回 0(拒绝),否则返回 1(放行)。 + +`cg_connect4` 函数使用 `SEC("cgroup/connect4")` 挂载,在进程发起 IPv4 连接时触发。这里有一个重要的细节:`ctx->user_port` 是网络字节序(大端),而我们配置的端口号是主机字节序,所以必须用 `bpf_ntohs()` 转换后再比较。如果目标端口匹配我们配置的 `blocked_tcp_dport`,程序返回 0,用户态的 `connect()` 调用就会失败并返回 `EPERM`。 + +`cg_dev` 函数处理设备访问。它的上下文 `struct bpf_cgroup_dev_ctx` 包含三个关键字段:`major` 和 `minor` 标识设备(比如 `/dev/null` 是 1:3),`access_type` 表示访问类型(读/写/mknod)。我们只需要比较 major:minor 是否匹配配置值就行了。 + +`cg_sysctl` 函数拦截 `/proc/sys/` 下的 sysctl 读写。这里用 `bpf_sysctl_get_name()` 获取 sysctl 名称,格式是 `kernel/hostname` 这样的路径形式(用斜杠分隔,不是点)。我们只阻断读操作,写操作放行,这样测试更安全,不会意外改变系统配置。 + +程序顶部的配置项使用 `const volatile` 声明。这是 CO-RE(Compile Once, Run Everywhere)的标准模式:BPF 程序编译时这些值是默认值(0 或空字符串),用户态在 `load()` 之前通过 `skel->rodata->` 设置实际值。这样一份编译好的 BPF 程序可以用不同的配置运行。 + +### 用户态加载器:cgroup_guard.c + +```c +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* cgroup_guard.c - Userspace loader for cgroup eBPF policy guard */ +#include <arpa/inet.h> +#include <errno.h> +#include <fcntl.h> +#include <getopt.h> +#include <signal.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <bpf/libbpf.h> + +#include "cgroup_guard.skel.h" +#include "cgroup_guard.h" + +static volatile sig_atomic_t exiting = 0; + +static void sig_handler(int sig) +{ + (void)sig; + exiting = 1; +} + +static int libbpf_print_fn(enum libbpf_print_level level, + const char *format, va_list args) +{ + if (level == LIBBPF_DEBUG) + return 0; + return vfprintf(stderr, format, args); +} + +static void usage(const char *prog) +{ + fprintf(stderr, + "Usage: %s [OPTIONS]\n" + "\n" + "Options:\n" + " -c, --cgroup PATH cgroup v2 path (default: /sys/fs/cgroup/ebpf_demo)\n" + " -p, --block-port PORT block TCP connect() to this dst port (IPv4)\n" + " -d, --deny-device MAJ:MIN deny device access for (major:minor)\n" + " -s, --deny-sysctl NAME deny sysctl READ of this name\n" + " -h, --help show this help\n", + prog); +} + +static int handle_event(void *ctx, void *data, size_t data_sz) +{ + (void)ctx; + (void)data_sz; + + 
const struct event *e = (const struct event *)data; + + if (e->type == EVENT_CONNECT4) { + char ip[INET_ADDRSTRLEN] = {0}; + struct in_addr addr = { .s_addr = e->connect4.daddr }; + inet_ntop(AF_INET, &addr, ip, sizeof(ip)); + + printf("[DENY connect4] pid=%u comm=%s daddr=%s dport=%u proto=%u\n", + e->pid, e->comm, ip, e->connect4.dport, e->connect4.proto); + } else if (e->type == EVENT_DEVICE) { + printf("[DENY device] pid=%u comm=%s major=%u minor=%u access_type=0x%x\n", + e->pid, e->comm, e->device.major, e->device.minor, e->device.access_type); + } else if (e->type == EVENT_SYSCTL) { + printf("[DENY sysctl] pid=%u comm=%s write=%u name=%s\n", + e->pid, e->comm, e->sysctl.write, e->sysctl.name); + } + + fflush(stdout); + return 0; +} + +int main(int argc, char **argv) +{ + const char *cgroup_path = "/sys/fs/cgroup/ebpf_demo"; + int block_port = 0; + int dev_major = 0, dev_minor = 0; + const char *deny_sysctl = NULL; + + /* Parse command line arguments */ + static const struct option long_opts[] = { + { "cgroup", required_argument, NULL, 'c' }, + { "block-port", required_argument, NULL, 'p' }, + { "deny-device", required_argument, NULL, 'd' }, + { "deny-sysctl", required_argument, NULL, 's' }, + { "help", no_argument, NULL, 'h' }, + {} + }; + + int opt; + while ((opt = getopt_long(argc, argv, "c:p:d:s:h", long_opts, NULL)) != -1) { + switch (opt) { + case 'c': cgroup_path = optarg; break; + case 'p': block_port = atoi(optarg); break; + case 'd': /* parse major:minor */ break; + case 's': deny_sysctl = optarg; break; + default: usage(argv[0]); return 1; + } + } + + libbpf_set_print(libbpf_print_fn); + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + /* Create cgroup directory if needed */ + mkdir(cgroup_path, 0755); + + int cg_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY); + if (cg_fd < 0) { + fprintf(stderr, "open(%s) failed: %s\n", cgroup_path, strerror(errno)); + return 1; + } + + /* Open and configure BPF skeleton */ + struct cgroup_guard_bpf 
*skel = cgroup_guard_bpf__open(); + if (!skel) { + fprintf(stderr, "cgroup_guard_bpf__open() failed\n"); + close(cg_fd); + return 1; + } + + /* Write .rodata configuration (must be before load) */ + if (block_port > 0 && block_port <= 65535) + skel->rodata->blocked_tcp_dport = (__u16)block_port; + if (dev_major > 0 || dev_minor > 0) { + skel->rodata->blocked_dev_major = (__u32)dev_major; + skel->rodata->blocked_dev_minor = (__u32)dev_minor; + } + if (deny_sysctl) { + snprintf((char *)skel->rodata->denied_sysctl_name, + SYSCTL_NAME_LEN, "%s", deny_sysctl); + } + + /* Load BPF programs into kernel */ + int err = cgroup_guard_bpf__load(skel); + if (err) { + fprintf(stderr, "cgroup_guard_bpf__load() failed: %d\n", err); + goto cleanup; + } + + /* Attach programs to cgroup */ + struct bpf_link *link_connect = bpf_program__attach_cgroup(skel->progs.cg_connect4, cg_fd); + struct bpf_link *link_dev = bpf_program__attach_cgroup(skel->progs.cg_dev, cg_fd); + struct bpf_link *link_sysctl = bpf_program__attach_cgroup(skel->progs.cg_sysctl, cg_fd); + + /* Setup ring buffer for events */ + struct ring_buffer *rb = ring_buffer__new(bpf_map__fd(skel->maps.events), + handle_event, NULL, NULL); + + printf("Attached to cgroup: %s\n", cgroup_path); + printf("Config: block_port=%d, deny_device=%d:%d, deny_sysctl_read=%s\n", + block_port, dev_major, dev_minor, deny_sysctl ? deny_sysctl : "(none)"); + + /* Main event loop */ + while (!exiting) { + err = ring_buffer__poll(rb, 200 /* ms */); + if (err == -EINTR) + break; + } + + ring_buffer__free(rb); + +cleanup: + bpf_link__destroy(link_sysctl); + bpf_link__destroy(link_dev); + bpf_link__destroy(link_connect); + cgroup_guard_bpf__destroy(skel); + close(cg_fd); + return err ? 
1 : 0; +} +``` + +#### 理解用户态代码 + +用户态加载器的核心工作是把 BPF 程序挂载到指定的 cgroup 上,然后不断轮询 ringbuf 打印被拒绝的事件。 + +程序首先用 `getopt_long` 解析命令行参数,获取 cgroup 路径和三个策略配置。然后用 `open()` 以 `O_RDONLY | O_DIRECTORY` 打开 cgroup 目录,拿到一个文件描述符。这个 fd 就是后面 attach 的目标,cgroup eBPF 程序是挂到 cgroup 目录上的。 + +接下来是 skeleton 的标准流程:`open()` 打开 BPF 对象,设置 `.rodata` 配置项,然后 `load()` 加载到内核。注意配置必须在 load 之前设置,load 之后 `.rodata` 就是只读的了。 + +Attach 用的是 `bpf_program__attach_cgroup(prog, cg_fd)`,把每个 BPF 程序挂载到 cgroup。这里我们挂了三个程序:connect4、dev、sysctl。挂载成功后,这个 cgroup 里的所有进程的相关操作都会经过这些 BPF 程序。 + +最后是事件循环。`ring_buffer__poll()` 轮询 ringbuf,每当有事件到来就调用 `handle_event` 回调打印出来。这样你就能实时看到哪些操作被拒绝了。 + +## 编译 + +```bash +cd src/cgroup +make +``` + +## 运行 + +### 终端 A:启动加载器 + +```bash +# 阻断:TCP 端口 9090、/dev/null (1:3)、读取 kernel/hostname +sudo ./cgroup_guard \ + --cgroup /sys/fs/cgroup/ebpf_demo \ + --block-port 9090 \ + --deny-device 1:3 \ + --deny-sysctl kernel/hostname +``` + +你应该看到: +``` +Attached to cgroup: /sys/fs/cgroup/ebpf_demo +Config: block_port=9090, deny_device=1:3, deny_sysctl_read=kernel/hostname +Press Ctrl-C to stop. 
+``` + +### 终端 B:启动测试服务器(在 cgroup 外) + +```bash +# 启动两个 HTTP 服务器 +python3 -m http.server 8080 --bind 127.0.0.1 & +python3 -m http.server 9090 --bind 127.0.0.1 & +``` + +### 终端 C:在 cgroup 内测试 + +```bash +sudo bash -c ' +echo $$ > /sys/fs/cgroup/ebpf_demo/cgroup.procs + +echo "== TCP 测试 ==" +curl -s http://127.0.0.1:8080 >/dev/null && echo "8080 OK" +curl -s http://127.0.0.1:9090 >/dev/null && echo "9090 OK (意外)" || echo "9090 被阻断 (预期)" + +echo +echo "== 设备测试 ==" +cat /dev/null && echo "/dev/null OK (意外)" || echo "/dev/null 被阻断 (预期)" + +echo +echo "== Sysctl 测试 ==" +cat /proc/sys/kernel/hostname && echo "sysctl 读取 OK (意外)" || echo "sysctl 读取被阻断 (预期)" +' +``` + +预期输出: +- `8080 OK` - 端口 8080 允许访问 +- `9090 被阻断 (预期)` - 端口 9090 被阻断 +- `/dev/null 被阻断 (预期)` - 设备 1:3 被阻断 +- `sysctl 读取被阻断 (预期)` - 读取 kernel/hostname 被阻断 + +### 终端 A 输出(事件) + +``` +[DENY connect4] pid=12345 comm=curl daddr=127.0.0.1 dport=9090 proto=6 +[DENY device] pid=12346 comm=cat major=1 minor=3 access_type=0x... +[DENY sysctl] pid=12347 comm=cat write=0 name=kernel/hostname +``` + +## 一键测试 + +我们提供了一个测试脚本,可以自动完成编译、启动服务器、运行测试和清理: + +```bash +sudo ./test.sh +``` + + +## 使用 bpftool 验证 + +```bash +sudo bpftool cgroup tree /sys/fs/cgroup/ebpf_demo +``` + +## 何时使用 cgroup eBPF + +选择合适的技术取决于你的控制粒度需求。 + +cgroup eBPF 的控制粒度是**进程组**,你把进程放进 cgroup,挂上 BPF 程序,策略就对这组进程生效。这非常适合容器场景:每个容器就是一个 cgroup,你可以给不同容器设置不同的网络策略、设备权限、sysctl 访问规则。进程离开 cgroup,策略自动失效,不需要手动清理。 + +XDP 和 tc 的控制粒度是**网卡**。它们处理经过某个网卡的所有流量,不区分来自哪个进程。如果你需要做高性能包处理、DDoS 防护、负载均衡,XDP/tc 是更好的选择。但如果你想"只允许容器 A 访问端口 80,容器 B 可以访问任意端口",XDP/tc 就不太方便了。 + +seccomp-BPF 的控制粒度是**单个进程**。它过滤系统调用,比如禁止进程调用 `fork`、`exec`、`socket`。seccomp 更底层,适合做进程沙箱。但它不能控制网络目的地址、设备 major:minor 这些高层语义。 + +传统的 iptables/nftables 是**全局**生效的。你配置的规则对整个系统的所有进程都有效,无法区分"这条规则只对容器 A 生效"。 + +总结一下:如果你需要按容器/进程组做策略,同时控制网络、设备、sysctl,并且希望策略随进程生命周期自动管理,cgroup eBPF 就是正确的选择。 + +## 总结 + +cgroup eBPF 通过将策略与进程组绑定,解决了传统全局策略无法精细控制的问题。本教程演示了三种常用的 cgroup 钩子: + +- **`cgroup/connect4`**:在 TCP 连接时过滤目标端口,阻断不允许的出站连接 +- 
**`cgroup/dev`**:在设备访问时检查 major:minor,限制对特定设备的读写 +- **`cgroup/sysctl`**:在 sysctl 读写时检查名称,防止敏感配置泄露或篡改 + +这种"策略守卫"模式可以扩展到生产用例:容器网络策略(类似 Kubernetes NetworkPolicy)、设备隔离(GPU/TPU 独占)、安全沙箱(限制系统信息访问)。通过 ringbuf 事件上报,你还可以实现策略审计和告警。 + +> 如果你想深入了解 eBPF,请查看我们的教程仓库 https://github.com/eunomia-bpf/bpf-developer-tutorial 或访问我们的网站 https://eunomia.dev/tutorials/ 。 + +## 参考资料 + +- **内核文档:** [libbpf 程序类型](https://docs.kernel.org/bpf/libbpf/program_types.html) - 所有 cgroup 相关 section 名称 +- **eBPF 文档:** [CGROUP_SOCK_ADDR](https://docs.ebpf.io/linux/program-type/BPF_PROG_TYPE_CGROUP_SOCK_ADDR/) - socket 地址钩子详解 +- **eBPF 文档:** [CGROUP_DEVICE](https://docs.ebpf.io/linux/program-type/BPF_PROG_TYPE_CGROUP_DEVICE/) - 设备访问控制详解 +- **eBPF 文档:** [CGROUP_SYSCTL](https://docs.ebpf.io/linux/program-type/BPF_PROG_TYPE_CGROUP_SYSCTL/) - sysctl 访问控制详解 +- **教程仓库:** https://github.com/eunomia-bpf/bpf-developer-tutorial/tree/main/src/cgroup + +完整源代码可在教程仓库中获得。需要 Linux 内核 5.8+(cgroup v2、BPF ringbuf)和 libbpf。 diff --git a/src/cgroup/cgroup_guard.bpf.c b/src/cgroup/cgroup_guard.bpf.c new file mode 100644 index 0000000..826550b --- /dev/null +++ b/src/cgroup/cgroup_guard.bpf.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* cgroup_guard.bpf.c - cgroup eBPF policy guard + * + * This program demonstrates three types of cgroup eBPF hooks: + * 1. cgroup/connect4 - TCP connection filtering + * 2. cgroup/dev - Device access control + * 3. 
cgroup/sysctl - Sysctl read/write control + */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +#include "cgroup_guard.h" + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; + +/* ===== Configurable options: set by userspace before load ===== */ +#define IPPROTO_TCP 6 + +const volatile __u16 blocked_tcp_dport = 0; /* host order */ +const volatile __u32 blocked_dev_major = 0; +const volatile __u32 blocked_dev_minor = 0; +const volatile char denied_sysctl_name[SYSCTL_NAME_LEN] = {}; /* NUL-terminated */ + +/* ===== ringbuf: send denied events to userspace ===== */ +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 1 << 24); /* 16MB */ +} events SEC(".maps"); + +static __always_inline void fill_common(struct event *e, __u32 type) +{ + e->ts_ns = bpf_ktime_get_ns(); + e->type = type; + e->pid = (__u32)(bpf_get_current_pid_tgid() >> 32); + bpf_get_current_comm(&e->comm, sizeof(e->comm)); +} + +/* Compare two strings, return 1 if equal, 0 if not + * Note: b is volatile to handle const volatile rodata arrays correctly */ +static __always_inline int str_eq(const char *a, const volatile char *b, int max_len) +{ +#pragma unroll + for (int i = 0; i < SYSCTL_NAME_LEN; i++) { + char ca = a[i]; + char cb = b[i]; + if (ca != cb) + return 0; + if (ca == '\0') + return 1; + } + return 1; +} + +/* ===== 1) Network: block TCP connect4 to specified port ===== + * ctx: struct bpf_sock_addr + * user_ip4/user_port: network byte order (need conversion) + * + * Return semantics: + * - return 1: allow + * - return 0: deny (userspace gets EPERM) + */ +SEC("cgroup/connect4") +int cg_connect4(struct bpf_sock_addr *ctx) +{ + if (blocked_tcp_dport == 0) + return 1; + + if (ctx->protocol != IPPROTO_TCP) + return 1; + + __u16 dport = bpf_ntohs((__u16)ctx->user_port); + if (dport != blocked_tcp_dport) + return 1; + + struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0); + if (e) { + fill_common(e, EVENT_CONNECT4); + e->connect4.daddr = ctx->user_ip4; /* network order */ + 
e->connect4.dport = dport; /* host order */ + e->connect4.proto = ctx->protocol; + bpf_ringbuf_submit(e, 0); + } + + return 0; /* deny -> userspace gets EPERM on connect */ +} + +/* ===== 2) Device: block access to specified major:minor ===== + * ctx: struct bpf_cgroup_dev_ctx { access_type, major, minor } + * + * Return semantics: + * - return 0: deny (userspace gets EPERM) + * - return non-zero: allow + */ +SEC("cgroup/dev") +int cg_dev(struct bpf_cgroup_dev_ctx *ctx) +{ + if (blocked_dev_major == 0 && blocked_dev_minor == 0) + return 1; + + if (ctx->major != blocked_dev_major || ctx->minor != blocked_dev_minor) + return 1; + + struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0); + if (e) { + fill_common(e, EVENT_DEVICE); + e->device.major = ctx->major; + e->device.minor = ctx->minor; + e->device.access_type = ctx->access_type; + bpf_ringbuf_submit(e, 0); + } + + return 0; /* deny -> -EPERM */ +} + +/* ===== 3) Sysctl: block reading specified sysctl ===== + * ctx: struct bpf_sysctl + * Use bpf_sysctl_get_name() to get name + * + * Return semantics: + * - return 0: reject + * - return 1: proceed + * If return 0, userspace read/write returns -1 with errno=EPERM + */ +SEC("cgroup/sysctl") +int cg_sysctl(struct bpf_sysctl *ctx) +{ + char name[SYSCTL_NAME_LEN]; + int ret = bpf_sysctl_get_name(ctx, name, sizeof(name), 0); + if (ret < 0) + return 1; + + if (denied_sysctl_name[0] == '\0') + return 1; + + /* Only deny reads, allow writes (safer for testing) */ + if (ctx->write) + return 1; + + if (!str_eq(name, denied_sysctl_name, SYSCTL_NAME_LEN)) + return 1; + + struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0); + if (e) { + fill_common(e, EVENT_SYSCTL); + e->sysctl.write = ctx->write; +#pragma unroll + for (int i = 0; i < SYSCTL_NAME_LEN; i++) { + e->sysctl.name[i] = name[i]; + if (name[i] == '\0') + break; + } + bpf_ringbuf_submit(e, 0); + } + + return 0; /* deny -> -EPERM */ +} diff --git a/src/cgroup/cgroup_guard.c b/src/cgroup/cgroup_guard.c 
new file mode 100644
index 0000000..35d3c57
--- /dev/null
+++ b/src/cgroup/cgroup_guard.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/* cgroup_guard.c - Userspace loader for cgroup eBPF policy guard
+ *
+ * This loader attaches three eBPF programs to a cgroup:
+ *   1. cgroup/connect4 - TCP connection filtering
+ *   2. cgroup/dev      - Device access control
+ *   3. cgroup/sysctl   - Sysctl read/write control
+ */
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <bpf/libbpf.h>
+
+#include "cgroup_guard.skel.h"
+#include "cgroup_guard.h"
+
+/* Set by the signal handler; polled by the main loop to stop cleanly. */
+static volatile sig_atomic_t exiting = 0;
+
+static void sig_handler(int sig)
+{
+	(void)sig;
+	exiting = 1;
+}
+
+/* libbpf log callback: drop debug-level messages, print the rest to stderr. */
+static int libbpf_print_fn(enum libbpf_print_level level,
+			   const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void usage(const char *prog)
+{
+	fprintf(stderr,
+		"Usage: %s [OPTIONS]\n"
+		"\n"
+		"cgroup eBPF policy guard - demonstrates cgroup-based access control\n"
+		"\n"
+		"Options:\n"
+		"  -c, --cgroup PATH         cgroup v2 path (default: /sys/fs/cgroup/ebpf_demo)\n"
+		"  -p, --block-port PORT     block TCP connect() to this dst port (IPv4)\n"
+		"  -d, --deny-device MAJ:MIN deny device access for (major:minor), e.g. 1:3 (/dev/null)\n"
+		"  -s, --deny-sysctl NAME    deny sysctl READ of this name, e.g. kernel/hostname\n"
+		"  -h, --help                show this help\n"
+		"\n"
+		"Examples:\n"
+		"  # Block TCP port 9090, /dev/null (1:3), and reading kernel/hostname\n"
+		"  sudo ./cgroup_guard -p 9090 -d 1:3 -s kernel/hostname\n"
+		"\n"
+		"  # Test from within the cgroup:\n"
+		"  sudo bash -c 'echo $$ > /sys/fs/cgroup/ebpf_demo/cgroup.procs && curl http://127.0.0.1:9090'\n",
+		prog);
+}
+
+/* Create a single directory level; an already-existing path is not an error.
+ * Returns 0 on success or a negative errno value. */
+static int mkdir_p_onelevel(const char *path)
+{
+	if (mkdir(path, 0755) == 0)
+		return 0;
+	if (errno == EEXIST)
+		return 0;
+	return -errno;
+}
+
+/* Parse a plain decimal number in [0, max] that spans the whole string.
+ * Unlike bare strtol()/atoi(), this rejects empty strings, signs, leading
+ * whitespace, trailing garbage and out-of-range values.
+ * Returns 0 and stores the value in *out, or -EINVAL (leaving *out untouched). */
+static int parse_uint(const char *s, long max, int *out)
+{
+	char *end = NULL;
+	long v;
+
+	if (*s < '0' || *s > '9')
+		return -EINVAL;
+
+	errno = 0;
+	v = strtol(s, &end, 10);
+	if (errno || *end != '\0' || v > max)
+		return -EINVAL;
+
+	*out = (int)v;
+	return 0;
+}
+
+/* Parse "MAJ:MIN" into two non-negative ints.
+ * Returns 0 on success, -EINVAL on malformed input (outputs untouched). */
+static int parse_maj_min(const char *s, int *maj, int *min)
+{
+	const char *colon = strchr(s, ':');
+	char major_str[32] = {0};
+	size_t major_len;
+	int major, minor;
+
+	if (!colon)
+		return -EINVAL;
+
+	major_len = (size_t)(colon - s);
+	if (major_len == 0 || major_len >= sizeof(major_str))
+		return -EINVAL;
+	memcpy(major_str, s, major_len);
+
+	/* The minor part is parsed in place: an empty minor ("1:") is rejected
+	 * instead of being read as 0, and nothing is silently truncated. */
+	if (parse_uint(major_str, INT_MAX, &major) ||
+	    parse_uint(colon + 1, INT_MAX, &minor))
+		return -EINVAL;
+
+	*maj = major;
+	*min = minor;
+	return 0;
+}
+
+/* Ring buffer callback: print one denied-access event. */
+static int handle_event(void *ctx, void *data, size_t data_sz)
+{
+	const struct event *e = data;
+
+	(void)ctx;
+
+	/* Never read past a record that is shorter than the expected layout. */
+	if (data_sz < sizeof(*e)) {
+		fprintf(stderr, "short event: %zu bytes, expected %zu\n",
+			data_sz, sizeof(*e));
+		return 0;
+	}
+
+	if (e->type == EVENT_CONNECT4) {
+		char ip[INET_ADDRSTRLEN] = {0};
+		struct in_addr addr = { .s_addr = e->connect4.daddr };
+
+		inet_ntop(AF_INET, &addr, ip, sizeof(ip));
+		printf("[DENY connect4] pid=%u comm=%s daddr=%s dport=%u proto=%u\n",
+		       e->pid, e->comm, ip, e->connect4.dport, e->connect4.proto);
+	} else if (e->type == EVENT_DEVICE) {
+		printf("[DENY device] pid=%u comm=%s major=%u minor=%u access_type=0x%x\n",
+		       e->pid, e->comm, e->device.major, e->device.minor, e->device.access_type);
+	} else if (e->type == EVENT_SYSCTL) {
+		printf("[DENY sysctl] pid=%u comm=%s write=%u name=%s\n",
+		       e->pid, e->comm, e->sysctl.write, e->sysctl.name);
+	} else {
+		printf("[UNKNOWN] type=%u pid=%u comm=%s\n", e->type, e->pid, e->comm);
+	}
+
+	fflush(stdout);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	const char *cgroup_path = "/sys/fs/cgroup/ebpf_demo";
+	int block_port = 0;
+	int dev_major = 0, dev_minor = 0;
+	const char *deny_sysctl = NULL;
+
+	/* Everything released at 'cleanup' is declared and initialised up
+	 * front, so no 'goto cleanup' can skip an initialiser and leave the
+	 * cleanup code looking at an indeterminate pointer. */
+	struct cgroup_guard_bpf *skel = NULL;
+	struct bpf_link *link_connect = NULL;
+	struct bpf_link *link_dev = NULL;
+	struct bpf_link *link_sysctl = NULL;
+	struct ring_buffer *rb = NULL;
+	int cg_fd = -1;
+	int err = 0;
+
+	static const struct option long_opts[] = {
+		{ "cgroup",      required_argument, NULL, 'c' },
+		{ "block-port",  required_argument, NULL, 'p' },
+		{ "deny-device", required_argument, NULL, 'd' },
+		{ "deny-sysctl", required_argument, NULL, 's' },
+		{ "help",        no_argument,       NULL, 'h' },
+		{}
+	};
+
+	int opt;
+	while ((opt = getopt_long(argc, argv, "c:p:d:s:h", long_opts, NULL)) != -1) {
+		switch (opt) {
+		case 'c':
+			cgroup_path = optarg;
+			break;
+		case 'p':
+			/* 0 keeps port blocking disabled, as before. */
+			if (parse_uint(optarg, 65535, &block_port)) {
+				fprintf(stderr, "Invalid --block-port %s, expect 0-65535\n", optarg);
+				return 1;
+			}
+			break;
+		case 'd':
+			if (parse_maj_min(optarg, &dev_major, &dev_minor)) {
+				fprintf(stderr, "Invalid --deny-device %s, expect MAJ:MIN\n", optarg);
+				return 1;
+			}
+			break;
+		case 's':
+			/* A truncated name would silently never match. */
+			if (strlen(optarg) >= SYSCTL_NAME_LEN) {
+				fprintf(stderr, "--deny-sysctl name too long (max %d chars)\n",
+					SYSCTL_NAME_LEN - 1);
+				return 1;
+			}
+			deny_sysctl = optarg;
+			break;
+		case 'h':
+		default:
+			usage(argv[0]);
+			return opt == 'h' ? 0 : 1;
+		}
+	}
+
+	libbpf_set_print(libbpf_print_fn);
+
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		fprintf(stderr, "Warning: setrlimit(RLIMIT_MEMLOCK) failed: %s\n", strerror(errno));
+	}
+
+	signal(SIGINT, sig_handler);
+	signal(SIGTERM, sig_handler);
+
+	err = mkdir_p_onelevel(cgroup_path);
+	if (err) {
+		fprintf(stderr, "mkdir(%s) failed: %s\n", cgroup_path, strerror(-err));
+		return 1;
+	}
+
+	cg_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);
+	if (cg_fd < 0) {
+		fprintf(stderr, "open(%s) failed: %s\n", cgroup_path, strerror(errno));
+		return 1;
+	}
+
+	skel = cgroup_guard_bpf__open();
+	if (!skel) {
+		fprintf(stderr, "cgroup_guard_bpf__open() failed\n");
+		close(cg_fd);
+		return 1;
+	}
+
+	/* Write .rodata configuration (must be before load) */
+	if (block_port > 0)
+		skel->rodata->blocked_tcp_dport = (__u16)block_port;
+
+	if (dev_major > 0 || dev_minor > 0) {
+		skel->rodata->blocked_dev_major = (__u32)dev_major;
+		skel->rodata->blocked_dev_minor = (__u32)dev_minor;
+	}
+
+	if (deny_sysctl) {
+		snprintf((char *)skel->rodata->denied_sysctl_name,
+			 SYSCTL_NAME_LEN, "%s", deny_sysctl);
+	}
+
+	err = cgroup_guard_bpf__load(skel);
+	if (err) {
+		fprintf(stderr, "cgroup_guard_bpf__load() failed: %d\n", err);
+		goto cleanup;
+	}
+
+	link_connect = bpf_program__attach_cgroup(skel->progs.cg_connect4, cg_fd);
+	err = libbpf_get_error(link_connect);
+	if (err) {
+		link_connect = NULL;
+		fprintf(stderr, "attach cgroup/connect4 failed: %d\n", err);
+		goto cleanup;
+	}
+
+	link_dev = bpf_program__attach_cgroup(skel->progs.cg_dev, cg_fd);
+	err = libbpf_get_error(link_dev);
+	if (err) {
+		link_dev = NULL;
+		fprintf(stderr, "attach cgroup/dev failed: %d\n", err);
+		goto cleanup;
+	}
+
+	link_sysctl = bpf_program__attach_cgroup(skel->progs.cg_sysctl, cg_fd);
+	err = libbpf_get_error(link_sysctl);
+	if (err) {
+		link_sysctl = NULL;
+		fprintf(stderr, "attach cgroup/sysctl failed: %d\n", err);
+		goto cleanup;
+	}
+
+	rb = ring_buffer__new(bpf_map__fd(skel->maps.events),
+			      handle_event, NULL, NULL);
+	if (!rb) {
+		/* 'err' is still 0 from the last successful attach; make sure
+		 * this failure is reflected in the exit status. */
+		err = errno ? -errno : -1;
+		fprintf(stderr, "ring_buffer__new() failed\n");
+		goto cleanup;
+	}
+
+	printf("Attached to cgroup: %s\n", cgroup_path);
+	printf("Config: block_port=%d, deny_device=%d:%d, deny_sysctl_read=%s\n",
+	       block_port, dev_major, dev_minor, deny_sysctl ? deny_sysctl : "(none)");
+	printf("Press Ctrl-C to stop.\n\n");
+
+	while (!exiting) {
+		err = ring_buffer__poll(rb, 200 /* ms */);
+		if (err == -EINTR) {
+			/* Interrupted by a signal: normal shutdown. */
+			err = 0;
+			break;
+		}
+		if (err < 0) {
+			fprintf(stderr, "ring_buffer__poll() error: %d\n", err);
+			break;
+		}
+		/* A non-negative result is not an error; reset before the next round. */
+		err = 0;
+	}
+
+cleanup:
+	if (rb)
+		ring_buffer__free(rb);
+	if (link_sysctl)
+		bpf_link__destroy(link_sysctl);
+	if (link_dev)
+		bpf_link__destroy(link_dev);
+	if (link_connect)
+		bpf_link__destroy(link_connect);
+
+	cgroup_guard_bpf__destroy(skel);
+	close(cg_fd);
+	return err ? 1 : 0;
+}
diff --git a/src/cgroup/cgroup_guard.h b/src/cgroup/cgroup_guard.h
new file mode 100644
index 0000000..3ba880e
--- /dev/null
+++ b/src/cgroup/cgroup_guard.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#ifndef __CGROUP_GUARD_H
+#define __CGROUP_GUARD_H
+
+#ifndef TASK_COMM_LEN
+#define TASK_COMM_LEN 16
+#endif
+
+#define SYSCTL_NAME_LEN 64
+
+enum event_type {
+	EVENT_CONNECT4 = 1,
+	EVENT_DEVICE = 2,
+	EVENT_SYSCTL = 3,
+};
+
+struct event {
+	__u64 ts_ns;
+	__u32 pid;
+	__u32 type;
+	char comm[TASK_COMM_LEN];
+
+	union {
+		struct {
+			__u32 daddr; /* IPv4, network order */
+			__u16 dport; /* host order */
+			__u16 proto; /* e.g. 6 for TCP */
+		} connect4;
+
+		struct {
+			__u32 major;
+			__u32 minor;
+			__u32 access_type;
+		} device;
+
+		struct {
+			__u32 write;
+			char name[SYSCTL_NAME_LEN];
+		} sysctl;
+	};
+};
+
+#endif /* __CGROUP_GUARD_H */
diff --git a/src/cgroup/test.sh b/src/cgroup/test.sh
new file mode 100755
index 0000000..93222c7
--- /dev/null
+++ b/src/cgroup/test.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+# test.sh - One-click test script for cgroup eBPF tutorial
+#
+# This script:
+#   1. Builds the program if needed
+#   2. Starts test HTTP servers
+#   3. Runs the cgroup_guard loader
+#   4. Executes tests from within the cgroup
+#   5. Cleans up everything
+#
+# Exit status: 0 if every check passed, non-zero otherwise.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+cd "$SCRIPT_DIR"
+
+CGROUP_PATH="/sys/fs/cgroup/ebpf_demo"
+BLOCK_PORT=9090
+DENY_DEVICE="1:3"
+DENY_SYSCTL="kernel/hostname"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Set as resources are created, so cleanup only touches what exists.
+LOADER_PID=""
+SERVER_8080_PID=""
+SERVER_9090_PID=""
+TMPOUT=""
+
+# Single EXIT handler. bash keeps only one trap per signal, so temp-file
+# removal lives here rather than in a second 'trap ... EXIT', which would
+# replace this function and leave the loader and HTTP servers running.
+cleanup() {
+    echo -e "\n${YELLOW}=== Cleaning up ===${NC}"
+
+    # Kill our processes
+    if [ -n "$LOADER_PID" ] && kill -0 "$LOADER_PID" 2>/dev/null; then
+        kill "$LOADER_PID" 2>/dev/null || true
+        wait "$LOADER_PID" 2>/dev/null || true
+    fi
+
+    if [ -n "$SERVER_8080_PID" ] && kill -0 "$SERVER_8080_PID" 2>/dev/null; then
+        kill "$SERVER_8080_PID" 2>/dev/null || true
+    fi
+
+    if [ -n "$SERVER_9090_PID" ] && kill -0 "$SERVER_9090_PID" 2>/dev/null; then
+        kill "$SERVER_9090_PID" 2>/dev/null || true
+    fi
+
+    # Remove test cgroup (will fail if processes still in it, which is fine)
+    if [ -d "$CGROUP_PATH" ]; then
+        rmdir "$CGROUP_PATH" 2>/dev/null || true
+    fi
+
+    # Remove temp files
+    if [ -n "$TMPOUT" ]; then
+        rm -f "$TMPOUT" "$TMPOUT.err"
+    fi
+
+    echo -e "${GREEN}Cleanup complete${NC}"
+}
+
+trap cleanup EXIT
+
+# Check if running as root
+if [ "$EUID" -ne 0 ]; then
+    echo -e "${RED}Error: This script must be run as root${NC}"
+    echo "Usage: sudo $0"
+    exit 1
+fi
+
+# Build if needed
+if [ ! -f "./cgroup_guard" ]; then
+    echo -e "${YELLOW}=== Building cgroup_guard ===${NC}"
+    make
+fi
+
+echo -e "${YELLOW}=== Starting test HTTP servers ===${NC}"
+python3 -m http.server 8080 --bind 127.0.0.1 >/dev/null 2>&1 &
+SERVER_8080_PID=$!
+python3 -m http.server 9090 --bind 127.0.0.1 >/dev/null 2>&1 &
+SERVER_9090_PID=$!
+sleep 1
+echo "HTTP server on port 8080 (PID: $SERVER_8080_PID)"
+echo "HTTP server on port 9090 (PID: $SERVER_9090_PID)"
+
+echo -e "\n${YELLOW}=== Starting cgroup_guard ===${NC}"
+./cgroup_guard \
+    --cgroup "$CGROUP_PATH" \
+    --block-port "$BLOCK_PORT" \
+    --deny-device "$DENY_DEVICE" \
+    --deny-sysctl "$DENY_SYSCTL" &
+LOADER_PID=$!
+sleep 2
+
+# Without a running loader nothing is enforced and every result is meaningless.
+if ! kill -0 "$LOADER_PID" 2>/dev/null; then
+    echo -e "${RED}Error: cgroup_guard failed to start${NC}"
+    exit 1
+fi
+
+echo -e "\n${YELLOW}=== Running tests from within cgroup ===${NC}"
+echo "Testing from cgroup: $CGROUP_PATH"
+echo ""
+
+# Create a temp file outside the test for output (since /dev/null is blocked in cgroup)
+TMPOUT=$(mktemp)
+
+# Run tests in a subshell that joins the cgroup
+# Note: We can't use /dev/null redirects inside the cgroup since it's blocked
+# The subshell exits with the number of failed checks.
+FAILED=0
+bash -c "
+echo \$\$ > $CGROUP_PATH/cgroup.procs || exit 100
+fails=0
+
+echo '--- TCP Connection Test ---'
+# Test port 8080 - should work (write to temp file to avoid /dev/null)
+if curl -s --connect-timeout 2 -o $TMPOUT http://127.0.0.1:8080 2>$TMPOUT.err; then
+    echo -e '${GREEN}[PASS]${NC} Port 8080: Connection allowed'
+else
+    echo -e '${RED}[FAIL]${NC} Port 8080: Connection failed (should be allowed)'
+    fails=\$((fails + 1))
+fi
+
+# Test port 9090 - should be blocked
+if curl -s --connect-timeout 2 -o $TMPOUT http://127.0.0.1:9090 2>$TMPOUT.err; then
+    echo -e '${RED}[FAIL]${NC} Port 9090: Connection allowed (should be blocked!)'
+    fails=\$((fails + 1))
+else
+    echo -e '${GREEN}[PASS]${NC} Port 9090: Connection blocked'
+fi
+
+echo ''
+echo '--- Device Access Test ---'
+# Test /dev/null (1:3) - should be blocked
+if cat /dev/null >$TMPOUT 2>$TMPOUT.err; then
+    echo -e '${RED}[FAIL]${NC} /dev/null (1:3): Access allowed (should be blocked!)'
+    fails=\$((fails + 1))
+else
+    echo -e '${GREEN}[PASS]${NC} /dev/null (1:3): Access blocked'
+fi
+
+echo ''
+echo '--- Sysctl Read Test ---'
+# Test kernel/hostname - should be blocked
+if cat /proc/sys/kernel/hostname >$TMPOUT 2>$TMPOUT.err; then
+    echo -e '${RED}[FAIL]${NC} kernel/hostname: Read allowed (should be blocked!)'
+    fails=\$((fails + 1))
+else
+    echo -e '${GREEN}[PASS]${NC} kernel/hostname: Read blocked'
+fi
+
+exit \$fails
+" || FAILED=$?
+
+echo ""
+if [ "$FAILED" -ne 0 ]; then
+    echo -e "${RED}=== Tests FAILED (status $FAILED) ===${NC}"
+    exit 1
+fi
+echo -e "${GREEN}=== All tests completed ===${NC}"